In [1]:
from pyspark.sql import SparkSession



In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('test')
    # Standalone cluster master to connect to.
    .master('spark://spark-master:7077')
    # Serve this application's web UI on port 4051.
    .config('spark.ui.port', '4051')
    .config('spark.dynamicAllocation.enabled', 'true')
    # Write event logs to /home/spark-events.
    .config('spark.eventLog.enabled', 'true')
    .config('spark.eventLog.dir', '/home/spark-events')
    # One core per executor.
    .config('spark.executor.cores', 1)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 14:58:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Shell step: make sure the target directory exists, remove any files already
# in it, then download green_tripdata_2023-01.parquet into that directory.
# The steps are chained with `&&`, so a failure stops the remaining steps.
!mkdir -p /home/data/nyc_taxi_data/green_tripdata && \
    rm -f /home/data/nyc_taxi_data/green_tripdata/* && \
    wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet -P /home/data/nyc_taxi_data/green_tripdata/

--2023-10-26 14:58:38--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.225.242.58, 13.225.242.37, 13.225.242.202, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.225.242.58|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427002 (1.4M) [binary/octet-stream]
Saving to: '/home/data/nyc_taxi_data/green_tripdata/green_tripdata_2023-01.parquet'


2023-10-26 14:58:50 (166 KB/s) - '/home/data/nyc_taxi_data/green_tripdata/green_tripdata_2023-01.parquet' saved [1427002/1427002]



In [4]:
# Load every parquet file in the download directory into a Spark DataFrame.
df = spark.read.parquet('/home/data/nyc_taxi_data/green_tripdata/')

                                                                                

In [5]:
# Number of rows in the loaded trip data.
df.count()

                                                                                

68211

In [6]:
# Print column names, types and nullability as read from the parquet file.
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [7]:
from pyspark.sql.functions import date_format

# Workaround for pandas 2.1.1: converting the result to pandas fails with
#   "Casting to unit-less dtype 'datetime64' is not supported.
#    Pass e.g. 'datetime64[ns]' instead."
# Render both timestamp columns as formatted strings so no datetime cast is
# attempted. Note that the columns are strings from this point on.
df = df.withColumns({
    name: date_format(name, "yyyy-MM-dd HH:mm:ss")
    for name in ("lpep_pickup_datetime", "lpep_dropoff_datetime")
})

In [8]:
# Preview: take at most 10 rows and convert them to a pandas DataFrame for
# display as the cell output.
df.limit(10).toPandas()

                                                                                

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.9,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.7,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.0
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.0,7.2,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.3,6.5,0.5,1.5,1.7,0.0,,1.0,10.2,1.0,1.0,0.0
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.1,6.0,0.5,1.5,0.0,0.0,,1.0,8.0,1.0,1.0,0.0
5,2,2023-01-01 00:53:31,2023-01-01 01:11:04,N,1.0,41,262,1.0,2.78,17.7,1.0,0.5,0.0,0.0,,1.0,22.95,2.0,1.0,2.75
6,1,2023-01-01 00:09:14,2023-01-01 00:26:39,N,1.0,181,45,2.0,3.8,19.1,3.75,1.5,4.85,0.0,,1.0,29.2,1.0,1.0,2.75
7,2,2023-01-01 00:11:58,2023-01-01 00:24:55,N,1.0,24,75,1.0,1.88,14.2,1.0,0.5,0.0,0.0,,1.0,16.7,2.0,1.0,0.0
8,2,2023-01-01 00:41:29,2023-01-01 00:46:26,N,1.0,41,166,2.0,1.11,7.2,1.0,0.5,1.0,0.0,,1.0,10.7,1.0,1.0,0.0
9,2,2023-01-01 00:50:32,2023-01-01 01:13:42,N,1.0,24,140,1.0,4.22,24.7,1.0,0.5,3.0,0.0,,1.0,32.95,1.0,1.0,2.75


In [9]:
# Stop the Spark session created at the top of the notebook.
spark.stop()