In [11]:
import pyspark
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook, running on the
# "local[*]" master under the application name "NYTaxi".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NYTaxi")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/25 21:11:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


We want to have a single data set containing both green and yellow taxi data.

In [3]:
# Load every green-taxi parquet file found two directory levels below
# data/pq/green/ (presumably year/month folders — TODO confirm) and print the
# resulting schema.
df_green = spark.read.parquet("data/pq/green/*/*")
df_green.printSchema()

                                                                                

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [4]:
# Load every yellow-taxi parquet file found two directory levels below
# data/pq/yellow/ (presumably year/month folders — TODO confirm) and print the
# resulting schema.
df_yellow = spark.read.parquet("data/pq/yellow/*/*")
df_yellow.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [6]:
# Strip the green-specific "lpep_" prefix so the pickup/dropoff timestamp
# columns carry service-neutral names.
df_green = (
    df_green
    .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime")
)

In [5]:
# Strip the yellow-specific "tpep_" prefix so the pickup/dropoff timestamp
# columns carry service-neutral names.
df_yellow = (
    df_yellow
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
)

In [7]:
# Column names present in both data sets (a set, so the order is arbitrary).
set(df_green.columns).intersection(df_yellow.columns)

{'DOLocationID',
 'PULocationID',
 'RatecodeID',
 'VendorID',
 'congestion_surcharge',
 'dropoff_datetime',
 'extra',
 'fare_amount',
 'improvement_surcharge',
 'mta_tax',
 'passenger_count',
 'payment_type',
 'pickup_datetime',
 'store_and_fwd_flag',
 'tip_amount',
 'tolls_amount',
 'total_amount',
 'trip_distance'}

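But a set is unordered, so the intersection loses the columns' original ordering. Let's build the list differently, keeping the order in which the columns appear in the green data set: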

In [9]:
# Walk the green data set's columns in their original order and keep only the
# names that the yellow data set also has; the set gives fast membership tests.
yellow_columns = set(df_yellow.columns)
common_columns = [name for name in df_green.columns if name in yellow_columns]

common_columns

['VendorID',
 'pickup_datetime',
 'dropoff_datetime',
 'store_and_fwd_flag',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'passenger_count',
 'trip_distance',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'payment_type',
 'congestion_surcharge']

Nice. Now let's make sure we can trace each record back to where it came from.

In [13]:
# Keep only the shared columns and tag every row with its source service.
df_green = (
    df_green
    .select(common_columns)
    .withColumn("service_type", F.lit("green"))
)

In [14]:
# Keep only the shared columns and tag every row with its source service.
df_yellow = (
    df_yellow
    .select(common_columns)
    .withColumn("service_type", F.lit("yellow"))
)

In [15]:
# Stack the green and yellow rows into one DataFrame (duplicates are kept).
# unionByName() matches columns by name, so the result stays correct even if
# the two frames' column order ever diverges; union()/unionAll() match columns
# purely by position and would silently misalign them.
df_trips_data = df_green.unionByName(df_yellow)

In [16]:
# Row count per service type, as a sanity check on the combined data.
(
    df_trips_data
    .groupBy("service_type")
    .count()
    .show()
)

                                                                                

+------------+--------+
|service_type|   count|
+------------+--------+
|       green| 2304517|
|      yellow|39649199|
+------------+--------+



Now we can use SQL to query our data. But first we need to register the DataFrame as a temporary view, which gives it a table name we can refer to in our SQL queries.

In [18]:
# Register the combined DataFrame as a temporary view named "trips_data" so
# that spark.sql() queries can refer to it by that name.
# registerTempTable() is deprecated in favour of createOrReplaceTempView()
df_trips_data.createOrReplaceTempView("trips_data")

In [22]:
# Per-service row count again, this time written in SQL against the
# "trips_data" view.
spark.sql("""
SELECT
    service_type,
    COUNT(1) AS count
FROM
    trips_data
GROUP BY
    service_type
""").show()



+------------+--------+
|service_type|   count|
+------------+--------+
|       green| 2304517|
|      yellow|39649199|
+------------+--------+



                                                                                

In [27]:
# Monthly revenue report: one row per pickup zone, calendar month (pickup time
# truncated to the month) and service type, with summed monetary columns plus
# the average passenger count and trip distance.
# GROUP BY 1, 2, 3 refers to the first three SELECT expressions by position.
# NOTE(review): the "avg_montly_*" aliases are misspelled ("monthly"); they
# become column names of df_result, so they are left unchanged here.
df_result = spark.sql("""
SELECT 
    -- Reveneue grouping 
    PULocationID AS revenue_zone,
    date_trunc('month', pickup_datetime) AS revenue_month,
    service_type, 

    -- Revenue calculation 
    SUM(fare_amount) AS revenue_monthly_fare,
    SUM(extra) AS revenue_monthly_extra,
    SUM(mta_tax) AS revenue_monthly_mta_tax,
    SUM(tip_amount) AS revenue_monthly_tip_amount,
    SUM(tolls_amount) AS revenue_monthly_tolls_amount,
    SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge,
    SUM(total_amount) AS revenue_monthly_total_amount,
    SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge,

    -- Additional calculations
    AVG(passenger_count) AS avg_montly_passenger_count,
    AVG(trip_distance) AS avg_montly_trip_distance

FROM
    trips_data
GROUP BY
    1, 2, 3
""")

In [28]:
# Preview the aggregated revenue report.
df_result.show()



+------------+-------------------+------------+--------------------+---------------------+-----------------------+--------------------------+----------------------------+-------------------------------------+----------------------------+------------------------------------+--------------------------+------------------------+
|revenue_zone|      revenue_month|service_type|revenue_monthly_fare|revenue_monthly_extra|revenue_monthly_mta_tax|revenue_monthly_tip_amount|revenue_monthly_tolls_amount|revenue_monthly_improvement_surcharge|revenue_monthly_total_amount|revenue_monthly_congestion_surcharge|avg_montly_passenger_count|avg_montly_trip_distance|
+------------+-------------------+------------+--------------------+---------------------+-----------------------+--------------------------+----------------------------+-------------------------------------+----------------------------+------------------------------------+--------------------------+------------------------+
|         250|2020-

                                                                                

In [29]:
# Persist the report as parquet. mode="overwrite" keeps this cell re-runnable:
# with the default save mode the write raises an error as soon as
# data/report/revenue/ already exists (e.g. on a second "Run All").
df_result.write.parquet("data/report/revenue/", mode="overwrite")

                                                                                

In [30]:
# List the files Spark wrote into the report directory (long format,
# human-readable sizes, hidden files included).
!ls -FGhla data/report/revenue/

total 532K
drwxr-xr-x 2 freddie 4.0K Oct 25 21:43 ./
drwxr-xr-x 3 freddie 4.0K Oct 25 21:43 ../
-rw-r--r-- 1 freddie    8 Oct 25 21:43 ._SUCCESS.crc
-rw-r--r-- 1 freddie 4.1K Oct 25 21:43 .part-00000-21946b23-eb8c-479f-a973-03865a0ac8af-c000.snappy.parquet.crc
-rw-r--r-- 1 freddie    0 Oct 25 21:43 _SUCCESS
-rw-r--r-- 1 freddie 512K Oct 25 21:43 part-00000-21946b23-eb8c-479f-a973-03865a0ac8af-c000.snappy.parquet


In [31]:
# Rewrite the report from a single partition, replacing the existing output.
(
    df_result
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("data/report/revenue/")
)

                                                                                

In [32]:
# List the report directory again to inspect the files left by the
# coalesce(1) overwrite.
!ls -FGhla data/report/revenue/

total 532K
drwxr-xr-x 2 freddie 4.0K Oct 25 21:46 ./
drwxr-xr-x 3 freddie 4.0K Oct 25 21:46 ../
-rw-r--r-- 1 freddie    8 Oct 25 21:46 ._SUCCESS.crc
-rw-r--r-- 1 freddie 4.1K Oct 25 21:46 .part-00000-bc1a9f90-dcd6-447c-bbea-cc636d8d938f-c000.snappy.parquet.crc
-rw-r--r-- 1 freddie    0 Oct 25 21:46 _SUCCESS
-rw-r--r-- 1 freddie 512K Oct 25 21:46 part-00000-bc1a9f90-dcd6-447c-bbea-cc636d8d938f-c000.snappy.parquet


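>Note: `coalesce()` is the counterpart of `repartition()`: it only ever reduces the number of partitions and avoids a full shuffle, whereas `repartition()` can increase or decrease the number of partitions but always shuffles the data.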

Spark is good for transformations within the data lake, as we have done here (if we can pretend for a moment that our local hard drive is a data lake!). We didn't have to go to our data warehouse. Admittedly, this example is too simple and contrived, because we could easily have done everything here in SQL. But if we have a Spark cluster, we might as well use it: as you can see, it was fairly easy to use, flexible and performant.