# Big Data Management Project 1:
## Analyzing New York City Taxi Data

In [1]:
!pip install shapely

Collecting shapely
  Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Installing collected packages: shapely
Successfully installed shapely-2.0.7


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, unix_timestamp, col, lag, avg, lead, count, sum as spark_sum, floor
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType
from pyspark.sql.window import Window

import json
import time

from shapely.geometry import shape, Point

In [3]:
spark = (SparkSession.builder
                    .appName('BDM_Project1')
                    .enableHiveSupport()
                    .getOrCreate()
        )

### NYC Borough Data

In [4]:
with open('input/nyc-boroughs.geojson') as f:
    geo_data = json.load(f)

# Broadcasting data to workers
broadcast_geo_data = spark.sparkContext.broadcast(geo_data)

# Creating a dictionary of borough codes and polygons within the borough
polygons = {}
b_names = {} # borough names by code

for feature in broadcast_geo_data.value['features']:

    code = feature['properties']['boroughCode']
    name = feature['properties']['borough']

    if code not in polygons:
        polygons[code] = []
        b_names[code] = name

    polygons[code].append(shape(feature['geometry']))

# Sorting borough polygons by area
for code in polygons:
    polygons[code] = sorted(
        polygons[code], key=lambda x: x.area, reverse=True
    )

In [5]:
# UDF: longitude, latitude -> borough
def get_borough(long, lat):
    point = Point(long, lat)

    for code, pols in polygons.items():
        for polygon in pols:
            if polygon.contains(point):
                return code

    return None

get_borough_udf = udf(get_borough, IntegerType())

### NYC Taxi Data

In [6]:
start_time = time.time() # To see the time it takes to execute data transformations

# Defining the schema for faster reading of data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("vendor_id", StringType(), True),
    StructField("rate_code", StringType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", StringType(), True),
    StructField("trip_time_in_secs", StringType(), True),
    StructField("trip_distance", StringType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])

# Creating a single dataframe of all the trip_data files
taxi_df_og = (spark.read
             .option("sep", ",")
             .option("header", True)
             .schema(schema)
             .csv("input/trip_data/trip_data_*.csv")
             #.csv(["input/trip_data/trip_data_1.csv"])
          )

# Removing the trips with 0 passengers
# Transforming the data (further explained in the project report)
taxi_df = taxi_df_og.filter(
    (taxi_df_og.passenger_count != "0") &
    ~(
        (taxi_df_og.trip_distance == 0) &
        (taxi_df_og.pickup_longitude == taxi_df_og.dropoff_longitude) &
        (taxi_df_og.pickup_latitude == taxi_df_og.dropoff_latitude)
    )
).withColumns({
    "pickup_ts": unix_timestamp("pickup_datetime", "dd-MM-yy HH:mm"),
    "dropoff_ts": unix_timestamp("dropoff_datetime", "dd-MM-yy HH:mm")
}).withColumn(
    "duration", col("dropoff_ts") - col("pickup_ts")
).filter(
    (col("duration") > 0) & (col("duration") <= 4 * 60 * 60)
).select(
    "hack_license",
    "pickup_latitude",
    "pickup_longitude",
    "pickup_ts",
    "dropoff_latitude",
    "dropoff_longitude",
    "dropoff_ts",
    "duration"
).dropna()

# Adding pick up and drop off boroughs to taxi data
taxi_df = taxi_df.withColumn(
    "pickup_borough", get_borough_udf("pickup_longitude", "pickup_latitude")
).withColumn(
    "dropoff_borough", get_borough_udf("dropoff_longitude", "dropoff_latitude")
)
taxi_df.show(5, truncate=False)

print("Execution time", time.time() - start_time)

+--------------------------------+---------------+----------------+----------+----------------+-----------------+----------+--------+--------------+---------------+
|hack_license                    |pickup_latitude|pickup_longitude|pickup_ts |dropoff_latitude|dropoff_longitude|dropoff_ts|duration|pickup_borough|dropoff_borough|
+--------------------------------+---------------+----------------+----------+----------------+-----------------+----------+--------+--------------+---------------+
|BA96DE419E711691B9445D6A6307C170|40.757977      |-73.978165      |1357053108|40.751171       |-73.989838       |1357053490|382     |1             |1              |
|9FD8F69F0804BDB5549F40E9DA1BE472|40.731781      |-74.006683      |1357431515|40.75066        |-73.994499       |1357431774|259     |1             |1              |
|9FD8F69F0804BDB5549F40E9DA1BE472|40.73777       |-74.004707      |1357411781|40.726002       |-74.009834       |1357412063|282     |1             |1              |
|51EE87E32

### Query 1
Utilization: idle time per taxi

In [7]:
# Copy taxi_df dataframe
taxi_copy = (
    taxi_df
    .select("hack_license", "pickup_ts", "dropoff_ts", "dropoff_borough")
    .repartition("hack_license")
)

# Partition by driver and then order by pickup time with Window
window_sp = Window.partitionBy("hack_license").orderBy("pickup_ts")

# Compute previous dropoff time and idle time
taxi_copy = (
    taxi_copy.withColumn("prev_dropoff_ts", lag("dropoff_ts").over(window_sp))
             .withColumn("idle_time", col("pickup_ts") - col("prev_dropoff_ts"))
)

# Control that idle time is not over 4h and filter out rows where pickup_ts < prev_dropoff_ts (overlapping trips)
taxi_copy = taxi_copy.filter(
    (col("idle_time").isNotNull()) &
    (col("idle_time") > 0) &
    (col("idle_time") <= 4 * 60 * 60)
)

# Group by driver and calculate total idle time and total ride duration
taxi_stats = (
    taxi_copy.groupBy("hack_license")
    .agg(
        spark_sum("idle_time").alias("total_idle_time"),
        spark_sum(col("dropoff_ts") - col("pickup_ts")).alias("total_ride_duration")
    )
)

# Calculate utilization rate
utilization_df = (
    taxi_stats.withColumn("total_time", col("total_idle_time") + col("total_ride_duration"))
              .withColumn("utilization_rate", col("total_ride_duration") / col("total_time"))
)

utilization_df.show(10)

+--------------------+---------------+-------------------+----------+-------------------+
|        hack_license|total_idle_time|total_ride_duration|total_time|   utilization_rate|
+--------------------+---------------+-------------------+----------+-------------------+
|001EEDEA00E57988E...|        2824815|            3997082|   6821897| 0.5859194297421964|
|00C2BC1F860FC66BA...|        3476121|            4458427|   7934548| 0.5619005644681966|
|013DB7F394A06CD24...|        3156241|            3770415|   6926656| 0.5443340913710742|
|01594037EE38FE0D9...|          96540|             114360|    210900| 0.5422475106685632|
|015D33FBAB8A7C5CE...|        4319100|            3831240|   8150340| 0.4700711872142757|
|01606C9E10D8D0B19...|        3479773|            3549859|   7029632| 0.5049850404686903|
|024B2794FF91BF97F...|        2489916|            3175059|   5664975| 0.5604718467424834|
|02548BECEDACA82F0...|        4156986|            2832100|   6989086|0.40521750626619846|
|02856AFC2

In [11]:
avg_utilization_rate = utilization_df.agg(avg("utilization_rate").alias("avg_utilization_rate"))

# Show the result
avg_utilization_rate.show()

+--------------------+
|avg_utilization_rate|
+--------------------+
|    0.50287482780684|
+--------------------+

