# Big Data Management Project 1:
## Analyzing New York City Taxi Data

In [1]:
!pip install shapely



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, unix_timestamp, col, lag, avg, lead, sum, count, broadcast, lead, col, avg as spark_sum
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

import json

from shapely.geometry import shape, Point

In [3]:
spark = (SparkSession.builder
                    .appName('BDM_Project1')
                    .enableHiveSupport()  # Enables Hive support, persistent Hive metastore
                    .getOrCreate()
        )

### NYC Borough Data

In [4]:
with open('input/nyc-boroughs.geojson') as f:
    geo_data = json.load(f)

# Broadcasting data to workers
broadcast_geo_data = spark.sparkContext.broadcast(geo_data)

# TODO: is using a dictionary fine? "dataframe can be created out of it"
# Creating a dictionary of borough codes and polygons within the borough
polygons = {}
b_names = {} # borough names by code

for feature in broadcast_geo_data.value['features']:

    code = feature['properties']['boroughCode']
    name = feature['properties']['borough']

    if code not in polygons:
        polygons[code] = []
        b_names[code] = name

    polygons[code].append(shape(feature['geometry']))

# Sorting borough polygons by area
for code in polygons:
    polygons[code] = sorted(
        polygons[code], key=lambda x: x.area, reverse=True
    )

# also sort boroughs by total area?
#borough_total_areas = {code: sum(poly.area for poly in polys) for code, polys in polygons.items()}
#sorted_boroughs = sorted(borough_total_areas.keys(), key=lambda x: borough_total_areas[x], reverse=True)
#polygons = {code: polygons[code] for code in sorted_boroughs}


In [5]:
# UDF: longitude, latitude -> borough
def get_borough(long, lat):
    point = Point(long, lat)

    for code, pols in polygons.items():
        for polygon in pols:
            if polygon.contains(point):
                return code

    return None

get_borough_udf = udf(get_borough, IntegerType())

### NYC Taxi Data

In [6]:
taxi_df = (spark.read
             .option("sep", ",")
             .option("header", True)
             .option("inferSchema", True)
             .csv("input/Sample NYC Data.csv")
            )

# Selecting only necessary columns
taxi_df = taxi_df.select(
    "hack_license",
    "pickup_latitude",
    "pickup_longitude",
    "pickup_datetime",
    "dropoff_latitude",
    "dropoff_longitude",
    "dropoff_datetime"
)

# Converting datetime to unix timestamp (seconds)
taxi_df = taxi_df.withColumn(
    "pickup_ts", unix_timestamp("pickup_datetime", "dd-MM-yy HH:mm")
).withColumn(
    "dropoff_ts", unix_timestamp("dropoff_datetime", "dd-MM-yy HH:mm")
)

# Calculating ride duration (seconds)
taxi_df = taxi_df.withColumn(
    "duration", (taxi_df["dropoff_ts"] - taxi_df["pickup_ts"])
)

# Filtering out rides longer than 4h or with negative duration
taxi_df = taxi_df.filter((taxi_df["duration"] > 0) & (taxi_df["duration"] <= 4 * 60 * 60))

# Add pick up and drop off boroughs to taxi data
taxi_df = taxi_df.withColumn(
    "pickup_borough", get_borough_udf("pickup_longitude", "pickup_latitude")
).withColumn(
    "dropoff_borough", get_borough_udf("dropoff_longitude", "dropoff_latitude")
)
taxi_df.show(3, truncate=False)

+--------------------------------+---------------+----------------+---------------+----------------+-----------------+----------------+----------+----------+--------+--------------+---------------+
|hack_license                    |pickup_latitude|pickup_longitude|pickup_datetime|dropoff_latitude|dropoff_longitude|dropoff_datetime|pickup_ts |dropoff_ts|duration|pickup_borough|dropoff_borough|
+--------------------------------+---------------+----------------+---------------+----------------+-----------------+----------------+----------+----------+--------+--------------+---------------+
|BA96DE419E711691B9445D6A6307C170|40.757977      |-73.978165      |01-01-13 15:11 |40.751171       |-73.989838       |01-01-13 15:18  |1357053060|1357053480|420     |1             |1              |
|9FD8F69F0804BDB5549F40E9DA1BE472|40.731781      |-74.006683      |06-01-13 00:18 |40.75066        |-73.994499       |06-01-13 00:22  |1357431480|1357431720|240     |1             |1              |
|9FD8F69F0

### Query 1
Utilization: idle time per taxi

In [7]:
# Copy taxi_df dataframe
taxi_copy = taxi_df.select("*")

# Ensure all trips of the same driver are together
taxi_copy = taxi_copy.repartition("hack_license")

# Partition by driver and then order by pickup time with Window
window_sp = Window.partitionBy("hack_license").orderBy("pickup_ts")

# Compute previous dropoff time
taxi_copy = taxi_copy.withColumn("prev_dropoff_ts", lag("dropoff_ts").over(window_sp))

# Compute idle time
taxi_copy = taxi_copy.withColumn("idle_time", col("pickup_ts") - col("prev_dropoff_ts"))

# Control that the idle time is not over 4h
taxi_copy = taxi_copy.filter((col("idle_time").isNotNull()) & (col("idle_time") <= 14400))

# Group by driver and calculate total idle time and total ride duration
taxi_stats = taxi_copy.groupBy("hack_license").agg(
    spark_sum("idle_time").alias("total_idle_time"),
    spark_sum("duration").alias("total_ride_duration")
)

# Calculate total time (idle + ride duration)
taxi_stats = taxi_stats.withColumn("total_time",
                                   col("total_idle_time") + col("total_ride_duration"))

# Calculate utilization rate
utilization_df = taxi_stats.withColumn(
    "utilization_rate",
    col("total_ride_duration") / col("total_time")
)

utilization_df.show()

+--------------------+------------------+-------------------+------------------+--------------------+
|        hack_license|   total_idle_time|total_ride_duration|        total_time|    utilization_rate|
+--------------------+------------------+-------------------+------------------+--------------------+
|001C8AAB90AEE49F3...|            4320.0|              720.0|            5040.0| 0.14285714285714285|
|0025133AD810DBE80...|            2400.0|              900.0|            3300.0|  0.2727272727272727|
|00447A6197DBB329F...|            3360.0|             1380.0|            4740.0|  0.2911392405063291|
|006313464EC98A24B...|            1575.0|              528.0|            2103.0| 0.25106990014265335|
|007439EEDB510EF82...|            3240.0|              180.0|            3420.0| 0.05263157894736842|
|00927C48BA4C1B2B1...|            903.75|              690.0|           1593.75|  0.4329411764705882|
|00AE05F56D451E89E...|            1850.0|              620.0|            2470.0| 0

### Query 2
The average time it takes for a taxi to find its next fare(trip) per destination borough

In [8]:
window_spec = Window.partitionBy("hack_license").orderBy("pickup_ts")

# Get the next pickup time
next_pick_up = taxi_copy.withColumn("next_pickup_ts", lead("pickup_ts").over(window_spec))

# Calculate waiting time
wait_time = next_pick_up.withColumn("waiting_time", (next_pick_up["next_pickup_ts"] - next_pick_up["dropoff_ts"]))

# Filter out null values, null boroughs, and waiting times > 4 hours
wait_time = wait_time.na.drop(subset=["next_pickup_ts"])
wait_time = wait_time.filter(wait_time["dropoff_borough"].isNotNull())
wait_time = wait_time.filter(col("waiting_time") <= 14400)

# Calculate average waiting time by dropoff borough
result = wait_time.groupBy("dropoff_borough").agg(avg("waiting_time").alias("avg_waiting_time"))

# Create df from borough names
borough_names_df = spark.createDataFrame([(code, name) for code, name in b_names.items()],
                                         ["dropoff_borough", "borough_name"])
query2_result = result.join(borough_names_df, on="dropoff_borough", how="left")
query2_result = query2_result.select("borough_name", "avg_waiting_time").orderBy("avg_waiting_time")

query2_result.show()

+-------------+------------------+
| borough_name|  avg_waiting_time|
+-------------+------------------+
|    Manhattan|   824.00276950772|
|     Brooklyn|1843.5581518852894|
|        Bronx|1966.6666666666667|
|       Queens|2521.4008489993935|
|Staten Island|            4125.0|
+-------------+------------------+



### Query 3
The number of trips that started and ended within the same borough

In [9]:
taxi_df.show(3)

+--------------------+---------------+----------------+---------------+----------------+-----------------+----------------+----------+----------+--------+--------------+---------------+
|        hack_license|pickup_latitude|pickup_longitude|pickup_datetime|dropoff_latitude|dropoff_longitude|dropoff_datetime| pickup_ts|dropoff_ts|duration|pickup_borough|dropoff_borough|
+--------------------+---------------+----------------+---------------+----------------+-----------------+----------------+----------+----------+--------+--------------+---------------+
|BA96DE419E711691B...|      40.757977|      -73.978165| 01-01-13 15:11|       40.751171|       -73.989838|  01-01-13 15:18|1357053060|1357053480|     420|             1|              1|
|9FD8F69F0804BDB55...|      40.731781|      -74.006683| 06-01-13 00:18|        40.75066|       -73.994499|  06-01-13 00:22|1357431480|1357431720|     240|             1|              1|
|9FD8F69F0804BDB55...|       40.73777|      -74.004707| 05-01-13 18:49

In [None]:
sumOfTripsBoroughSame = taxi_df.filter(col("pickup_borough") == col("dropoff_borough")).count()

print("Total trips that start and end in same borough:", sumOfTripsBoroughSame)

same_borough_trips_df = taxi_df.filter(col("pickup_borough") == col("dropoff_borough")).groupBy("pickup_borough").agg(count("hack_license").alias("same_borough_trip_count"))

same_borough_trips_named = (same_borough_trips_df.join(borough_names_df, on="pickup_borough", how="left")
                            .select("borough_name", "same_borough_trip_count")).orderBy(col("same_borough_trip_count").desc())

same_borough_trips_named.show()

### Query 4
The number of trips that started in one borough and ended in another one

In [None]:
sumOfTripsBoroughDifferent = taxi_df.filter(col("pickup_borough") != col("dropoff_borough")).count()
print("Total trips that start and end in different boroughs: " + sumOfTripsBoroughDifferent)

different_borough_trips_df = (taxi_df.filter(col("pickup_borough") != col("dropoff_borough"))
                                      .groupBy("pickup_borough", "dropoff_borough")
                                      .agg(count("*").alias("trip_count")))

pickup_borough_names_df = spark.createDataFrame([(code, name) for code, name in b_names.items()],
                                                ["pickup_borough", "pickup_borough_name"])
different_borough_trips_named = different_borough_trips_df.join(pickup_borough_names_df, on="pickup_borough", how="left")

dropoff_borough_names_df = spark.createDataFrame([(code, name) for code, name in b_names.items()],
                                                 ["dropoff_borough", "dropoff_borough_name"])
different_borough_trips_named = different_borough_trips_named.join(dropoff_borough_names_df, on="dropoff_borough", how="left")

different_borough_trips_named = different_borough_trips_named.select("pickup_borough_name", "dropoff_borough_name", "trip_count")
different_borough_trips_named.orderBy(col("trip_count")).show()