# Big Data Management Project 2:
## DEBS GRAND CHALLENGE 2015

In [1]:
!pip install shapely

Collecting shapely
  Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Installing collected packages: shapely
Successfully installed shapely-2.0.7


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, unix_timestamp, regexp_extract, col, lag, avg, lead, count, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType
from pyspark.sql.window import Window

import json
import time

In [3]:
pip install --upgrade pyspark

Collecting pyspark
  Using cached pyspark-3.5.5-py2.py3-none-any.whl
Collecting py4j==0.10.9.7 (from pyspark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.3
    Can't uninstall 'pyspark'. No files were found to uninstall.
Successfully installed py4j-0.10.9.7 pyspark-3.5.5
Note: you may need to restart the kernel to use updated packages.


### Query 0
Data Cleansing and Setup

In [13]:
# Query 0: stream the raw taxi CSV files, drop malformed/implausible trips,
# derive Unix-timestamp columns and persist the cleaned data as parquet.

# Wall-clock start, used for the elapsed-time report at the end of the cell.
start_time = time.time()

spark = (
    SparkSession.builder
    .appName('BDM_Project2')
    .getOrCreate()
)

# Explicit schema applied to the incoming taxi trip CSV files.
taxi_schema = """
    medallion STRING,
    hack_license STRING,
    pickup_datetime TIMESTAMP,
    dropoff_datetime TIMESTAMP,
    trip_time_in_secs INT,
    trip_distance DOUBLE,
    pickup_longitude DOUBLE,
    pickup_latitude DOUBLE,
    dropoff_longitude DOUBLE,
    dropoff_latitude DOUBLE,
    payment_type STRING,
    fare_amount DOUBLE,
    surcharge DOUBLE,
    mta_tax DOUBLE,
    tip_amount DOUBLE,
    tolls_amount DOUBLE
"""

taxi_df_og = (
    spark.readStream
    .schema(taxi_schema)
    .option("maxFilesPerTrigger", 1)  # At most one new file per micro-batch
    # The first line of every file is treated as a header and skipped.
    # NOTE(review): confirm the input files really carry a header row,
    # otherwise one data row per file is silently dropped.
    .option("header", True)
    .csv("input/")  # Directory holding the input CSV files
)

# A valid medallion / hack_license is exactly 32 hexadecimal characters.
HEX_ID_PATTERN = r"^[a-fA-F0-9]{32}$"

# Keep only trips with well-formed identifiers, both timestamps present and
# strictly positive distance, fare and tip.
# NOTE(review): `tip_amount > 0` also discards trips whose tip is exactly 0;
# confirm this is intended (use `>= 0` to keep them).
taxi_df = taxi_df_og.filter(
    col("medallion").rlike(HEX_ID_PATTERN)
    & col("hack_license").rlike(HEX_ID_PATTERN)
    & col("pickup_datetime").isNotNull()
    & col("dropoff_datetime").isNotNull()
    & (col("trip_distance") > 0)
    & (col("fare_amount") > 0)
    & (col("tip_amount") > 0)
)

# Add pickup/dropoff times as Unix seconds plus the trip duration derived
# from them, then drop every row that still contains a null in any column.
taxi_df = (
    taxi_df
    .withColumn("pickup_ts", unix_timestamp("pickup_datetime"))
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime"))
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts"))
    .dropna()
)

# Process everything currently available in the input directory, then stop.
# `availableNow=True` replaces the deprecated `once=True`: it works through
# the backlog in several micro-batches, so `maxFilesPerTrigger` above is
# honoured (a one-time trigger ignores that limit and reads all files at once).
query = (
    taxi_df.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "output/preprocessed_data")
    .option("checkpointLocation", "output/checkpoint")
    .trigger(availableNow=True)
    .start()
)

# Block until the backlog has been fully written.
query.awaitTermination()

print("Execution time", time.time() - start_time)

Execution time 1344.0212268829346


In [14]:
# Load the parquet dataset written by the preprocessing query and preview
# its first five rows as a sanity check.
output_df = spark.read.format("parquet").load("output/preprocessed_data")
output_df.show(n=5)

+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount| pickup_ts|dropoff_ts|duration|
+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+
|64187531006B3D8B3...|871C25EC0B8BF4BC8...|2013-08-01 17:47:36|2013-08-01 19:05:40|             4683|         18.0|      -74.005295|      40.750935|      

### Query 1
Frequent Routes

### Query 2
Profitable Areas