# Big Data Management Project 2:
## DESB GRAND CHALLENGE 2015

In [1]:
!pip install shapely

Collecting shapely
  Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Installing collected packages: shapely
Successfully installed shapely-2.0.7


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, unix_timestamp, regexp_extract, col, lag, avg, lead, count, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType
from pyspark.sql.window import Window

import json
import time

In [3]:
pip install --upgrade pyspark

Collecting pyspark
  Using cached pyspark-3.5.5-py2.py3-none-any.whl
Collecting py4j==0.10.9.7 (from pyspark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.3
    Can't uninstall 'pyspark'. No files were found to uninstall.
Successfully installed py4j-0.10.9.7 pyspark-3.5.5
Note: you may need to restart the kernel to use updated packages.


### Query 0
Data Cleansing and Setup

In [None]:
start_time = time.time() # To see the time it takes to execute data transformations

spark = SparkSession.builder \
                    .appName('BDM_Project2') \
                    .master("local[*]") \
                    .config("spark.executor.memory", "2g") \
                    .config("spark.driver.memory", "2g") \
                    .getOrCreate()

# Defining the schema for faster reading of data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True)
])

# Read the streaming data
taxi_df_og = (
    spark.readStream
    .option("maxFilesPerTrigger", 1)  # Read one file at a time to avoid overwhelming the system
    .option("header", True)
    .schema(schema)
    .csv("input/")  # Streaming from directory
)

# Removing the trips with 0 passengers
# Transforming the data 
taxi_df = taxi_df_og.filter(
    (regexp_extract(col("medallion"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (regexp_extract(col("hack_license"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (col("pickup_datetime").isNotNull()) &
    (col("dropoff_datetime").isNotNull()) &
    (col("passenger_count") > 0) &                 
    (col("trip_distance") > 0) &                    
    (col("fare_amount") > 0) &
    (col("tip_amount") > 0)
)

# Convert timestamps to Unix format 
taxi_df = taxi_df.withColumn("pickup_ts", unix_timestamp("pickup_datetime")) \
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime")) \
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts")) \
    .filter(
        (col("duration") > 0) & (col("duration") <= 4 * 60 * 60)) \
    .select("*") \
    .dropna()  # Drop remaining null values

# Write the output to Parquet files in a directory
query = taxi_df.writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "output/") \
    .option("checkpointLocation", "checkpoint/") \
    .trigger(processingTime="1000 seconds") \
    .start()

# Await termination to keep the stream running
query.awaitTermination(60000)  # Wait for 60 seconds or until stopped


#optional
#original_count = taxi_df_og.count()
#filtered_count = taxi_df.count()
#filtered_out_count = original_count - filtered_count

#print(f"Original count: {original_count}") 
#print(f"Filtered count: {filtered_count}")
#print(f"Rows filtered out: {filtered_out_count}")
print("Execution time", time.time() - start_time)

#optional
#original_count = taxi_df_og.count()
#filtered_count = taxi_df.count()
#filtered_out_count = original_count - filtered_count

#print(f"Original count: {original_count}") 
#print(f"Filtered count: {filtered_count}")
#print(f"Rows filtered out: {filtered_out_count}")

In [5]:
query = taxi_df.limit(5) \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .trigger(processingTime="1000 seconds") \
    .start()
print(query)

<pyspark.sql.streaming.query.StreamingQuery object at 0x7f50b64d4950>


In [6]:
# Example to check query status and progress
print(query.status)  # Prints the status of the query
print(query.lastProgress)  # Prints the last progress made by the query
print(query.isActive)  # Checks if the query is still active


{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}
None
True


In [26]:
taxi_df_small = taxi_df.sample(fraction=0.0833)  # 1GB out of 12GB
small_count = taxi_df_small.count()
print(f"Small count: {small_count}")

Small count: 7516760


### Query 1
Frequent Routes

### Query 2
Profitable Areas