# Analyzing New York City Taxi Data with Spark Structured Streaming

## Setup

In [56]:
import os
from delta import configure_spark_with_delta_pip

import time
import uuid
from pyspark.sql.functions import explode, lead, col, unix_timestamp, sum
from pyspark.sql.functions import split
import pyspark.sql.functions as F
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

In [18]:
builder = SparkSession.builder.appName("NYTaxiTrips") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [19]:
BOOTSTRAP_SERVERS = os.environ.get('BOOTSTRAP_SERVERS')
assert BOOTSTRAP_SERVERS is not None, 'BOOTSTRAP_SERVERS must be set'

TRIP_TOPIC = 'trips'
FARE_TOPIC = 'fares'

Be sure to start the stream on Kafka!

In [88]:
plain_trip_schema = StructType(
    [
        StructField("medallion", StringType(), False),
        StructField("hack_license", StringType(), False),
        StructField("vendor_id", StringType(), False),
        StructField("rate_code", StringType(), False),
        StructField("store_and_fwd_flag", StringType(), False),
        StructField("pickup_datetime", TimestampType(), False),
        StructField("dropoff_datetime", TimestampType(), False),
        StructField("passenger_count", StringType(), False),
        StructField("trip_time_in_secs", StringType(), False),
        StructField("trip_distance", StringType(), False),
        StructField("pickup_longitude", StringType(), False),
        StructField("pickup_latitude", StringType(), False),
        StructField("dropoff_longitude", StringType(), False),
        StructField("dropoff_latitude", StringType(), False),
        StructField("timestamp", TimestampType(), False),
    ]
)

casted_trip_schema = StructType(
    [
        StructField("medallion", StringType(), False),
        StructField("hack_license", StringType(), False),
        StructField("vendor_id", StringType(), False),
        StructField("rate_code", IntegerType(), False),
        StructField("store_and_fwd_flag", StringType(), False),
        StructField("pickup_datetime", TimestampType(), False),
        StructField("dropoff_datetime", TimestampType(), False),
        StructField("passenger_count", IntegerType(), False),
        StructField("trip_time_in_secs", DoubleType(), False),
        StructField("trip_distance", DoubleType(), False),
        StructField("pickup_longitude", DoubleType(), False),
        StructField("pickup_latitude", DoubleType(), False),
        StructField("dropoff_longitude", DoubleType(), False),
        StructField("dropoff_latitude", DoubleType(), False),
        StructField("timestamp", TimestampType(), False),

    ]
)

fare_schema = StructType(
    [
        StructField("medallion", StringType(), False),
        StructField("hack_license", StringType(), False),
        StructField("vendor_id", StringType(), False),
        StructField("pickup_datetime", TimestampType(), False),
        StructField("payment_type", StringType(), False),
        StructField("fare_amount", DoubleType(), False),
        StructField("surcharge", DoubleType(), False),
        StructField("mta_tax", DoubleType(), False),
        StructField("tip_amount", DoubleType(), False),
        StructField("tolls_amount", DoubleType(), False),
        StructField("total_amount", DoubleType(), False),
        StructField("timestamp", TimestampType(), False),

    ]
)

In [89]:
from pyspark.sql.functions import from_json

lines = (spark.readStream                        # Get the DataStreamReader
  .format("kafka")                                 # Specify the source format as "kafka"
  .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS) # Configure the Kafka server name and port
  .option("subscribe", TRIP_TOPIC)                       # Subscribe to the "en" Kafka topic 
  .option("startingOffsets", "earliest")           # The start point when a query is started
  .option("maxOffsetsPerTrigger", 100)             # Rate limit on max offsets per trigger interval
  .load()
)

## Utils

In [97]:
def active_streams():
    return [a.name for a in spark.streams.active]


def stop_stream(name):
    for stream in spark.streams.active:
        if stream.name == name:
            stream.stop()
            return True
    return False


def inmemory_stream(df, wait_seconds=5):
    temp_table_name = f"inmemory_{uuid.uuid4().hex}"

    query = (
        df.writeStream.outputMode("append")
        .format("memory")
        .queryName(temp_table_name)
        .start()
    )

    time.sleep(wait_seconds)
    result = spark.sql(f"SELECT * FROM {temp_table_name}")
    stop_stream(query.name)
    return result

## Tasks

### QUERY-1

Utilization over a window of 5, 10, and 15 minutes per taxi/driver. This can be computed by computing the idle time per taxi. How does it change? Is there an optimal window?

In [103]:
casted_trip_schema.fields[0].dataType, casted_trip_schema.fields[0].name

(StringType(), 'medallion')

In [104]:
trips_df = lines.select(from_json(col("value").cast("string"), plain_trip_schema).alias("parsed_value"))
trips_df = trips_df.select("parsed_value.*")

for field in casted_trip_schema.fields:
    trips_df = trips_df.withColumn(field.name, col(field.name).cast(field.dataType))

trips_df.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [110]:
from pyspark.sql.functions import window, col, max, min

window_durations = [5 * 60, 10 * 60, 15 * 60]

for window_duration in window_durations:
    window_col = window("pickup_datetime", "{} seconds".format(window_duration))
    df_windowed = trips_df.groupBy("medallion", window_col).agg((max("dropoff_datetime").cast("long") - min("pickup_datetime").cast("long")).alias("busy_time"))
    
    df_windowed = df_windowed.withColumn("idle_time", window_duration - col("busy_time"))
    
    df_windowed = df_windowed.withColumn("utilization", col("busy_time") / window_duration)
    
    query = (df_windowed.writeStream
      .outputMode("complete")
      .format("memory")
      .queryName("utilization_{}_minutes".format(window_duration // 60))
      .trigger(processingTime="5 second")
      .start())

spark.sql("SELECT * FROM utilization_5_minutes").show()
spark.sql("SELECT * FROM utilization_10_minutes").show()
spark.sql("SELECT * FROM utilization_15_minutes").show()

+---------+------+---------+---------+-----------+
|medallion|window|busy_time|idle_time|utilization|
+---------+------+---------+---------+-----------+
+---------+------+---------+---------+-----------+

+---------+------+---------+---------+-----------+
|medallion|window|busy_time|idle_time|utilization|
+---------+------+---------+---------+-----------+
+---------+------+---------+---------+-----------+

+---------+------+---------+---------+-----------+
|medallion|window|busy_time|idle_time|utilization|
+---------+------+---------+---------+-----------+
+---------+------+---------+---------+-----------+



### QUERY-2 

The average time it takes for a taxi to find its next fare(trip) per destination borough. This can be computed by finding the time difference, e.g. in seconds, between the trip's drop off and the next trip's pick up within a given unit of time

In [None]:
# remember you can register another stream


### QUERY-3

The number of trips that started and ended within the same borough in the last hour

In [None]:
# remember you can register another stream


### QUERY-4 

The number of trips that started in one borough and ended in another one in the last hour