# Big Data Management Project 2:
## DEBS GRAND CHALLENGE 2015

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, regexp_extract, col, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType

In [23]:
# Obtain the notebook's Spark session (an already-running one is reused),
# registered under the project's application name with Hive support enabled.
spark = SparkSession.builder.appName('BDM_Project2').enableHiveSupport().getOrCreate()

### Query 0
Data Cleansing and Setup

In [25]:
# Explicit schema for the trip CSV, so the reader produces typed columns
# directly instead of having to work the types out itself (faster load).
# Every column is declared nullable.
# NOTE(review): the column order and count must match the file's layout;
# that cannot be checked from this cell — confirm against the input CSV.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("medallion", StringType()),
        ("hack_license", StringType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("passenger_count", IntegerType()),
        ("trip_distance", DoubleType()),
        ("pickup_longitude", DoubleType()),
        ("pickup_latitude", DoubleType()),
        ("dropoff_longitude", DoubleType()),
        ("dropoff_latitude", DoubleType()),
        ("payment_type", StringType()),
        ("fare_amount", DoubleType()),
        ("surcharge", DoubleType()),
        ("mta_tax", DoubleType()),
        ("tip_amount", DoubleType()),
        ("tolls_amount", DoubleType()),
    ]
])

# Load the trip CSV into a single DataFrame: the first line is treated as a
# header row and the explicit `schema` is applied to the columns.
taxi_df_og = spark.read.csv("input/sorted_data.csv", schema=schema, header=True)

# Keep only well-formed trips. A row survives when ALL of these hold:
#   * medallion and hack_license are 32-character hexadecimal strings,
#   * pickup_datetime and dropoff_datetime are non-null,
#   * passenger_count, trip_distance, fare_amount and tip_amount are > 0.
# A comparison against NULL yields NULL, which filter() treats as false, so
# rows missing any of these values are dropped as well.
# NOTE(review): tip_amount > 0 also discards every trip recorded with a zero
# tip — confirm this is intended before relying on the data for Query 2.
HEX32_PATTERN = r"^[a-fA-F0-9]{32}$"

taxi_df = taxi_df_og.filter(
    # rlike() is the direct "does the regex match" test; it selects the same
    # rows as comparing regexp_extract(..., 0) with the empty string.
    col("medallion").rlike(HEX32_PATTERN) &
    col("hack_license").rlike(HEX32_PATTERN) &
    col("pickup_datetime").isNotNull() &
    col("dropoff_datetime").isNotNull() &
    (col("passenger_count") > 0) &
    (col("trip_distance") > 0) &
    (col("fare_amount") > 0) &
    (col("tip_amount") > 0)
)

# Add epoch-second versions of the pickup/dropoff timestamps and the trip
# duration in seconds, then keep trips that last longer than 0 s and at most
# MAX_TRIP_SECONDS. Finally drop any row that still has a NULL in any column.
MAX_TRIP_SECONDS = 4 * 60 * 60  # upper bound on a plausible trip: 4 hours

taxi_df = (
    taxi_df
    .withColumns({
        "pickup_ts": unix_timestamp("pickup_datetime"),
        "dropoff_ts": unix_timestamp("dropoff_datetime"),
    })
    # Separate step: "duration" reads the two columns created just above.
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts"))
    .filter((col("duration") > 0) & (col("duration") <= MAX_TRIP_SECONDS))
    .dropna()
)

# Preview the first five cleaned rows with full (untruncated) cell contents.
taxi_df.show(n=5, truncate=False)

# Optional sanity check: how many rows the cleansing steps removed.
# Each count() is a separate Spark action over its DataFrame.
original_count, filtered_count = taxi_df_og.count(), taxi_df.count()
filtered_out_count = original_count - filtered_count

print(
    f"Original count: {original_count}",
    f"Filtered count: {filtered_count}",
    f"Rows filtered out: {filtered_out_count}",
    sep="\n",
)

+--------------------------------+--------------------------------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+
|medallion                       |hack_license                    |pickup_datetime    |dropoff_datetime   |passenger_count|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|pickup_ts |dropoff_ts|duration|
+--------------------------------+--------------------------------+-------------------+-------------------+---------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+----------+----------+--------+
|FD39403FDE46B6C753DDD6518A4365D7|2B36D07A27BB35D7DF7170C83EEAA196|2013-01-01 00:01:00|2

In [26]:
# Draw a random subset of roughly 1/12 of the cleaned trips (about 1 GB of the
# 12 GB input) for faster iteration. sample() treats the fraction as a target,
# not an exact size. The fixed seed makes the subset — and every number
# derived from it — repeatable across kernel restarts.
taxi_df_small = taxi_df.sample(fraction=0.0833, seed=42)
small_count = taxi_df_small.count()
print(f"Small count: {small_count}")

Small count: 7516760


### Query 1
Frequent Routes

### Query 2
Profitable Areas