#### https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz

#### Example for processing FHV Taxi data (Week 5 homework)

In [30]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql import types
from pyspark.sql import functions as F

In [3]:
# Create SparkSession
spark = SparkSession.builder.master("local[*]") \
                    .appName('fvh-homework-week5-application') \
                    .getOrCreate()

print(f'The PySpark {spark.version} version is running...')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/25 22:35:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/25 22:35:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/02/25 22:35:28 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


The PySpark 3.5.0 version is running...


In [5]:
df_fhv = spark.read\
    .option('header','True')\
    .csv('data/raw/fhv/2019/01/fhv_tripdata_2019_01.csv.gz')
df_fhv.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropOff_datetime', StringType(), True), StructField('PUlocationID', StringType(), True), StructField('DOlocationID', StringType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [7]:
df_fhv_schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True), 
    types.StructField('pickup_datetime', types.TimestampType(), True), 
    types.StructField('dropOff_datetime', types.TimestampType(), True), 
    types.StructField('PUlocationID', types.IntegerType(), True), 
    types.StructField('DOlocationID', types.IntegerType(), True), 
    types.StructField('SR_Flag', types.StringType(), True), 
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

### write to parquet by repartition to 6 partitions

In [10]:
def write_parquet(file, schema):
    years = [2019]
    for year in years:
        for month in range(10,11):
            try:
                print(f'processing data for {file} trip data for {year}/{month:02d}')
                input_path = f'data/raw/{file}/{year}/{month:02d}'
                output_path = f'data/pq/{file}/{year}/{month:02d}'
                df = spark.read\
                     .option('header','True')\
                     .schema(schema)\
                     .csv(input_path)
                df.repartition(6).write.parquet(output_path, mode='overwrite')  
            except:  
                print('error occured')
    print('write to parquet completed for ',file)

In [12]:
write_parquet('fhv',df_fhv_schema)

processing data for fhv trip data for 2019/01


                                                                                

processing data for fhv trip data for 2019/02


                                                                                

processing data for fhv trip data for 2019/03


                                                                                

processing data for fhv trip data for 2019/04


                                                                                

processing data for fhv trip data for 2019/05


                                                                                

processing data for fhv trip data for 2019/06


                                                                                

processing data for fhv trip data for 2019/07


                                                                                

processing data for fhv trip data for 2019/08


                                                                                

processing data for fhv trip data for 2019/09


                                                                                

processing data for fhv trip data for 2019/10


                                                                                

processing data for fhv trip data for 2019/11


                                                                                

processing data for fhv trip data for 2019/12


[Stage 36:>                                                         (0 + 6) / 6]

write to parquet completed for  fhv


                                                                                

### average file size

In [178]:
!du -h data/pq/fhv/2019/10/*

  0B	data/pq/fhv/2019/10/_SUCCESS
7.1M	data/pq/fhv/2019/10/part-00000-076e2dfb-9ec2-4121-8581-5e27131160b5-c000.snappy.parquet
7.1M	data/pq/fhv/2019/10/part-00001-076e2dfb-9ec2-4121-8581-5e27131160b5-c000.snappy.parquet
7.1M	data/pq/fhv/2019/10/part-00002-076e2dfb-9ec2-4121-8581-5e27131160b5-c000.snappy.parquet
7.1M	data/pq/fhv/2019/10/part-00003-076e2dfb-9ec2-4121-8581-5e27131160b5-c000.snappy.parquet
7.1M	data/pq/fhv/2019/10/part-00004-076e2dfb-9ec2-4121-8581-5e27131160b5-c000.snappy.parquet
7.1M	data/pq/fhv/2019/10/part-00005-076e2dfb-9ec2-4121-8581-5e27131160b5-c000.snappy.parquet


### count of taxi trips for 15th october

In [180]:
df_fhv = spark.read.parquet('data/pq/fhv/2019/10')

In [182]:
df_fhv.columns

['dispatching_base_num',
 'pickup_datetime',
 'dropOff_datetime',
 'PUlocationID',
 'DOlocationID',
 'SR_Flag',
 'Affiliated_base_number']

In [184]:
df_fhv.show(5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02594|2019-10-19 12:32:00|2019-10-19 13:03:00|         157|         133|   NULL|                B02594|
|              B00647|2019-10-07 19:26:49|2019-10-07 19:42:43|         264|          78|   NULL|                B00647|
|              B01145|2019-10-20 02:30:04|2019-10-20 02:37:54|         264|         244|   NULL|                B01145|
|              B03060|2019-10-13 18:58:21|2019-10-13 19:11:24|         264|         123|   NULL|                B02888|
|              B02418|2019-10-07 21:40:00|2019-10-07 22:22:00|         264|         264|   NULL|                B00280|
+--------------------+------------------

In [186]:
df_fhv\
    .filter(F.to_date(df_fhv.pickup_datetime) == '2019-10-15')\
    .count()

62610

### What is the length of the longest trip in the dataset in hours?

In [188]:
df_fhv.createOrReplaceTempView('fhv_trips_data')

In [208]:
spark.sql("""
SELECT MAX(TIMESTAMPDIFF(HOUR, pickup_datetime, dropoff_datetime)) AS longest_trip_hours from fhv_trips_data 
WHERE MONTH(pickup_datetime) = 10
""").show()

+------------------+
|longest_trip_hours|
+------------------+
|            631152|
+------------------+



### Load taxi zone lookup data to spark df

In [35]:
df_taxi_zone = spark.read\
    .option('header','True')\
    .csv('taxi_zone_lookup.csv')
df_taxi_zone.schema

StructType([StructField('LocationID', StringType(), True), StructField('Borough', StringType(), True), StructField('Zone', StringType(), True), StructField('service_zone', StringType(), True)])

In [37]:
df_taxi_zone.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [41]:
df_taxi_zone_schema = types.StructType([
    types.StructField('LocationID', types.IntegerType(), True), 
    types.StructField('Borough', types.StringType(), True), 
    types.StructField('Zone', types.StringType(), True), 
    types.StructField('service_zone', types.StringType(), True)
])

In [43]:
df_taxi_zone = spark.read\
         .option('header','True')\
         .schema(df_taxi_zone_schema)\
         .csv('taxi_zone_lookup.csv')

In [45]:
df_taxi_zone.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [210]:
df_taxi_zone.createOrReplaceTempView('taxi_zone_data')

In [213]:
df_fhv.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropOff_datetime', TimestampType(), True), StructField('PUlocationID', IntegerType(), True), StructField('DOlocationID', IntegerType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [217]:
df_taxi_zone.schema

StructType([StructField('LocationID', IntegerType(), True), StructField('Borough', StringType(), True), StructField('Zone', StringType(), True), StructField('service_zone', StringType(), True)])

````east frequent pickup location zone

Load the zone lookup data into a temp view in Spark
Zone Data

Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?
````

In [253]:
spark.sql("""
SELECT tz1.Zone pickup_zone, COUNT(*) pickup_count
FROM fhv_trips_data , taxi_zone_data tz1 , taxi_zone_data tz2
WHERE 
fhv_trips_data.PUlocationID  =   tz1.LocationID AND
fhv_trips_data.DOlocationID  =   tz2.LocationID
GROUP BY pickup_zone
ORDER BY pickup_count ASC
""").show(1)

+-----------+------------+
|pickup_zone|pickup_count|
+-----------+------------+
|Jamaica Bay|           1|
+-----------+------------+
only showing top 1 row

