In [13]:
import pyspark
from pyspark.sql import SparkSession

# Start a local Spark session that uses all available cores ("local[*]"),
# or reuse the session that is already running in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('homework5')
    .getOrCreate()
)

24/03/05 21:15:21 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [14]:
# Show the version of the Spark runtime behind this session.
spark.version

'3.3.2'

In [4]:
# Read the gzipped FHV trip CSV for 2019-10. The first row is treated as the
# header and Spark infers the column types from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv('fhv_data/fhv_tripdata_2019-10.csv.gz')
)

                                                                                

In [8]:
# Redistribute the trips into 6 partitions and write them as parquet under
# 'data/', replacing whatever is already stored there.
df.repartition(6).write.mode('overwrite').parquet('data/')

                                                                                

In [9]:
# Register the trips DataFrame as the temp view `fhv` so it can be queried
# through spark.sql(); an existing view with the same name is replaced.
df.createOrReplaceTempView('fhv')

In [15]:
# Print the column names, types and nullability of the trips DataFrame.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [16]:
# Count the FHV trips whose pickup happened on 15 October 2019.
# The query reads the `fhv` temp view, reduces pickup_datetime to a calendar
# date, keeps only 2019-10-15, and shows that date next to its trip count.
spark.sql("""
SELECT
    DATE(pickup_datetime) as pickup_date,
    count(*) AS total_trips
FROM fhv
WHERE
    DATE(pickup_datetime) = '2019-10-15'
GROUP BY 1
""").show()

[Stage 8:>                                                          (0 + 1) / 1]

+-----------+-----------+
|pickup_date|total_trips|
+-----------+-----------+
| 2019-10-15|      62610|
+-----------+-----------+



                                                                                

In [19]:
# Find the longest trip in the dataset: the maximum difference, in hours,
# between pickup_datetime and dropOff_datetime over the `fhv` temp view.
# NOTE(review): the result is displayed as an integer, so partial hours appear
# to be dropped by TIMESTAMPDIFF(HOUR, ...) — confirm whether fractional hours
# are needed for the answer.
spark.sql("""
SELECT
     MAX(TIMESTAMPDIFF(HOUR, pickup_datetime, dropOff_datetime)) AS longest_trip_hours
FROM fhv
""").show()

[Stage 17:>                                                         (0 + 1) / 1]

+------------------+
|longest_trip_hours|
+------------------+
|            631152|
+------------------+



                                                                                

In [20]:
# Load the zone lookup table from the parquet files under 'zones/'.
df_zones = spark.read.parquet('zones/')

In [24]:
# Attach zone details to every trip by matching the trip's pickup location id
# to the zone table's LocationID, so pickups can be counted per Zone name
# (used to find the LEAST frequent pickup location Zone).
# Inner join: trips without a matching zone row are not kept.
df_join = df.join(
    df_zones,
    on=df["PUlocationID"] == df_zones["LocationID"],
    how="inner",
)

In [26]:
# Preview the zone lookup table (LocationID, Borough, Zone, service_zone).
df_zones.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [32]:
# Count trips per pickup Zone, sort by ascending count, and keep the first ten
# rows, i.e. the ten zones with the fewest pickups.
df_least_freq_zone = (
    df_join
    .groupBy("Zone")
    .count()
    .orderBy("count")
    .limit(10)
)

In [33]:
# Display the ten least frequent pickup zones.
# The result is bound to `df_least_freq_zone`; the bare name `least_freq_zone`
# is not assigned in any visible cell, so the previous call only worked through
# leftover kernel state and raises NameError after Restart & Run All.
df_least_freq_zone.show()

[Stage 26:>                                                         (0 + 1) / 1]

+--------------------+-----+
|                Zone|count|
+--------------------+-----+
|         Jamaica Bay|    1|
|Governor's Island...|    2|
| Green-Wood Cemetery|    5|
|       Broad Channel|    8|
|     Highbridge Park|   14|
|        Battery Park|   15|
|Saint Michaels Ce...|   23|
|Breezy Point/Fort...|   25|
|Marine Park/Floyd...|   26|
|        Astoria Park|   29|
+--------------------+-----+



                                                                                