In [1]:
# Names of the remote files referenced by URL_FILES.
FHVHV_FILE = "fhvhv_tripdata_2021-06.csv.gz"
ZONES_FILE = "taxi_zone_lookup.csv"

# Base directories used by the notebook: raw downloads and Parquet output.
INPUT_PATH = "data/raw"
OUTPUT_PATH = "data/pq"

# Each URL has the shape <release base>/<release tag>/<file name>.
_RELEASES_URL = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download"
URL_FILES = [
    f"{_RELEASES_URL}/{release_tag}/{remote_name}"
    for release_tag, remote_name in (("fhvhv", FHVHV_FILE), ("misc", ZONES_FILE))
]

In [2]:
from pathlib import Path
from urllib import request

# Fetch every URL in URL_FILES into INPUT_PATH/<file_type>/<file_name>, where
# <file_type> and <file_name> are the last two segments of the URL.
for url in URL_FILES:
    file_type, file_name = url.split("/")[-2:]
    local_directory = Path(INPUT_PATH) / file_type
    raw_file = local_directory / file_name

    local_directory.mkdir(parents=True, exist_ok=True)

    # Skip files fetched by an earlier run so re-executing the notebook does
    # not download everything again. Delete the local file to force a refresh.
    if raw_file.exists():
        continue

    # Download under a temporary name and rename only once the transfer has
    # finished; otherwise an interrupted download would leave a truncated file
    # that the existence check above would mistake for a complete one.
    # NOTE(review): the leading dot is meant to keep a leftover partial file out
    # of Spark's directory reads (hidden files are expected to be ignored) -
    # confirm against the Spark version in use.
    partial_file = raw_file.with_name(f".{file_name}.part")
    request.urlretrieve(url, str(partial_file))
    partial_file.replace(raw_file)

In [3]:
import pyspark
from pyspark.sql import SparkSession, types

# Configure the session step by step: run against the local master
# ("local[*]") under the application name "Homework_05", then obtain the
# session via getOrCreate().
spark_builder = SparkSession.builder
spark_builder = spark_builder.master("local[*]")
spark_builder = spark_builder.appName("Homework_05")
spark = spark_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/06 19:41:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
input_path = f"{INPUT_PATH}/fhvhv/"
output_path = f"{OUTPUT_PATH}/fhvhv/"

# Explicit schema for the FHVHV CSV; every column is declared nullable.
fhvhv_schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

# Read every CSV file under input_path, treating the first line as a header.
fhvhv_raw_df = (
    spark.read
    .option("header", "true")
    .schema(fhvhv_schema)
    .csv(input_path)
)

# Persist the data as 12 Parquet partitions, replacing any previous output.
fhvhv_raw_df \
    .repartition(12) \
    .write.parquet(output_path, mode="overwrite")

# Back the SQL view with the Parquet copy that was just written rather than
# with the CSV reader: a view on the CSV DataFrame makes every later query
# re-read and re-parse the compressed CSV, and the Parquet output would go
# unused.
fhvhv_df = spark.read.parquet(output_path)
fhvhv_df.createOrReplaceTempView("fhvhv_trips")

                                                                                

In [18]:
input_path = f"{INPUT_PATH}/misc/"
output_path = f"{OUTPUT_PATH}/misc/"

# Schema of the zone lookup CSV, built field by field; all columns nullable.
zones_schema = (
    types.StructType()
    .add('LocationID', types.IntegerType(), True)
    .add('Borough', types.StringType(), True)
    .add('Zone', types.StringType(), True)
    .add('service_zone', types.StringType(), True)
)

# Load the CSV file(s) under input_path with a header row and the schema above.
zones_df = spark.read.csv(input_path, schema=zones_schema, header="True")

# Store a Parquet copy (replacing any previous one) and expose the data to
# Spark SQL under the name "zones".
zones_df.write.mode("overwrite").parquet(output_path)
zones_df.createOrReplaceTempView("zones")

In [5]:
# Count the rows of fhvhv_trips whose pickup timestamp falls on 2021-06-15.
trips_count_query = """
select
    count(*) as trips_count
from
    fhvhv_trips
where
    cast(pickup_datetime as date) = '2021-06-15'
"""
trips_count_df = spark.sql(trips_count_query)
trips_count_df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+-----------+
|trips_count|
+-----------+
|     452470|
+-----------+



                                                                                

In [6]:
# Longest trip in hours, rounded to two decimals.
# The duration is computed as a difference of epoch seconds instead of casting
# the timestamp difference (an interval) to bigint.
# NOTE(review): interval-to-integer casts do not appear to be accepted by every
# Spark release, whereas unix_timestamp() is a long-standing built-in - confirm
# against the target Spark version.
spark.sql("""
select
    round(max(
        (unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) / 3600
    ), 2) as max_trip_duration_hours
from
    fhvhv_trips
""").show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----------------------+
|max_trip_duration_hours|
+-----------------------+
|                  66.88|
+-----------------------+



                                                                                

In [21]:
# Name of the zone with the most pickups: join trips to zones on the pickup
# location id, count rows per zone name, and keep only the top entry.
top_pickup_zone_query = """
select 
    z.Zone as zone_name
from 
    fhvhv_trips ft
    join zones z on ft.PULocationID = z.LocationID
group by
    z.Zone
order by
    count(*) desc
limit
    1
"""
top_pickup_zone_df = spark.sql(top_pickup_zone_query)
top_pickup_zone_df.show()

[Stage 19:>                                                         (0 + 1) / 1]

+-------------------+
|          zone_name|
+-------------------+
|Crown Heights North|
+-------------------+



                                                                                