## Homework Solution

In [1]:
import os
import sys

# have my file path set to python not anaconda
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.path.dirname(sys.executable)

'C:\\Users\\Abdulkadir\\anaconda3'

In [2]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd

In [3]:
spark = SparkSession.builder \
.master("local[*]") \
.appName('Week5-Homework').getOrCreate()

In [4]:
# Download the HVFHV data for february 2021
# !curl -sS https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-02.parquet > HVFHV.parquet

In [5]:
df = spark.read.parquet('HVFHV.parquet', header=True, inferSchema=True)

### Data exploration

In [6]:
# shape of df
df.count(), len(df.columns)

(11613942, 24)

In [7]:
df.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'originating_base_num',
 'request_datetime',
 'on_scene_datetime',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'shared_request_flag',
 'shared_match_flag',
 'access_a_ride_flag',
 'wav_request_flag',
 'wav_match_flag']

In [8]:
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul

In [9]:
# NULL VALUES
from pyspark.sql.functions import col, count
for column in df.columns:
    print(column, end=': ')
    print(df.filter(col(column).isNull()).count())


hvfhs_license_num: 0
dispatching_base_num: 0
originating_base_num: 3319132
request_datetime: 1
on_scene_datetime: 3318817
pickup_datetime: 0
dropoff_datetime: 0
PULocationID: 0
DOLocationID: 0
trip_miles: 0
trip_time: 0
base_passenger_fare: 0
tolls: 0
bcf: 0
sales_tax: 0
congestion_surcharge: 0
airport_fee: 11613181
tips: 0
driver_pay: 0
shared_request_flag: 0
shared_match_flag: 0
access_a_ride_flag: 0
wav_request_flag: 0
wav_match_flag: 0


In [11]:
import pandas as pd
pd.set_option('display.max_columns', None)
df.limit(2).toPandas().head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02764,B02764,2021-01-31 23:59:00,2021-02-01 00:10:19,2021-02-01 00:10:40,2021-02-01 00:21:09,35,39,2.06,629,17.14,0.0,0.51,1.52,0.0,,0.0,9.79,N,N,,N,N
1,HV0003,B02764,B02764,2021-02-01 00:13:35,2021-02-01 00:25:23,2021-02-01 00:27:23,2021-02-01 00:44:01,39,35,3.15,998,32.11,0.0,0.96,2.85,0.0,,0.0,24.01,N,N,,N,N


### Q2) Repartition to 24 partitions and find how the file sizes

In [12]:
df = df.repartition(24)

In [13]:
df.write.parquet('homework/raw/', mode='overwrite')

Each file is 21,826KB

### Q3) How many taxi trips were there on February 15?

Consider only trips that started on February 15.

In [53]:
spark.sql("""
SELECT 
    COUNT(pickup_datetime)
FROM 
    taxi_data
WHERE
    pickup_datetime BETWEEN '2021-02-15 00:00:00' AND '2021-02-15 23:59:59'
""").show()

+----------------------+
|count(pickup_datetime)|
+----------------------+
|                367170|
+----------------------+



### Q4) Longest trip for each day

Now calculate the duration for each trip.

Trip starting on which day was the longest?

In [17]:
df.createOrReplaceTempView('taxi_data')

In [24]:
spark.sql("""
SELECT 
    pickup_datetime,
    MAX(trip_time)
FROM 
    taxi_data
GROUP BY
    pickup_datetime
ORDER BY
    MAX(trip_time) DESC
""").show(1)

+-------------------+--------------+
|    pickup_datetime|max(trip_time)|
+-------------------+--------------+
|2021-02-11 13:40:44|         75540|
+-------------------+--------------+
only showing top 1 row



### Q5) Most frequent dispatching_base_num

Now find the most frequently occurring dispatching_base_num in this dataset.

How many stages this spark job has?

In [54]:
spark.sql("""
SELECT
    dispatching_base_num,
    COUNT(dispatching_base_num)
FROM 
    taxi_data
GROUP BY
    dispatching_base_num
ORDER BY
    COUNT(dispatching_base_num) DESC
""").show(1)

+--------------------+---------------------------+
|dispatching_base_num|count(dispatching_base_num)|
+--------------------+---------------------------+
|              B02510|                    3233664|
+--------------------+---------------------------+
only showing top 1 row



3 Stages, 2 Skipped

### Q6( Most common locations pair

Find the most common pickup-dropoff pair.

For example:

"Jamaica Bay / Clinton East"

Enter two zone names separated by a slash

If any of the zone names are unknown (missing), use "Unknown". For example, "Unknown / Clinton East".capitalize

- Only have PULocationID and DOLocation ID

In [41]:
spark.sql("""
SELECT 
    CONCAT(PULocationID, '/', DOLocationID) AS pickup_dropoff_pair,
    COUNT(CONCAT(PULocationID, '/', DOLocationID)) AS count
FROM 
    taxi_data
GROUP BY
    CONCAT(PULocationID, '/', DOLocationID)
ORDER BY
    count DESC
""").show()

+-------------------+-----+
|pickup_dropoff_pair|count|
+-------------------+-----+
|              76/76|45041|
|              26/26|37329|
|              39/39|28026|
|              61/61|25976|
|              14/14|17934|
|                7/7|14688|
|            129/129|14688|
|              42/42|14481|
|              37/37|14424|
|              89/89|13976|
|            216/216|13716|
|              35/35|12829|
|            132/265|12542|
|             188/61|11814|
|              95/95|11548|
|              36/37|11491|
|              37/36|11487|
|             61/188|11462|
|             61/225|11342|
|            188/188|11308|
+-------------------+-----+
only showing top 20 rows

