# Week 5

## Working with Spark

### Run Pyspark

In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("test") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/02 12:16:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
spark.version

'3.4.2'

### Loading FHV 2019-10 data

In [3]:
df = pd.read_csv("data/raw/fhv/2019/10/fhv_tripdata_2019_10.csv.gz", nrows=100)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dispatching_base_num    100 non-null    object 
 1   pickup_datetime         100 non-null    object 
 2   dropOff_datetime        100 non-null    object 
 3   PUlocationID            100 non-null    int64  
 4   DOlocationID            100 non-null    int64  
 5   SR_Flag                 0 non-null      float64
 6   Affiliated_base_number  99 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 5.6+ KB


In [5]:
spark.createDataFrame(df).schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropOff_datetime', StringType(), True), StructField('PUlocationID', LongType(), True), StructField('DOlocationID', LongType(), True), StructField('SR_Flag', DoubleType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [7]:
from pyspark.sql import types

In [9]:
fhv_schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True), 
    types.StructField('pickup_datetime', types.TimestampType(), True), 
    types.StructField('dropOff_datetime', types.TimestampType(), True), 
    types.StructField('PUlocationID', types.IntegerType(), True), 
    types.StructField('DOlocationID', types.IntegerType(), True), 
    types.StructField('SR_Flag', types.DoubleType(), True), 
    types.StructField('Affiliated_base_number', types.StringType(), True)
    ]
)

In [15]:
print("Processing fhv data for October 2019...")

input_path = 'data/raw/fhv/2019/10/'
output_path = 'data/pq/fhv/2019/10/'

df_fhv = spark.read \
    .option("header", "true") \
    .schema(fhv_schema) \
    .csv(input_path)

df_fhv \
    .repartition(6) \
    .write.parquet(output_path, mode='overwrite')

print("...Successfully saved fhv data for October 2019 to parquet.")

Processing fhv data for October 2019...




...Successfully saved fhv data for October 2019 to parquet.


                                                                                

In [16]:
!ls -lh data/pq/fhv/2019/10

total 39M
-rw-r--r-- 1 abhijit abhijit    0 Mar  2 12:37 _SUCCESS
-rw-r--r-- 1 abhijit abhijit 6.4M Mar  2 12:37 part-00000-5b14061e-581a-4f33-b0d0-f40dcd8fcec0-c000.snappy.parquet
-rw-r--r-- 1 abhijit abhijit 6.4M Mar  2 12:37 part-00001-5b14061e-581a-4f33-b0d0-f40dcd8fcec0-c000.snappy.parquet
-rw-r--r-- 1 abhijit abhijit 6.4M Mar  2 12:37 part-00002-5b14061e-581a-4f33-b0d0-f40dcd8fcec0-c000.snappy.parquet
-rw-r--r-- 1 abhijit abhijit 6.4M Mar  2 12:37 part-00003-5b14061e-581a-4f33-b0d0-f40dcd8fcec0-c000.snappy.parquet
-rw-r--r-- 1 abhijit abhijit 6.4M Mar  2 12:37 part-00004-5b14061e-581a-4f33-b0d0-f40dcd8fcec0-c000.snappy.parquet
-rw-r--r-- 1 abhijit abhijit 6.4M Mar  2 12:37 part-00005-5b14061e-581a-4f33-b0d0-f40dcd8fcec0-c000.snappy.parquet


The average size of parquet files is 6.4 MB

In [17]:
df_fhv.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropOff_datetime', TimestampType(), True), StructField('PUlocationID', IntegerType(), True), StructField('DOlocationID', IntegerType(), True), StructField('SR_Flag', DoubleType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [18]:
df_fhv \
    .withColumnRenamed("dropOff_dateime", "dropoff_datetime")

DataFrame[dispatching_base_num: string, pickup_datetime: timestamp, dropOff_datetime: timestamp, PUlocationID: int, DOlocationID: int, SR_Flag: double, Affiliated_base_number: string]

In [19]:
df_fhv.registerTempTable('trips')



In [25]:
spark.sql(
"""
SELECT 
    COUNT(1) AS October_15_trips    
FROM
    trips
WHERE 
    CAST(pickup_datetime AS DATE) = "2019-10-15"
"""
).show()

[Stage 15:>                                                         (0 + 1) / 1]

+----------------+
|October_15_trips|
+----------------+
|           62610|
+----------------+



                                                                                

In [39]:
spark.sql("""
SELECT 
    MAX(DATEDIFF(hour, pickup_datetime, dropOff_datetime)) as max_trip_duration_hours
FROM 
    trips
""").show()

[Stage 27:>                                                         (0 + 1) / 1]

+-----------------------+
|max_trip_duration_hours|
+-----------------------+
|                 631152|
+-----------------------+



                                                                                