In [50]:
import findspark
import os
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, hour

In [4]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

# Question 1:

## Install Spark and PySpark

- Install Spark
- Run PySpark
- Create a local spark session
- Execute spark.version.

**What's the output?**

In [5]:
print(spark.version)

3.3.2


# Question 2:
## FHV October 2019

Read the October 2019 FHV into a Spark Dataframe with a schema as we did in the lessons.

Repartition the Dataframe to 6 partitions and save it to parquet.

What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)? Select the answer which most closely matches.

- 1MB
- 6MB
- 25MB
- 87MB

In [6]:
df = spark.read.csv('fhvhv\\fhv_tripdata_2019-10.csv.gz', header=True, inferSchema=True)
df.show(10)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   null|                B00014|
|     B00021         |2019-10-01 00:00:4

In [12]:
repart_df = df.repartition(6)

In [13]:
repart_df.write.parquet('fhvhv/output/fhvhv_repart_6.parquet')

In [15]:
import os

In [32]:
# Directory path
path = 'fhvhv/output/fhvhv_repart_6.parquet'

# List all parquet files (ignoring hidden .crc files)
parquet_files = [f for f in os.listdir(path) if f.endswith('.parquet') and not f.startswith('.')]

# Check if files are being filtered correctly
print(parquet_files)

['part-00000-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet', 'part-00001-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet', 'part-00002-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet', 'part-00003-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet', 'part-00004-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet', 'part-00005-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet']


In [31]:
os.listdir('fhvhv/output/fhvhv_repart_6.parquet')

['.part-00000-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet.crc',
 '.part-00001-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet.crc',
 '.part-00002-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet.crc',
 '.part-00003-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet.crc',
 '.part-00004-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet.crc',
 '.part-00005-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet.crc',
 '._SUCCESS.crc',
 'part-00000-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00001-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00002-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00003-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00004-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00005-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 '_SUCCESS']

In [40]:
parquet_files = [file for file in os.listdir('fhvhv/output/fhvhv_repart_6.parquet') if file.endswith('.parquet') and not file.startswith('.')]
total_size = sum(os.path.getsize(os.path.join('fhvhv/output/fhvhv_repart_6.parquet',f)) for f in parquet_files)

In [41]:
parquet_files

['part-00000-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00001-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00002-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00003-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00004-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet',
 'part-00005-b7d65ad2-4e86-4bba-86af-af6962aa42ec-c000.snappy.parquet']

In [42]:
total_size_mb = total_size / (1024 * 1024)

In [43]:
average_size_mb = total_size_mb / len(parquet_files)
print(average_size_mb)

6.350232283274333


# Question 3:
## Count records

__How many taxi trips were there on the 15th of October?__

Consider only trips that started on the 15th of October.

- 108,164
- 12,856
- 452,470
- 62,610

In [24]:
pickup_date_df = df.withColumn("pickup_date",to_date(col('pickup_datetime'))).select('pickup_date')

In [28]:
pickup_date_df.filter(pickup_date_df.pickup_date == '2019-10-15').count()

62610

# Question 4:
## Longest trip for each day

__What is the length of the longest trip in the dataset in hours?__

- 631,152.50 Hours
- 243.44 Hours
- 7.68 Hours
- 3.32 Hours

In [33]:
df = df.withColumn('pickup_date',to_date(col('pickup_datetime')))
df = df.withColumn('dropOff_date',to_date(col('dropOff_datetime')))

In [59]:
df.withColumn('duration',df['pickup_datetime'] - df['dropOff_datetime']).select('duration').show()

+--------------------+
|            duration|
+--------------------+
|INTERVAL '-0 00:1...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:2...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:1...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:2...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:1...|
|INTERVAL '-0 00:1...|
|INTERVAL '-0 00:1...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:1...|
|INTERVAL '-0 00:0...|
|INTERVAL '-0 00:2...|
|INTERVAL '-0 00:4...|
+--------------------+
only showing top 20 rows

