In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types
import os
import requests
import gzip
from pyspark.sql.functions import col

In [4]:

spark = SparkSession.builder \
    .master("local[*]") \
    .appName('process-taxi-data') \
    .getOrCreate()

# Q1 spark version
print(f"Spark Version - {spark.version}")


Spark Version - 3.5.0


In [5]:

schema = types.StructType([
    types.StructField('hvfhs_license_num', types.StringType(), True),
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True)
])
#  Load the dataframe
file_path = 'D:/data_engenering/week4/fhvhv/2019/fhv_tripdata_2019-10.csv'
print(f"Reading - {file_path}")

df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv(file_path)

#  Partition the data frame
folder_path = 'D:/data_engenering/week4/fhvhv/partitions'
print(f"Creating partitions - 6 folder {folder_path}")
df.head()
df = df.repartition(6)
df.write.mode('overwrite').parquet(folder_path, compression='gzip')


Reading - D:/data_engenering/week4/fhvhv/2019/fhv_tripdata_2019-10.csv
Creating partitions - 6 folder D:/data_engenering/week4/fhvhv/partitions


In [6]:
import os

# Path to the directory containing Parquet files
directory = 'D:/data_engenering/week4/fhvhv/partitions'

# List all Parquet files in the directory
parquet_files = [file for file in os.listdir(directory) if file.endswith('.parquet')]

total_size_bytes = 0

# Iterate over each Parquet file
for file in parquet_files:
    # Get the file path
    file_path = os.path.join(directory, file)
    
    # Get the size of the file in bytes
    file_size_bytes = os.path.getsize(file_path)
    
    # Add the file size to the total siz
    total_size_bytes += file_size_bytes

# Calculate the average size in MB
average_size_mb = total_size_bytes / len(parquet_files) / (1024 * 1024)

print(f'Average size of Parquet files: {average_size_mb:.2f} MB')


Average size of Parquet files: 4.31 MB


In [7]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("TaxiTripCount") \
    .getOrCreate()

# Path to the directory containing Parquet files
directory = 'D:/data_engenering/week4/fhvhv/partitions'

# Read Parquet files into DataFrame
df = spark.read.parquet(directory)

# Filter data to include only trips on the 15th of October
df_filtered = df.filter(col("pickup_datetime").between("2019-10-15 00:00:00", "2019-10-15 23:59:59"))

# Count the number of records
trip_count = df_filtered.count()

print(f"Number of taxi trips on the 15th of October: {trip_count}")

Number of taxi trips on the 15th of October: 62295


In [8]:
pickup_dt = '2019-10-15'
from pyspark.sql import functions as F

# Convert pickup_datetime to date
df = df.withColumn('pickup_date', F.to_date(df.pickup_datetime))

# Filter data for the given pickup date and count the number of records
trip_count = df.filter(F.col('pickup_date') == pickup_dt).count()

print(f"Number of taxi trips on {pickup_dt}: {trip_count}")

Number of taxi trips on 2019-10-15: 62295


In [9]:
# using SQL syntax
df.createOrReplaceTempView('fhvhv_tripdata')
pickup_dt = '2019-10-15'

spark.sql(f"""
SELECT
    COUNT(1)
FROM 
    fhvhv_tripdata
WHERE
    to_date(pickup_datetime) = '{pickup_dt}'
""").show()

+--------+
|count(1)|
+--------+
|   62295|
+--------+



In [10]:
spark.sql("""
SELECT
    count(*) AS total_trips
FROM
    fhvhv_tripdata
WHERE pickup_datetime BETWEEN '2019-10-15 00:00:00' AND '2019-10-15 23:59:59'
""").show()

+-----------+
|total_trips|
+-----------+
|      62295|
+-----------+



In [11]:
spark.sql(f"""
SELECT
    *
FROM 
    fhvhv_tripdata
WHERE
    to_date(dropoff_datetime) is not null
""").show()

+-----------------+--------------------+---------------+----------------+------------+------------+-------+----------------------+-----------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|pickup_date|
+-----------------+--------------------+---------------+----------------+------------+------------+-------+----------------------+-----------+
+-----------------+--------------------+---------------+----------------+------------+------------+-------+----------------------+-----------+



In [12]:
df.columns
['hvfhs_license_num',
 'dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag']
df \
    .withColumn('duration', (df.dropoff_datetime.cast('long') - df.pickup_datetime.cast('long'))/( 60 * 60 )) \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .groupBy('pickup_date') \
        .max('duration') \
    .orderBy('max(duration)', ascending=False) \
    .limit(5) \
    .show()

+-----------+-------------+
|pickup_date|max(duration)|
+-----------+-------------+
| 2019-11-01|         NULL|
| 2019-10-05|         NULL|
| 2019-10-24|         NULL|
| 2019-10-01|         NULL|
| 2019-10-22|         NULL|
+-----------+-------------+



In [13]:
spark.sql("""
SELECT
    to_date(pickup_datetime) AS pickup_date,
    MAX((UNIX_TIMESTAMP(COALESCE(dropoff_datetime, '2019-10-15 00:00:00')) - UNIX_TIMESTAMP(pickup_datetime)) / (60 * 60)) AS duration
FROM 
    fhvhv_tripdata
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 5;
""").show()

+-----------+------------------+
|pickup_date|          duration|
+-----------+------------------+
| 2019-10-01| 335.9661111111111|
| 2019-10-02|311.99833333333333|
| 2019-10-03|             288.0|
| 2019-10-04|             264.0|
| 2019-10-05|             240.0|
+-----------+------------------+



In [14]:
schema = types.StructType([
    types.StructField('LocationID', types.IntegerType(), True),
    types.StructField('Borough', types.StringType(), True),
    types.StructField('Zone', types.StringType(), True),
    types.StructField('service_zone', types.StringType(), True)
])
#  Load the dataframe
file_path = 'D:/data_engenering/week4/fhvhv/2019/taxi_zone_lookup.csv'
print(f"Reading - {file_path}")

df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv(file_path)

#  Partition the data frame
folder_path = 'D:/data_engenering/week4/fhvhv/zons'
print(f"Creating partitions - 6 folder {folder_path}")
df.head()
df = df.repartition(6)
df.write.mode('overwrite').parquet(folder_path, compression='gzip')

Reading - D:/data_engenering/week4/fhvhv/2019/taxi_zone_lookup.csv
Creating partitions - 6 folder D:/data_engenering/week4/fhvhv/zons


In [15]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("zones_data") \
    .getOrCreate()

# Path to the directory containing 
directory = 'D:/data_engenering/week4/fhvhv/zons'

# Read Parquet files into DataFrame
df = spark.read.parquet(directory)

In [16]:
df.createOrReplaceTempView('zones_data')

In [17]:
spark.sql("""
SELECT
   Zone
FROM 
    zones_data  
GROUP BY 
    1
""").show()

+--------------------+
|                Zone|
+--------------------+
|Governor's Island...|
|         Westerleigh|
|Charleston/Totten...|
|Heartland Village...|
|       Dyker Heights|
|     Jackson Heights|
|             Bayside|
|      Yorkville West|
|Flushing Meadows-...|
|Riverdale/North R...|
|  Stuyvesant Heights|
|Upper West Side N...|
|Upper East Side N...|
|       Prospect Park|
|       Starrett City|
|Long Island City/...|
|        Bloomingdale|
|        Midtown East|
|Downtown Brooklyn...|
|Saint George/New ...|
+--------------------+
only showing top 20 rows



In [18]:
from pyspark.sql.functions import max

# Group by 'Zone' and calculate the maximum duration for each zone
max_duration_per_zone = df.groupBy('Zone')

# Order by max duration in descending order, limit to top 5, and show the result
#max_duration_per_zone.orderBy('max_duration', ascending=False).limit(5).show()


In [19]:
max_duration_per_zone

GroupedData[grouping expressions: [Zone], value: [LocationID: int, Borough: string ... 2 more fields], type: GroupBy]

In [20]:
spark.sql("""
SELECT
   pul.Zone,
   COUNT(1) as Total
FROM 
    fhvhv_tripdata fhv 
    INNER JOIN zones_data pul ON fhv.PULocationID = pul.LocationID  
WHERE pul.Zone = "East Chelsea"
GROUP BY 
    1
""").show()

+------------+-----+
|        Zone|Total|
+------------+-----+
|East Chelsea| 2391|
+------------+-----+



In [21]:
spark.sql("""
SELECT
   pul.Zone,
   COUNT(1) as Total
FROM 
    fhvhv_tripdata fhv 
    INNER JOIN zones_data pul ON fhv.PULocationID = pul.LocationID  
WHERE pul.Zone = "Jamaica Bay"
GROUP BY 
    1
""").show()

+-----------+-----+
|       Zone|Total|
+-----------+-----+
|Jamaica Bay|   14|
+-----------+-----+



In [22]:
spark.sql("""
SELECT
   pul.Zone,
   COUNT(1) as Total
FROM 
    fhvhv_tripdata fhv 
    INNER JOIN zones_data pul ON fhv.PULocationID = pul.LocationID  
WHERE pul.Zone = "Union Sq"
GROUP BY 
    1
""").show()

+--------+-----+
|    Zone|Total|
+--------+-----+
|Union Sq| 2102|
+--------+-----+



In [23]:
spark.sql("""
SELECT
   pul.Zone,
   COUNT(1) as Total
FROM 
    fhvhv_tripdata fhv 
    INNER JOIN zones_data pul ON fhv.PULocationID = pul.LocationID  
WHERE pul.Zone = "Crown Heights North"
GROUP BY 
    1
""").show()

+-------------------+-----+
|               Zone|Total|
+-------------------+-----+
|Crown Heights North|15701|
+-------------------+-----+

