## Download NY Taxi data from Github

In [10]:
import os
import requests
from tqdm import tqdm

services = ['green','yellow']
init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/'

def dwld_ny_taxi_data(year, service):
    desc = f'Downloading data for {service} service in {year}'
    for i in tqdm(range(12), desc = desc):
        # sets the month part of the file_name string
        month = '0'+str(i+1)
        month = month[-2:]

        # Get full path to download file
        local_prefix = f'data/raw/{service}/{year}/{month}'
        local_file = f'{service}_tripdata_{year}-{month}.csv.gz'
        local_path = f'{local_prefix}/{local_file}'

        # Create local directory if it doesn't exist
        os.makedirs(local_prefix, exist_ok=True)

        # Download file
        request_url = init_url + service + '/' + local_file
        response = requests.get(request_url)
        with open(local_path, 'wb') as f:
            f.write(response.content)

# Download data for all the provided years and services
for year in ['2020', '2021']:
    for service in ['green', 'yellow']:
        dwld_ny_taxi_data(year, service)

Downloading data for green service in 2020: 100%|██████████| 12/12 [00:08<00:00,  1.34it/s]
Downloading data for yellow service in 2020: 100%|██████████| 12/12 [00:32<00:00,  2.73s/it]
Downloading data for green service in 2021: 100%|██████████| 12/12 [00:08<00:00,  1.40it/s]
Downloading data for yellow service in 2021: 100%|██████████| 12/12 [00:15<00:00,  1.32s/it]


## Read csv with PySpark and save as parquet

In [21]:
# Load libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
import pandas as pd

# Create spark session
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('prepare_data') \
    .getOrCreate()

In [22]:
# Define schemas
green_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("lpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("lpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("ehail_fee", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("trip_type", types.IntegerType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

yellow_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("tpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("tpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

In [25]:
# Pass CSV files to a parquet format, including the defined schema
years = ['2020', '2021']
services = ['green', 'yellow']
months = range(1, 13)
for year in years:
    for service in services:
        for month in months:
            print(f'Processing data for {service} service - {year}/{month}')
            # Set paths
            input_path = f'data/raw/{service}/{year}/{month:02d}/'
            output_path = f'data/pq/{service}/{year}/{month:02d}/'
            # Get correct schema
            schema = green_schema if service == 'green' else yellow_schema
            try:
                # Read csv file
                df = spark.read \
                    .option("header", "true") \
                    .schema(schema) \
                    .csv(input_path)
                # Save as parquet
                df.repartition(4).write.parquet(output_path)
            except Exception as e:
                print(f'Error processing {input_path} - {type(e).__name__}')
                continue

Processing data for yellow service - 2021/1
Processing data for yellow service - 2021/2
Processing data for yellow service - 2021/3
Processing data for yellow service - 2021/4
Processing data for yellow service - 2021/5
Processing data for yellow service - 2021/6
Processing data for yellow service - 2021/7
Processing data for yellow service - 2021/8
Error processing data/raw/yellow/2021/08/ - Py4JJavaError
Processing data for yellow service - 2021/9
Error processing data/raw/yellow/2021/09/ - Py4JJavaError
Processing data for yellow service - 2021/10
Error processing data/raw/yellow/2021/10/ - Py4JJavaError
Processing data for yellow service - 2021/11
Error processing data/raw/yellow/2021/11/ - Py4JJavaError
Processing data for yellow service - 2021/12
Error processing data/raw/yellow/2021/12/ - Py4JJavaError


In [26]:
spark.stop()