In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

This notebook is to read local csv and write i in partitioned parquet format locally

In [2]:
bucket_name = 'dezoomcamp2024_project'
SPARK_SESSION = (
    SparkSession.builder.appName("App_to_write_GCS")
    .config(
        "spark.jars.packages",
        "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.15.1-beta,com.google.cloud.bigdataoss:gcs-connector:hadoop2-2.1.6",
    )
    .config(
        "spark.jars",
        "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar",
    )
    .getOrCreate()
)

SPARK_SESSION._jsc.hadoopConfiguration().set(
    "fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"
)
SPARK_SESSION.conf.set("temporaryGcsBucket", bucket_name)

:: loading settings :: url = jar:file:/home/abyssde232024/spark/spark-3.3.2-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/abyssde232024/.ivy2/cache
The jars for the packages stored in: /home/abyssde232024/.ivy2/jars
com.google.cloud.spark#spark-bigquery-with-dependencies_2.12 added as a dependency
com.google.cloud.bigdataoss#gcs-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a51be35b-1187-423d-8c26-07a93e431f0c;1.0
	confs: [default]
	found com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.15.1-beta in central
	found com.google.cloud.bigdataoss#gcs-connector;hadoop2-2.1.6 in central
	found com.google.api-client#google-api-client-java6;1.30.10 in central
	found com.google.api-client#google-api-client;1.30.10 in central
	found com.google.oauth-client#google-oauth-client;1.31.2 in central
	found com.google.http-client#google-http-client;1.38.0 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.

24/04/13 02:46:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType, DoubleType, IntegerType

In [4]:
green_schema = StructType([
    StructField("VendorID", IntegerType(), nullable=True),
    StructField("lpep_pickup_datetime", TimestampType(), nullable=True),
    StructField("lpep_dropoff_datetime", TimestampType(), nullable=True),
    StructField("store_and_fwd_flag", StringType(), nullable=True),
    StructField("RatecodeID", IntegerType(), nullable=True),
    StructField("PULocationID", IntegerType(), nullable=True),
    StructField("DOLocationID", IntegerType(), nullable=True),
    StructField("passenger_count", IntegerType(), nullable=True),
    StructField("trip_distance", DoubleType(), nullable=True),
    StructField("fare_amount", DoubleType(), nullable=True),
    StructField("extra", DoubleType(), nullable=True),
    StructField("mta_tax", DoubleType(), nullable=True),
    StructField("tip_amount", DoubleType(), nullable=True),
    StructField("tolls_amount", DoubleType(), nullable=True),
    StructField("ehail_fee", DoubleType(), nullable=True),
    StructField("improvement_surcharge", DoubleType(), nullable=True),
    StructField("total_amount", DoubleType(), nullable=True),
    StructField("payment_type", IntegerType(), nullable=True),
    StructField("trip_type", IntegerType(), nullable=True),
    StructField("congestion_surcharge", DoubleType(), nullable=True)
])
yellow_schema = StructType([
    StructField("VendorID", IntegerType(), nullable=True),
    StructField("tpep_pickup_datetime", TimestampType(), nullable=True),
    StructField("tpep_dropoff_datetime", TimestampType(), nullable=True),
    StructField("passenger_count", IntegerType(), nullable=True),
    StructField("trip_distance", DoubleType(), nullable=True),
    StructField("RatecodeID", IntegerType(), nullable=True),
    StructField("store_and_fwd_flag", StringType(), nullable=True),
    StructField("PULocationID", IntegerType(), nullable=True),
    StructField("DOLocationID", IntegerType(), nullable=True),
    StructField("payment_type", IntegerType(), nullable=True),
    StructField("fare_amount", DoubleType(), nullable=True),
    StructField("extra", DoubleType(), nullable=True),
    StructField("mta_tax", DoubleType(), nullable=True),
    StructField("tip_amount", DoubleType(), nullable=True),
    StructField("tolls_amount", DoubleType(), nullable=True),
    StructField("improvement_surcharge", DoubleType(), nullable=True),
    StructField("total_amount", DoubleType(), nullable=True),
    StructField("congestion_surcharge", DoubleType(), nullable=True)
])

In [5]:
import pandas as pd

In [7]:
year=2020

for month in range(1, 13):
    if month < 10:
        month = f'0{month}'
    try:
        df_green = SPARK_SESSION.read \
            .option("header", "true") \
            .schema(green_schema) \
            .csv(f'data/raw/green/{year}/{month}')
    except Exception as e:
        print(f"Green Error is: {e}")
    else:
        df_green.repartition(4).write.mode('overwrite').parquet(f'gs://dezoomcamp2024_project/pq/green/{year}/{month}')

    try:
        df_yellow = SPARK_SESSION.read \
            .option("header", "true") \
            .schema(yellow_schema) \
            .csv(f'data/raw/yellow/{year}/{month}')
    except Exception as e:
        print(f"Yellow Error is: {e}")
    else:
        df_yellow.repartition(4).write.mode('overwrite').parquet(f'gs://dezoomcamp2024_project/pq/yellow/{year}/{month}')

                                                                                

In [8]:
df_green = SPARK_SESSION.read.parquet("gs://dezoomcamp2024_project/processed/green/2020/*")

In [9]:
df_green.show(5)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2020-01-23 13:10:15|  2020-01-23 13:38:16|                 N|         1|          74|         130|              1|        12.77|       36.0|  0.0|    0.

                                                                                

In [53]:
SPARK_SESSION.stop()