# Import Libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Accessing Data from the Data Lake

In [0]:
# Configure OAuth (service-principal / client-credentials) access to the
# ADLS Gen2 account for this Spark session.
#
# SECURITY: the client secret must never be committed in a notebook. It is read
# from a Databricks secret scope instead. The scope/key names below are
# placeholders -- create them (or adjust the names) before running. The secret
# that was previously hard-coded here should be rotated, since it has been
# exposed in source control.
_storage_host = "nyctaxiprojlake.dfs.core.windows.net"
_oauth_conf = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": "6b4fc492-2553-4401-a2fd-e82a226d8c5e",
    "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="nyctaxi", key="sp-client-secret"),
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/c5642e7e-4b66-4a1e-b222-e3c95e2aa7be/oauth2/token",
}

# Each setting is scoped to the storage account by suffixing its host name.
for _conf_key, _conf_value in _oauth_conf.items():
    spark.conf.set(f"{_conf_key}.{_storage_host}", _conf_value)

In [0]:
# Smoke test for the OAuth configuration: list the root of the bronze container.
bronze_root = "abfss://bronze@nyctaxiprojlake.dfs.core.windows.net/"
dbutils.fs.ls(bronze_root)

# Read CSV files

## Trip Type Data

In [0]:
# Load the trip-type CSV data from the bronze container. The first row is
# used as the header and column types are inferred by Spark.
trip_type_path = 'abfss://bronze@nyctaxiprojlake.dfs.core.windows.net/trip_type/'
df_trip_type = (
    spark.read.format('csv')
    .options(inferSchema=True, header=True)
    .load(trip_type_path)
)

In [0]:
# Preview the raw trip-type data as loaded from bronze.
display(df_trip_type)

In [0]:
# Load the trip-zone CSV data from the bronze container. The first row is
# used as the header and column types are inferred by Spark.
trip_zone_path = 'abfss://bronze@nyctaxiprojlake.dfs.core.windows.net/trip_zone'
df_trip_zone = (
    spark.read.format('csv')
    .options(inferSchema=True, header=True)
    .load(trip_zone_path)
)

In [0]:
# Preview the raw trip-zone data as loaded from bronze.
display(df_trip_zone)

# Trip Data

## DDL Schema

In [0]:
# DDL-formatted schema string (column name + Spark SQL type per line) that is
# passed to `.schema(...)` when reading the trip Parquet files, so the reader
# uses these declared types instead of deriving them from the files.
# NOTE(review): the declared types need to be compatible with the physical
# types stored in the Parquet files -- confirm against the bronze data.
myschema = '''
                VendorID BIGINT,
                lpep_pickup_datetime TIMESTAMP,
                lpep_dropoff_datetime TIMESTAMP,
                store_and_fwd_flag STRING,
                RatecodeID BIGINT,
                PULocationID BIGINT,
                DOLocationID BIGINT,
                passenger_count BIGINT,
                trip_distance DOUBLE,
                fare_amount DOUBLE,
                extra DOUBLE,
                mta_tax DOUBLE,
                tip_amount DOUBLE,
                tolls_amount DOUBLE,
                ehail_fee DOUBLE,
                improvement_surcharge DOUBLE,
                total_amount DOUBLE,
                payment_type BIGINT,
                trip_type BIGINT,
                congestion_surcharge DOUBLE

      '''

In [0]:
# Read every trip Parquet file under the bronze `trips2023data/` folder using
# the explicit `myschema` definition.
# - `recursiveFileLookup` makes Spark descend into nested sub-folders.
# - The former `header` option was removed: it is a CSV reader option and has
#   no meaning for the Parquet source, so it only misled readers.
df_trip = (
    spark.read.format('parquet')
    .schema(myschema)
    .option('recursiveFileLookup', True)
    .load('abfss://bronze@nyctaxiprojlake.dfs.core.windows.net/trips2023data/')
)

In [0]:
# Preview the raw trip data as loaded from bronze.
display(df_trip)

# Data Transformation

## Trip Type

In [0]:
# Show the trip-type data again as the starting point for its transformation.
display(df_trip_type)

In [0]:
# Rename the `Description` column to `Trip_Description`, then show the result.
df_trip_type = df_trip_type.withColumnRenamed(existing="Description", new="Trip_Description")
display(df_trip_type)

In [0]:
# Persist the transformed trip-type data to the silver container as Parquet.
# Save mode is "overwrite" rather than "append": this notebook re-reads the
# full bronze data on every run, so appending would duplicate every row in
# silver each time the notebook is re-executed.
(
    df_trip_type.write
    .format('parquet')
    .mode('overwrite')
    .option("path", "abfss://silver@nyctaxiprojlake.dfs.core.windows.net/trip_type")
    .save()
)

## Trip Zone

In [0]:
# Show the trip-zone data again as the starting point for its transformation.
display(df_trip_zone)

In [0]:
# Split `Zone` on "/" and keep the first two pieces as `Zone1` and `Zone2`.
# NOTE(review): for rows whose `Zone` has no "/", `Zone2` is presumably null
# under default (non-ANSI) settings -- verify on the cluster configuration.
zone_parts = split(col("Zone"), '/')
df_trip_zone = (
    df_trip_zone
    .withColumn("Zone1", zone_parts.getItem(0))
    .withColumn("Zone2", zone_parts.getItem(1))
)

display(df_trip_zone)

In [0]:
# Persist the transformed trip-zone data to the silver container as Parquet.
# Save mode is "overwrite" rather than "append": this notebook re-reads the
# full bronze data on every run, so appending would duplicate every row in
# silver each time the notebook is re-executed.
(
    df_trip_zone.write
    .format('parquet')
    .mode('overwrite')
    .option('path', 'abfss://silver@nyctaxiprojlake.dfs.core.windows.net/trip_zone')
    .save()
)

## Trip Data

In [0]:
# Show the trip data again as the starting point for its transformation.
display(df_trip)

In [0]:
# Derive calendar columns (date, year, month) from the pickup timestamp.
pickup_ts = col('lpep_pickup_datetime')
df_trip = (
    df_trip
    .withColumn('trip_date', to_date(pickup_ts))
    .withColumn('trip_year', year(pickup_ts))
    .withColumn('trip_month', month(pickup_ts))
)

In [0]:
# Inspect the trip data with the derived date columns added.
display(df_trip)

In [0]:
# Narrow the trip data to the columns that are written to the silver layer.
# NOTE(review): this projection does not include `trip_date`, `trip_year` or
# `trip_month`, so the columns derived earlier in this section are dropped
# here -- confirm whether they were meant to be kept.
silver_trip_columns = ['VendorID', 'PULocationID', 'DOLocationID', 'fare_amount', 'total_amount']
df_trip = df_trip.select(*silver_trip_columns)
display(df_trip)

In [0]:
# Persist the projected trip data to the silver container as Parquet.
# Save mode is "overwrite" rather than "append": this notebook re-reads the
# whole bronze `trips2023data/` folder on every run, so appending would
# duplicate the entire dataset in silver each time the notebook is re-executed.
(
    df_trip.write
    .format('parquet')
    .mode('overwrite')
    .option('path', 'abfss://silver@nyctaxiprojlake.dfs.core.windows.net/trips2023data')
    .save()
)

# Analysis

In [0]:
# Render the final (projected) trip data for ad-hoc analysis.
df_trip.display()