In [0]:
import os

# Authentication setup for the storage account used throughout this notebook.
# The key is read from the AZURE_STORAGE_KEY environment variable; do not paste the
# key itself into a notebook cell.
# TODO: switch to Unity Catalog instead of an account key.
if not os.environ.get("AZURE_STORAGE_KEY"):
    # Stop here with an actionable message instead of a bare KeyError (variable missing)
    # or a silently accepted empty key (variable present but blank).
    raise RuntimeError(
        "AZURE_STORAGE_KEY is not set (or is empty); set it before running this notebook."
    )

# Register the account key for the storage account's dfs endpoint.
spark.conf.set(
    "fs.azure.account.key.ughnovgs.dfs.core.windows.net",
    os.environ["AZURE_STORAGE_KEY"],
)

In [0]:
# Storage settings: edit these two values to point at a different account/container.
storage_account_name = "ughnovgs"
container_name = "raw"

# List whatever sits at the root of the container.
containerRootPath = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/"
containerListing = dbutils.fs.ls(containerRootPath)
display(containerListing)

In [0]:
# Location of the yellow taxi CSV. The path is assembled from `container_name` and
# `storage_account_name` (set in the storage settings cell) so that editing those two
# values is enough to point the notebook at another account or container.
yellowTaxiFilePath = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/YellowTaxis_202501.csv"
)

# Read the CSV: the first line is treated as the header and column types are inferred.
yellowTaxiDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(yellowTaxiFilePath)
)

# Show the inferred schema, then the data itself.
yellowTaxiDF.printSchema()

display(yellowTaxiDF)

Databricks data profile. Run in Databricks to view.

In [0]:
# Location of the payment types JSON file, assembled from `container_name` and
# `storage_account_name` (set in the storage settings cell) instead of repeating
# the account and container as literals.
json_example = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/PaymentTypes.json"
)

# Read the JSON file with default reader options and show the result.
paymentTypes = spark.read.json(json_example)
display(paymentTypes)

In [0]:
# Summary statistics for the passenger count and trip distance columns.
describedColumns = ["passenger_count", "trip_distance"]
pc = yellowTaxiDF.describe(describedColumns)
display(pc)

In [0]:
# `col` is imported in this cell because no earlier cell imports it; without this the
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
from pyspark.sql.functions import col

# Remove zeroes: keep only rows whose passenger count and trip distance are both above 0.
print(f"Before: {yellowTaxiDF.count()}")
yellowTaxiDfRemoveZeroes = yellowTaxiDF.where(
    (col("passenger_count") > 0) & (col("trip_distance") > 0)
)
print(f"After: {yellowTaxiDfRemoveZeroes.count()}")

In [0]:
from pyspark.sql.functions import col

# Remove nulls: drop the rows in which every column is null (how='all'),
# printing the row count before and after.
print("Before: " + str(yellowTaxiDF.count()))
yellowTaxiDfDropNulls = yellowTaxiDF.dropna(how='all')
print("After: " + str(yellowTaxiDfDropNulls.count()))

In [0]:
# Replace nulls
# `na.fill` expects the replacement value(s). 'all' is a row-selection mode of `na.drop`;
# handed to `na.fill` it is treated as a replacement string and ends up as the literal
# text "all" in null string cells. An explicit column -> default map is used instead.
# NOTE(review): using 0 for missing pick-up/drop-off location IDs is an assumption;
# confirm the columns and default values that are actually wanted here.
defaultValueMap = {"PULocationID": 0, "DOLocationID": 0}

# Filling values does not add or remove rows, so both counts are expected to match.
print(f"Before: {yellowTaxiDF.count()}")
yellowTaxidFReplaceNulls = yellowTaxiDF.na.fill(defaultValueMap)
print(f"After: {yellowTaxidFReplaceNulls.count()}")

In [0]:
# Drop duplicates: remove repeated rows and report the row count on either side.
duplicateCheckRowsBefore = yellowTaxiDF.count()
print(f"Before: {duplicateCheckRowsBefore}")

yellowTaxiDfNoDuplicates = yellowTaxiDF.dropDuplicates()

duplicateCheckRowsAfter = yellowTaxiDfNoDuplicates.count()
print(f"After: {duplicateCheckRowsAfter}")

In [0]:
from pyspark.sql.types import IntegerType

# Keep only the columns needed further on. The passenger count is cast to an integer
# and the trip distance is exposed under the name TripDistance.
selectedColumns = [
    col('VendorID'),
    col('passenger_count').cast(IntegerType()),
    col('trip_distance').alias('TripDistance'),
    col('tpep_dropoff_datetime'),
    col('tpep_pickup_datetime'),
    col('RatecodeID'),
    col('PULocationID'),
    col('DOLocationID'),
    col('payment_type'),
    col('fare_amount'),
]

yellowTaxiFiltered = yellowTaxiDF.select(*selectedColumns)

# Show the resulting schema and data.
yellowTaxiFiltered.printSchema()
display(yellowTaxiFiltered)

In [0]:
# Rename columns: original column name -> new name, applied in the order listed.
columnRenames = {
    "tpep_pickup_datetime": "PickupTime",
    "tpep_dropoff_datetime": "DropoffTime",
    "PULocationID": "PickupLocationID",
    "DOLocationID": "DropoffLocationID",
    "total_amount": "TotalAmount",
    "payment_type": "PaymentType",
}

# Always start again from the frame as read, so re-running the cell gives the same result.
yellowTaxiRenamed = yellowTaxiDF
for originalName, renamedTo in columnRenames.items():
    yellowTaxiRenamed = yellowTaxiRenamed.withColumnRenamed(originalName, renamedTo)


In [0]:
from pyspark.sql.functions import year, month, dayofmonth, col, expr
# Derive columns: year, month and day of month taken from the pick-up time,
# appended after the existing columns in that order.
pickupTimeColumn = col("PickupTime")

yellowTaxiDerivedFields = (
    yellowTaxiRenamed
        .withColumn("TripYear", year(pickupTimeColumn))
        .withColumn("TripMonth", month(pickupTimeColumn))
        .withColumn("TripDay", dayofmonth(pickupTimeColumn))
)

display(yellowTaxiDerivedFields)
                                                    

In [0]:
# Spark's `round` is reached through the `F` namespace instead of being imported by name:
# importing it directly would replace Python's built-in `round` for every later cell.
from pyspark.sql import functions as F
from pyspark.sql.functions import unix_timestamp, col

# Add calculated field for trip time in minutes

# Trip duration in seconds: drop-off minus pick-up, both converted to Unix timestamps.
tripTimeInSecondsExpr = unix_timestamp(col("DropoffTime")) - unix_timestamp(col("PickupTime"))

# Seconds -> minutes, rounded with Spark's default of 0 decimal places.
tripTimeInMinutesExpr = F.round(tripTimeInSecondsExpr / 60)

yellowTaxiTime = (
    yellowTaxiDerivedFields
        .withColumn("TripTimeInMinutes", tripTimeInMinutesExpr)
)
display(yellowTaxiTime)

In [0]:
from pyspark.sql.functions import when

# Create derived column - TripType
# Rows with RatecodeID equal to 6 are labelled "SharedTrip"; everything else "SoloTrip".
isSharedTripCondition = col("RatecodeID") == 6

tripTypeColumn = when(isSharedTripCondition, "SharedTrip").otherwise("SoloTrip")

yellowTaxiTripType = yellowTaxiTime.withColumn("TripType", tripTypeColumn)
display(yellowTaxiTripType)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Explicit schema for the taxi bases JSON file. Every field is nullable.

# Nested "Address" object: all parts are strings.
addressSchema = StructType([
    StructField(partName, StringType(), True)
    for partName in ("Building", "Street", "City", "State", "Postcode")
])

# Nested "GeoLocation" object: all parts are strings.
geoLocationSchema = StructType([
    StructField(partName, StringType(), True)
    for partName in ("Latitude", "Longitude", "Location")
])

# Top-level record: five scalar fields followed by the two nested objects.
taxiBasesSchema = StructType([
    StructField("License Number", StringType(), True),
    StructField("Entity Name", StringType(), True),
    StructField("Telephone Number", LongType(), True),
    StructField("SHL Endorsed", StringType(), True),
    StructField("Type of Base", StringType(), True),
    StructField("Address", addressSchema, True),
    StructField("GeoLocation", geoLocationSchema, True),
])

# Read JSON file using the defined schema
# The path is assembled from `container_name` and `storage_account_name` (set in the
# storage settings cell) instead of repeating the account and container as literals.
taxiBasesFilePath = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/TaxiBases.json"
)

taxiBasesDF = (
    spark.read
        .option("multiline", "true")    # read the file in multi-line JSON mode
        .schema(taxiBasesSchema)        # explicit schema, no inference
        .json(taxiBasesFilePath)
)

display(taxiBasesDF)


In [0]:
from pyspark.sql.functions import col

# Extract nested fields from JSON
# Each pair is (field in taxiBasesDF, name of the flat output column); nested fields
# are addressed with dot notation.
flattenedFields = [
    ("License Number", "BaseLicenseNumber"),
    ("Entity Name", "EntityName"),
    ("Address.Building", "AddressBuilding"),
    ("Address.Street", "AddressStreet"),
    ("Address.City", "AddressCity"),
    ("Address.State", "AddressState"),
    ("Address.Postcode", "AddressPostCode"),
    ("GeoLocation.Latitude", "GeoLatitude"),
    ("GeoLocation.Longitude", "GeoLongitude"),
]

taxiBasesFlatDF = taxiBasesDF.select(
    *[col(sourceField).alias(flatName) for sourceField, flatName in flattenedFields]
)

display(taxiBasesFlatDF)


In [0]:
# Spark's `sum` is reached through the `F` namespace instead of being imported by name:
# importing it directly would replace Python's built-in `sum` for every later cell.
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, col

# group by and aggregate
# Per pick-up/drop-off location pair: average trip time and total amount, ordered by
# pick-up location descending. The column is spelled "PickupLocationID", exactly as the
# rename cell created it, so the cell does not rely on case-insensitive name matching.
yellowTaxiDFReport = (
    yellowTaxiTime
        .groupBy("PickupLocationID", "DropoffLocationID")
        .agg(
            avg("TripTimeInMinutes").alias("AvgTripTime"),
            F.sum("TotalAmount").alias("SumAmount"),
        )
        .orderBy(col("PickupLocationID").desc())
)

display(yellowTaxiDFReport)

In [0]:
# Destination of the Parquet output.
yellowTaxisParquetOutputPath = "abfss://raw@ughnovgs.dfs.core.windows.net/Output/YellowTaxis.parquet"

# Write output in parquet format, partitioned by VendorID.
# Save mode is "overwrite" (alternatives: append, errorifexists, ignore);
# the format could equally be csv, json, avro, jdbc, etc.
parquetWriter = (
    yellowTaxiDF.write
        .format("parquet")
        .mode("overwrite")
        .partitionBy("VendorID")
)
parquetWriter.save(yellowTaxisParquetOutputPath)

In [0]:
# Destination of the Delta output. It gets its own variable so that
# `yellowTaxisParquetOutputPath` keeps pointing at the Parquet location instead of
# being overwritten with a ".delta" path.
yellowTaxisDeltaOutputPath = "abfss://raw@ughnovgs.dfs.core.windows.net/Output/YellowTaxis.delta"

# Write output in delta format
# NOTE(review): this writes `yellowTaxiDF` (the frame as read from the CSV), not one of
# the cleaned/derived frames; confirm that is the intended output.
(
    yellowTaxiDF
        .write
        .mode("overwrite")                                  # Other modes: append, errorifexists, ignore
        .partitionBy("VendorID")
        .format("delta")                                    # Other formats: parquet, csv, json, avro, jdbc, etc.
        .save(yellowTaxisDeltaOutputPath)
)