In [4]:
# Clean the data (Silver layer)
from pyspark.sql.functions import col, to_timestamp

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("Files/bronze/yellow_tripdata*.csv")

df_clean = df \
            .filter(col("trip_distance")>0)\
            .filter(col("passenger_count")>0)\
            .filter(col("fare_amount")>0)\
            .withColumn("pickup_datetime", to_timestamp(col("tpep_pickup_datetime"))) \
            .withColumn("dropoff_datetime", to_timestamp(col("tpep_dropoff_datetime")))\
            .drop("tpep_pickup_datetime", "tpep_dropoff_datetime") 

print(f"Rows after cleaning: {df_clean.count():,}")   
display(df_clean.limit(10))

# Save as managed Delta table (Silver)
df_clean.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("silver_clean_trips")
print("Silver layer saved as managed table!")

StatementMeta(, 830f6444-e1c8-48aa-9deb-c46c0415d175, 6, Finished, Available, Finished)

Rows after cleaning: 942,749


SynapseWidget(Synapse.DataFrame, 37c8e4bd-e365-424a-8d6b-99a0b5a36155)

Silver layer saved as managed table!


In [5]:
from pyspark.sql.functions import min, max, countDistinct

silver = spark.table("silver_clean_trips")

silver.agg(
    min("pickup_datetime").alias("earliest_date"),
    max("pickup_datetime").alias("latest_date"),
    countDistinct("pickup_datetime").alias("distinct_dates")
).show(truncate=False)

# Also show distinct months present
silver.selectExpr("month(pickup_datetime) as month").distinct().orderBy("month").show()

StatementMeta(, 830f6444-e1c8-48aa-9deb-c46c0415d175, 7, Finished, Available, Finished)

+-------------------+-------------------+--------------+
|earliest_date      |latest_date        |distinct_dates|
+-------------------+-------------------+--------------+
|2008-12-31 23:05:47|2020-11-01 15:41:04|854066        |
+-------------------+-------------------+--------------+

+-----+
|month|
+-----+
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
|   11|
|   12|
+-----+

