In [6]:
# Quality Checks Notebook - Project 2: NYC Taxi Analytics (Assertions Only)
from pyspark.sql.functions import avg, col, count, month, when, year

print("=== Starting Quality Checks for All Layers ===\n")

# 1. Bronze layer (raw) - basic existence check
# NOTE: inferSchema makes Spark take an extra pass over the CSV files to derive
# column types. It is kept so `bronze` has the same schema as before for any
# later cell that reuses it.
bronze = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Files/bronze/*.csv")
)

# take(1) returns a list with at most one row and stops as soon as a row is
# found; count() would scan every raw CSV file only to compare the total to 0.
assert bronze.take(1), "Bronze layer is empty - no data loaded!"
print("Bronze layer: Data exists ✓")

# 2. Silver layer - existence, row count, and basic data quality
silver = spark.table("silver_clean_trips")

# All metrics are computed in a single aggregation so the table is scanned once
# instead of once per assertion. count(when(cond, 1)) counts only the rows where
# cond is true, which matches filter(cond).count(): rows where cond evaluates to
# NULL are excluded in both forms.
silver_stats = silver.agg(
    count("*").alias("total_rows"),
    count(when(col("fare_amount") < 0, 1)).alias("negative_fares"),
    count(when(col("trip_distance") < 0, 1)).alias("negative_distances"),
    count(when(col("pickup_datetime").isNull(), 1)).alias("null_pickups"),
).first()

# Assertions run in the original order so the first failure reported is unchanged.
assert silver_stats["total_rows"] > 0, "Silver layer is empty - cleaning failed!"
assert silver_stats["negative_fares"] == 0, "Negative fares found in Silver!"
assert silver_stats["negative_distances"] == 0, "Negative trip distances found in Silver!"
assert silver_stats["null_pickups"] == 0, "Null timestamps in Silver!"

print("Silver layer: All basic assertions passed ✓")

# 3. Gold tables - existence and row count checks
# Every table is loaded before any assertion runs, in the same order as before.
gold_daily = spark.table("nyc_daily_revenue")
gold_monthly = spark.table("nyc_monthly_distance")
gold_payment = spark.table("nyc_payment_summary")
# Drop this load and its entry below if this table is not part of your pipeline.
gold_hourly = spark.table("nyc_hourly_zone_insights")

# Each Gold DataFrame is paired with the message raised when it has no rows.
for _gold_df, _empty_message in (
    (gold_daily, "Daily revenue Gold table is empty!"),
    (gold_monthly, "Monthly distance Gold table is empty!"),
    (gold_payment, "Payment summary Gold table is empty!"),
    (gold_hourly, "Hourly zone insights Gold table is empty!"),
):
    assert _gold_df.count() > 0, _empty_message

print("All Gold tables exist and have data ✓")

# Optional: more specific business assertions can be added here. Use Spark's
# aggregate sum (e.g. via `from pyspark.sql import functions as F`): a bare
# `sum` is Python's built-in unless an earlier cell imported Spark's version.
# assert gold_daily.agg(F.sum("total_revenue")).collect()[0][0] > 0, "Total revenue is zero or negative!"

print("\nALL QUALITY CHECKS PASSED SUCCESSFULLY! ✓")
print("Project 2 core pipeline is clean, reliable, and ready for production use.")

StatementMeta(, 30646abf-b935-4e09-9ebe-08e756a7f30b, 8, Finished, Available, Finished)

=== Starting Quality Checks for All Layers ===

Bronze layer: Data exists ✓
Silver layer: All basic assertions passed ✓
All Gold tables exist and have data ✓

ALL QUALITY CHECKS PASSED SUCCESSFULLY! ✓
Project 2 core pipeline is clean, reliable, and ready for production use.
