In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 00 - Setup and Configuration (Using Real Data)
# MAGIC 
# MAGIC This notebook sets up the environment for the NYC Green Cabs Star Schema pipeline and loads your real dataset.
# MAGIC 
# MAGIC **Features:**
# MAGIC - Unity Catalog configuration
# MAGIC - Schema creation (volume creation is currently commented out)
# MAGIC - Load and validate real Green Taxi data
# MAGIC - Performance optimization settings
# MAGIC 
# MAGIC **Author:** Data Engineering Team  
# MAGIC **Last Updated:** December 2024

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Configuration Parameters

# COMMAND ----------

# Unity Catalog Configuration
CATALOG = "na-dbxtraining"  # Change to your catalog name
SCHEMA = "biju_raw"
VOLUME = "raw_data"  # only used as a folder name inside CHECKPOINT_LOCATION below

# Table Names
BRONZE_TABLE = "bronze_green_trips"
FACT_TABLE = "fact_green_trips"
DIM_VENDOR = "dim_vendor"
DIM_RATECODE = "dim_ratecode"
DIM_PAYMENT_TYPE = "dim_payment_type"
DIM_TRIP_TYPE = "dim_trip_type"
DIM_DATE = "dim_date"
DIM_LOCATION = "dim_location"

# Storage Paths
# Both paths are derived from CATALOG / SCHEMA so that changing the values
# above is enough to retarget the notebook; previously the catalog and schema
# were hard-coded again inside each path string.
# NOTE(review): the volume on disk is "biju_vol", not VOLUME — confirm whether
# VOLUME should be renamed or the checkpoint folder layout simplified.
VOLUME_ROOT = f"/Volumes/{CATALOG}/{SCHEMA}/biju_vol"
CHECKPOINT_LOCATION = f"{VOLUME_ROOT}/{CATALOG}/{SCHEMA}/{VOLUME}/checkpoints/"
# No trailing slash: callers must add their own separator when appending.
SOURCE_DATA_PATH = f"{VOLUME_ROOT}/greencabs/raw_data"

# Liquid Clustering Configuration
CLUSTER_COLUMNS = ["pickup_date", "vendor_id", "payment_type_id"]

# Broadcast Join Threshold in bytes (10MB here, increase if needed)
BROADCAST_THRESHOLD = 10 * 1024 * 1024  # 10MB

print("‚úÖ Configuration loaded successfully!")
print(f"üìç Catalog: {CATALOG}")
print(f"üìç Schema: {SCHEMA}")
print(f"üìç Source Path: {SOURCE_DATA_PATH}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Create Schema (Volume Creation Disabled)

# COMMAND ----------

# Catalog creation is left disabled (requires appropriate permissions):
# spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")

# Ensure the target schema exists. The catalog name is wrapped in backticks
# in the generated SQL; the schema name is not.
create_schema_sql = f"CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.{SCHEMA}"
spark.sql(create_schema_sql)

# Volume creation is currently disabled; the statement is kept for reference.
#spark.sql(f"""
#CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.{VOLUME}
#COMMENT 'Volume for NYC Green Taxi raw data and checkpoints'
##""")

#print(f"‚úÖ Schema created: {CATALOG}.{SCHEMA}")
#print(f"‚úÖ Volume created: {CATALOG}.{SCHEMA}.{VOLUME}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Set Spark Configurations

# COMMAND ----------

# Session-level Spark settings, applied one by one in the order listed.
spark_settings = [
    # Broadcast join threshold (for dimension tables)
    ("spark.sql.autoBroadcastJoinThreshold", BROADCAST_THRESHOLD),
    # Adaptive Query Execution
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
    # Liquid Clustering support
    ("spark.databricks.delta.clusteredTable.enableClusteringTablePreview", "true"),
    # Auto Optimize for Delta tables
    ("spark.databricks.delta.optimizeWrite.enabled", "true"),
    ("spark.databricks.delta.autoCompact.enabled", "true"),
    # Display settings
    ("spark.sql.repl.eagerEval.enabled", "true"),
]

for conf_key, conf_value in spark_settings:
    spark.conf.set(conf_key, conf_value)

# Threshold is stored in bytes; report it in megabytes.
broadcast_mb = BROADCAST_THRESHOLD / (1024*1024)

print("‚úÖ Spark configurations set successfully!")
print(f"   - Broadcast threshold: {broadcast_mb} MB")
print("   - Adaptive Query Execution: Enabled")
print("   - Liquid Clustering: Enabled")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Create Source Data Directory

# COMMAND ----------

# Create the source-data and checkpoint directories.
# Each directory is attempted independently: a failure on one is reported
# and does not prevent the attempt on the other.
directories_to_create = [
    ("Source data", SOURCE_DATA_PATH),
    ("Checkpoint", CHECKPOINT_LOCATION),
]

for dir_label, dir_path in directories_to_create:
    try:
        dbutils.fs.mkdirs(dir_path)
    except Exception as mkdir_error:
        # Any error is only reported, never re-raised.
        print(f"‚ÑπÔ∏è  Directory may already exist: {mkdir_error}")
    else:
        print(f"‚úÖ {dir_label} directory created: {dir_path}")



In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Load Your Real Green Taxi Data
# MAGIC 
# MAGIC This cell loads the uploaded CSV file and prepares it for the pipeline.

# COMMAND ----------

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Location of the uploaded CSV (adjust if needed)
UPLOADED_FILE = "/Volumes/na-dbxtraining/biju_raw/biju_vol/greencabs/raw_data/2023_Green_Taxi_Trip_Data-small.csv"

print("üì• Loading real Green Taxi data...")

# No explicit schema is supplied: the first row is treated as the header and
# column types are inferred by Spark. Types are cast explicitly in the
# cleaning step that follows.
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
raw_df = csv_reader.csv(UPLOADED_FILE)

raw_record_count = raw_df.count()
print(f"‚úÖ Loaded {raw_record_count} records from uploaded file")
print("\nüìä Sample of raw data:")
display(raw_df.limit(5))

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Clean and Transform Data
# MAGIC 
# MAGIC Parse dates correctly and handle data quality issues.

# COMMAND ----------

# Clean the raw frame: parse timestamps, normalise ehail_fee, enforce types.

# Pattern used for both pickup and dropoff timestamps.
TRIP_TIMESTAMP_FORMAT = "M/d/yy H:mm"

# Target type for every column that only needs a cast, in application order.
COLUMN_CASTS = [
    ("VendorID", "int"),
    ("RatecodeID", "int"),
    ("PULocationID", "int"),
    ("DOLocationID", "int"),
    ("passenger_count", "int"),
    ("trip_distance", "double"),
    ("fare_amount", "double"),
    ("extra", "double"),
    ("mta_tax", "double"),
    ("tip_amount", "double"),
    ("tolls_amount", "double"),
    ("improvement_surcharge", "double"),
    ("total_amount", "double"),
    ("payment_type", "int"),
    ("trip_type", "int"),
    ("congestion_surcharge", "double"),
]

staged_df = raw_df

# Step 1: parse the pickup/dropoff columns into timestamps.
for ts_column in ("lpep_pickup_datetime", "lpep_dropoff_datetime"):
    staged_df = staged_df.withColumn(
        ts_column,
        F.to_timestamp(F.col(ts_column), TRIP_TIMESTAMP_FORMAT),
    )

# Step 2: ehail_fee — null or empty string becomes null, anything else is
# cast to double.
ehail_fee = F.col("ehail_fee")
ehail_is_missing = ehail_fee.isNull() | (ehail_fee == "")
staged_df = staged_df.withColumn(
    "ehail_fee",
    F.when(ehail_is_missing, None).otherwise(ehail_fee.cast("double")),
)

# Step 3: cast the remaining columns to their target types.
for column_name, target_type in COLUMN_CASTS:
    staged_df = staged_df.withColumn(column_name, F.col(column_name).cast(target_type))

cleaned_df = staged_df

print("‚úÖ Data cleaned and transformed")
print(f"   Records after cleaning: {cleaned_df.count()}")
print("\nüìä Cleaned data with proper timestamps:")
display(cleaned_df)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Data Quality Checks

# COMMAND ----------

separator = "=" * 80

print("üîç Running Data Quality Checks on Real Data...")
print(separator)

# Check 1: pickup date range and total row count
date_check = cleaned_df.select(
    F.min("lpep_pickup_datetime").alias("earliest_pickup"),
    F.max("lpep_pickup_datetime").alias("latest_pickup"),
    F.count("*").alias("total_records"),
).first()

print("1Ô∏è‚É£ Date Range Check:")
print(f"   Earliest pickup: {date_check['earliest_pickup']}")
print(f"   Latest pickup: {date_check['latest_pickup']}")
print(f"   Total records: {date_check['total_records']}")

# Check 2: number of nulls in each critical column (result alias -> source column)
critical_columns = {
    "null_vendor": "VendorID",
    "null_pickup": "lpep_pickup_datetime",
    "null_amount": "total_amount",
}
null_count_exprs = [
    F.sum(F.when(F.col(source_col).isNull(), 1).otherwise(0)).alias(result_alias)
    for result_alias, source_col in critical_columns.items()
]
null_check = cleaned_df.select(*null_count_exprs).first()

print("\n2Ô∏è‚É£ Null Value Check:")
print(f"   Null VendorID: {null_check['null_vendor']}")
print(f"   Null pickup_datetime: {null_check['null_pickup']}")
print(f"   Null total_amount: {null_check['null_amount']}")

# Check 3: row counts per vendor
print("\n3Ô∏è‚É£ Vendor Distribution:")
vendor_counts = cleaned_df.groupBy("VendorID").count()
vendor_dist = vendor_counts.orderBy("VendorID")
display(vendor_dist)

# Check 4: row counts per payment type
print("\n4Ô∏è‚É£ Payment Type Distribution:")
payment_counts = cleaned_df.groupBy("payment_type").count()
payment_dist = payment_counts.orderBy("payment_type")
display(payment_dist)

# Check 5: averages and total revenue
print("\n5Ô∏è‚É£ Trip Statistics:")
trip_aggregates = [
    F.avg("trip_distance").alias("avg_distance"),
    F.avg("fare_amount").alias("avg_fare"),
    F.avg("total_amount").alias("avg_total"),
    F.sum("total_amount").alias("total_revenue"),
]
stats = cleaned_df.select(*trip_aggregates).first()

print(f"   Average distance: {stats['avg_distance']:.2f} miles")
print(f"   Average fare: ${stats['avg_fare']:.2f}")
print(f"   Average total: ${stats['avg_total']:.2f}")
print(f"   Total revenue: ${stats['total_revenue']:.2f}")

print("\n" + separator)
print("‚úÖ Data quality checks complete!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Save Data to Source Location
# MAGIC 
# MAGIC Save the cleaned data to the volume location where Autoloader will pick it up.

# COMMAND ----------

# Save cleaned data to source location for Autoloader.
# SOURCE_DATA_PATH has no trailing slash, so the separator must be added here;
# concatenating directly produced ".../raw_datagreen_tripdata_2023_01.csv",
# i.e. a sibling of the source directory rather than an entry inside it.
output_file = f"{SOURCE_DATA_PATH.rstrip('/')}/green_tripdata_2023_01.csv"

# NOTE(review): the raw upload (UPLOADED_FILE) lives in this same directory,
# so anything that ingests SOURCE_DATA_PATH will see both the raw file and the
# cleaned output — confirm whether the raw file should be moved elsewhere.

print(f"üíæ Saving cleaned data to: {output_file}")

# Write as CSV with a header row. coalesce(1) reduces the frame to a single
# partition before writing (intended for this small dataset).
# NOTE(review): Spark treats the target path as an output directory, so
# output_file will be a folder holding the part file(s), not a single file.
(cleaned_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv(output_file)
)

print("‚úÖ Data saved successfully!")

# List the source directory so the new output is visible; a listing failure is
# reported but does not stop the notebook.
try:
    files = dbutils.fs.ls(SOURCE_DATA_PATH)
    print(f"\nüìÅ Files in source directory ({len(files)}):")
    for entry in files:
        print(f"   - {entry.name} ({entry.size:,} bytes)")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not list files: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 9. Verify Data is Ready for Autoloader

# COMMAND ----------

# Read back exactly what the save step wrote (output_file) so that the check
# covers the cleaned output. Reading SOURCE_DATA_PATH instead would pick up
# the raw uploaded CSV that is stored in that directory and report its rows.
print("üîç Verifying saved data...")

test_read = spark.read.option("header", "true").csv(output_file)
test_count = test_read.count()

# Compare against the in-memory cleaned frame; a mismatch is reported but does
# not stop the notebook.
expected_count = cleaned_df.count()
if test_count == expected_count:
    print(f"‚úÖ Successfully verified {test_count:,} records in source location")
else:
    print(f"WARNING: read back {test_count:,} records from {output_file} "
          f"but expected {expected_count:,}")

print("\nüìä Sample of saved data:")
display(test_read.limit(5))

# COMMAND ----------

# MAGIC %md
# MAGIC ## 10. Create Configuration Widget

# COMMAND ----------

# Expose the core identifiers as notebook widgets: (name, default value, label).
widget_definitions = [
    ("catalog", CATALOG, "Catalog"),
    ("schema", SCHEMA, "Schema"),
    ("volume", VOLUME, "Volume"),
]
for widget_name, widget_default, widget_label in widget_definitions:
    dbutils.widgets.text(widget_name, widget_default, widget_label)

# Collect every pipeline setting in one dictionary.
config = dict(
    catalog=CATALOG,
    schema=SCHEMA,
    volume=VOLUME,
    bronze_table=BRONZE_TABLE,
    fact_table=FACT_TABLE,
    dim_vendor=DIM_VENDOR,
    dim_ratecode=DIM_RATECODE,
    dim_payment_type=DIM_PAYMENT_TYPE,
    dim_trip_type=DIM_TRIP_TYPE,
    dim_date=DIM_DATE,
    dim_location=DIM_LOCATION,
    checkpoint_location=CHECKPOINT_LOCATION,
    source_data_path=SOURCE_DATA_PATH,
    cluster_columns=CLUSTER_COLUMNS,
    broadcast_threshold=BROADCAST_THRESHOLD,
)

# Publish the settings as a one-row temp view named pipeline_config.
# NOTE(review): temp views are presumably scoped to this Spark session, so
# other notebooks may only see it when they include this one (e.g. via %run)
# — confirm how downstream notebooks read the configuration.
config_df = spark.createDataFrame([config])
config_df.createOrReplaceTempView("pipeline_config")

print("‚úÖ Configuration saved to pipeline_config temp view")
print("\nüìã Configuration Summary:")
for setting_name in config:
    print(f"   {setting_name}: {config[setting_name]}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 11. Summary

# COMMAND ----------

# Final summary. The date range and the parsing / quality status lines are
# derived from the results computed in the data quality cell (date_check,
# null_check) instead of being printed as fixed text.

# A sum over zero rows yields None, hence the "or 0" fallbacks.
null_pickup_rows = null_check["null_pickup"] or 0
critical_null_rows = sum(
    (null_check[alias] or 0)
    for alias in ("null_vendor", "null_pickup", "null_amount")
)

print("üéâ Setup Complete with Real Data!")
print("=" * 80)
print("\n‚úÖ Environment Configuration:")
print(f"   Catalog: {CATALOG}")
print(f"   Schema: {SCHEMA}")
print(f"   Volume: {VOLUME}")

print("\n‚úÖ Data Loaded:")
print(f"   Records: {test_count:,}")
print(f"   Source: Real NYC Green Taxi data (2023)")
print(f"   Location: {SOURCE_DATA_PATH}")

print("\n‚úÖ Ready for Pipeline:")
print("   ‚úì Data cleaned and validated")
if null_pickup_rows == 0:
    print("   ‚úì Dates properly parsed")
else:
    print(f"   ! {null_pickup_rows:,} rows have a null pickup timestamp after parsing")
if critical_null_rows == 0:
    print("   ‚úì Data quality checks passed")
else:
    print(f"   ! Data quality checks found {critical_null_rows:,} nulls in critical columns")
print("   ‚úì Files ready for Autoloader")

print("\nüìä Data Summary:")
print(f"   Date range: {date_check['earliest_pickup']} to {date_check['latest_pickup']}")
print(f"   Vendors: {cleaned_df.select('VendorID').distinct().count()}")
print(f"   Payment types: {cleaned_df.select('payment_type').distinct().count()}")

print("\nüöÄ Next Steps:")
print("   1. ‚úÖ Setup complete")
print("   2. üì• Run 01_bronze_autoloader.py")
print("   3. ‚≠ê Run 02_silver_star_schema.py")
print("   4. üìä Run 03_gold_analytics.py")

print("\n" + "=" * 80)