# In [0]:
import os
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, DateType, TimestampType
)
from pyspark.sql.utils import AnalysisException

# ---------------------------------------------
# Spark & schema
# ---------------------------------------------

# Bind the session explicitly instead of relying on a `spark` global being
# injected by the notebook runtime. getOrCreate() hands back the session that
# is already running when there is one, so it does not replace a session the
# environment provides.
spark = SparkSession.builder.getOrCreate()

# Create the target schema on first run and make it the current database.
spark.sql("CREATE schema IF NOT EXISTS airbnb_project")
spark.catalog.setCurrentDatabase("airbnb_project")



# ---------------------------------------------
# Config: location of the snapshot CSV files
# (presumably placed there by the "00" setup notebook — confirm)
# ---------------------------------------------
# Root directory that is scanned for snapshot sub-folders.
BASE_ROOT = "/Volumes/workspace/default/course_data/airbnb_toronto"
# File names a snapshot folder must contain before it is loaded.
REQUIRED_FILES = ["listings.csv", "calendar.csv", "reviews.csv", "neighbourhoods.csv"]

# ---------------------------------------------
# Explicit schemas for all tables, so every snapshot is read with the same
# column types (the original note says this fixes a merge error).
#
# NOTE(review): with an explicit schema, Spark's CSV reader applies fields by
# position by default (enforceSchema=true), so the field order below has to
# match the column order of the files — confirm against the real CSV headers.
# NOTE(review): id columns are 32-bit IntegerType — confirm the source ids fit,
# otherwise they need a wider type.
# ---------------------------------------------

# Columns read from listings.csv; every field is nullable.
listings_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("listing_url", StringType(), True)
    .add("name", StringType(), True)
    .add("host_id", IntegerType(), True)
    .add("host_name", StringType(), True)
    .add("neighbourhood_cleansed", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("room_type", StringType(), True)
    # price is kept as a raw string at this stage
    .add("price", StringType(), True)
    # ... add all other columns as needed ...
)

# Columns read from calendar.csv; every field is nullable.
calendar_schema = (
    StructType()
    .add("listing_id", IntegerType(), True)
    .add("date", DateType(), True)
    .add("available", StringType(), True)
    .add("price", StringType(), True)
    # ... add all other columns as needed ...
)

# Columns read from reviews.csv; every field is nullable.
reviews_schema = (
    StructType()
    .add("listing_id", IntegerType(), True)
    .add("id", IntegerType(), True)
    .add("date", DateType(), True)
    .add("reviewer_id", IntegerType(), True)
    .add("reviewer_name", StringType(), True)
    .add("comments", StringType(), True)
    # ... add all other columns as needed ...
)

# Columns read from neighbourhoods.csv; every field is nullable.
neighbourhoods_schema = (
    StructType()
    .add("neighbourhood", StringType(), True)
    .add("neighbourhood_group", StringType(), True)
    # ... add all other columns as needed ...
)

# ---------------------------------------------
# Helper: check if folder has all required CSVs
# ---------------------------------------------
def folder_has_all_files(folder: str, required_files=None) -> bool:
    """Return True when *folder* contains every required file.

    Args:
        folder: Directory to inspect.
        required_files: Iterable of file names that must exist directly inside
            *folder*. When omitted, the module-level ``REQUIRED_FILES`` list is
            used (looked up at call time).

    Returns:
        True if every name exists under *folder* (trivially True for an empty
        collection of names), otherwise False.
    """
    if required_files is None:
        required_files = REQUIRED_FILES
    return all(
        os.path.exists(os.path.join(folder, fname)) for fname in required_files
    )

# ---------------------------------------------
# Discover available snapshot folders
# ---------------------------------------------
def _is_snapshot_date(name: str) -> bool:
    """Return True if *name* is a real calendar date written as yyyy-MM-dd."""
    try:
        parsed = datetime.strptime(name, "%Y-%m-%d")
    except ValueError:
        return False
    # Round-trip so non-padded variants such as "2024-1-5" are rejected too.
    return parsed.strftime("%Y-%m-%d") == name


if not os.path.exists(BASE_ROOT):
    raise RuntimeError(f"Base root does not exist: {BASE_ROOT}")

# Immediate sub-directories of BASE_ROOT; plain files are ignored.
all_entries = [
    name for name in os.listdir(BASE_ROOT)
    if os.path.isdir(os.path.join(BASE_ROOT, name))
]

# Only keep folders that are named yyyy-MM-dd AND have all files.
# The folder name is later converted with F.to_date and used as the
# snapshot_date value, so a folder whose name is not a date must not be loaded.
# Sorting the ISO-formatted names lexicographically also orders them by date.
snapshot_dates = []
for name in sorted(all_entries):
    full_dir = os.path.join(BASE_ROOT, name)
    if not _is_snapshot_date(name):
        print(f"Skipping folder {full_dir} (name is not a yyyy-MM-dd date).")
        continue
    if not folder_has_all_files(full_dir):
        print(f"Skipping folder {full_dir} (missing required files).")
        continue
    snapshot_dates.append(name)

if not snapshot_dates:
    raise RuntimeError(f"No valid snapshot folders found under {BASE_ROOT}")

print(f"Discovered snapshot_dates with full data: {snapshot_dates}")

# ---------------------------------------------
# Helper: append rows for a given snapshot_date if not already loaded
# ---------------------------------------------
def append_if_new_snapshot(table_name: str, df, snapshot_date_str: str):
    """Load *df* into the Delta table *table_name* unless that snapshot is already there.

    Args:
        table_name: Name of the target table.
        df: DataFrame holding the rows of one snapshot; it is expected to
            carry a ``snapshot_date`` column.
        snapshot_date_str: Snapshot date as a string; it is converted with
            ``F.to_date`` and compared against the table's ``snapshot_date``.

    Behavior:
        * Table missing: it is created from *df* (initial load).
        * Table present and it already has a row for the snapshot date:
          nothing is written.
        * Table present without that snapshot date: *df* is appended.

    Raises:
        AnalysisException: propagated from the existence check or the append.
            Previously any such error silently fell through to an overwrite
            of the whole table, which could wipe snapshots loaded earlier.
    """
    # Decide "initial load vs. append" from the catalog rather than from an
    # exception, so unrelated analysis errors can never trigger an overwrite.
    if not spark.catalog.tableExists(table_name):
        (
            df.write
            .format("delta")
            .option("mergeSchema", "true")  # Allow schema evolution
            .mode("overwrite")
            .saveAsTable(table_name)
        )
        print(f"{table_name}: created and loaded snapshot_date {snapshot_date_str} (initial load).")
        return

    snap_date_lit = F.to_date(F.lit(snapshot_date_str))
    # limit(1) keeps the probe cheap: one matching row is enough to know.
    already = (
        spark.table(table_name)
        .filter(F.col("snapshot_date") == snap_date_lit)
        .limit(1)
        .count()
    )
    if already > 0:
        print(f"{table_name}: snapshot_date {snapshot_date_str} already loaded, skipping.")
        return

    (
        df.write
        .format("delta")
        .option("mergeSchema", "true")  # Allow schema evolution
        .mode("append")
        .saveAsTable(table_name)
    )
    print(f"{table_name}: appended snapshot_date {snapshot_date_str}.")

# ---------------------------------------------
# Main loop: process each snapshot_date
# ---------------------------------------------
def _read_snapshot_csv(folder, file_name, schema, snapshot_col, **reader_options):
    """Read one header-bearing CSV from *folder* and tag it with snapshot_date.

    Extra keyword arguments are passed to the reader as CSV options, in the
    order given, before the explicit *schema* is applied.
    """
    reader = spark.read.option("header", "true")
    for opt_name, opt_value in reader_options.items():
        reader = reader.option(opt_name, opt_value)
    csv_path = os.path.join(folder, file_name)
    return reader.schema(schema).csv(csv_path).withColumn("snapshot_date", snapshot_col)


for snapshot_date in snapshot_dates:
    print(f"\n=== Processing snapshot_date: {snapshot_date} ===")
    base_path = os.path.join(BASE_ROOT, snapshot_date)

    # Date literal added to every row as the snapshot_date column.
    snap_date_lit = F.to_date(F.lit(snapshot_date))

    # Listings are the only file read with multi-line and quote-escape options.
    # NOTE(review): reviews.csv is read without multiLine — confirm its
    # comments column never contains embedded line breaks.
    listings_df = _read_snapshot_csv(
        base_path, "listings.csv", listings_schema, snap_date_lit,
        multiLine="true", escape="\"", quote="\"",
    )
    calendar_df = _read_snapshot_csv(
        base_path, "calendar.csv", calendar_schema, snap_date_lit
    )
    reviews_df = _read_snapshot_csv(
        base_path, "reviews.csv", reviews_schema, snap_date_lit
    )
    neighbourhoods_df = _read_snapshot_csv(
        base_path, "neighbourhoods.csv", neighbourhoods_schema, snap_date_lit
    )

    # All four frames are built first; only then are the tables written,
    # in the same order as before.
    for target_table, frame in (
        ("airbnb_project.airbnb_raw_listings", listings_df),
        ("airbnb_project.airbnb_raw_calendar", calendar_df),
        ("airbnb_project.airbnb_raw_reviews", reviews_df),
        ("airbnb_project.airbnb_raw_neighbourhoods", neighbourhoods_df),
    ):
        append_if_new_snapshot(target_table, frame, snapshot_date)

print("\nBronze raw tables are up to date with all discovered snapshots.")