In [0]:
# Echo the configured target catalog, schema and volume names.
print(
    "Running init notebook with..."
    f"\n\nTARGET_CATALOG={TARGET_CATALOG}"
    f"\n\nTARGET_SCHEMA={TARGET_SCHEMA}"
    f"\n\nTARGET_VOLUME={TARGET_VOLUME}"
)

In [0]:
import urllib.request
import os, re
from pyspark.sql.utils import AnalysisException

# All CSVs live in the same directory of the demo repository, so the mapping
# of file name -> raw download URL is derived from a single base URL.
_DATA_BASE_URL = (
    "https://raw.githubusercontent.com/arjuncode/dbrx-12daysofdemos/"
    "main/00-init/data/"
)

# Order matters only for the order in which files are processed and logged.
_CSV_FILENAMES = (
    "behavioral_analytics.csv",
    "delivery_optimization.csv",
    "gift_requests.csv",
    "holiday_sales_and_trends.csv",
    "reindeer_telemetry.csv",
    "santa_letters_canada.csv",
    "santa_letters_canada_with_emails.csv",
    "workshop_production.csv",
)

files_to_download = {name: _DATA_BASE_URL + name for name in _CSV_FILENAMES}

# Options handed to the Spark CSV reader for every file that is loaded.
CSV_OPTIONS = dict(
    header="true",
    inferSchema="true",
    multiLine="true",
)

# ================================================================
# 1. Use catalog and ensure schema + volume
# ================================================================

# Select the target catalog; an AnalysisException is re-raised as a
# RuntimeError that carries a hint about existence and access.
try:
    spark.sql(f"USE CATALOG {TARGET_CATALOG}")
except AnalysisException as exc:
    raise RuntimeError(
        f"Could not USE CATALOG {TARGET_CATALOG}. Make sure it exists and you have access."
    ) from exc
else:
    print(f"✓ Using catalog: {TARGET_CATALOG}")

# Create the schema if it is missing (no-op when it already exists).
schema_fqn = f"{TARGET_CATALOG}.{TARGET_SCHEMA}"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_fqn}")
print(f"✓ Schema exists: {schema_fqn}")

# Create the volume inside that schema if it is missing.
volume_fqn = f"{schema_fqn}.{TARGET_VOLUME}"
spark.sql(f"CREATE VOLUME IF NOT EXISTS {volume_fqn}")
print(f"✓ Volume exists: {volume_fqn}")

# Filesystem path of the volume; the trailing slash lets file names be appended directly.
volume_path = f"/Volumes/{TARGET_CATALOG}/{TARGET_SCHEMA}/{TARGET_VOLUME}/"
print(f"Volume path: {volume_path}")

# ================================================================
# 2. Download CSVs from GitHub → UC Volume
# ================================================================

# Fetch each CSV straight into the volume. A failed download is reported and
# the loop carries on with the remaining files.
for filename, url in files_to_download.items():
    dest_path = f"{volume_path}{filename}"
    print(f"\nDownloading {filename} from {url}")
    try:
        urllib.request.urlretrieve(url, dest_path)
    except Exception as err:
        print(f"✗ Error downloading {filename}: {err}")
    else:
        print(f"✓ Downloaded to {dest_path}")

print("\n=== Download complete. Listing files in volume ===")
# Show only the CSV entries of the volume together with their sizes; a listing
# failure is reported but does not stop the notebook.
try:
    for entry in dbutils.fs.ls(volume_path):
        if not entry.name.endswith(".csv"):
            continue
        print(f"✓ {entry.name} ({entry.size} bytes)")
except Exception as err:
    print(f"Error listing volume: {err}")

# ================================================================
# 3. Create UC tables from the CSVs in the volume
# ================================================================

def clean_table_name(filename: str) -> str:
    """Derive a table name from a file name.

    The final extension is dropped, every character other than an ASCII
    letter, digit or underscore is replaced by ``_``, and the result is
    lower-cased. If nothing is left, ``"table_from_csv"`` is returned.
    """
    stem, _extension = os.path.splitext(filename)
    sanitized = re.sub(r"[^0-9a-zA-Z_]", "_", stem).lower()
    if not sanitized:
        return "table_from_csv"
    return sanitized

# Load every CSV from the volume into its own Unity Catalog table. A failure
# on one file is reported and the loop moves on to the next; the tables that
# failed are collected so the closing message reflects what actually happened
# instead of always claiming success.
failed_tables = []

for filename in files_to_download:
    table_name      = clean_table_name(filename)
    full_table_name = f"{TARGET_CATALOG}.{TARGET_SCHEMA}.{table_name}"
    csv_path        = volume_path + filename

    print(f"\n=== Loading {csv_path} into {full_table_name} ===")

    try:
        # Apply all reader options in one call rather than one .option() per key.
        df = spark.read.options(**CSV_OPTIONS).csv(csv_path)
        print(f"Rows read: {df.count()}")

        # "overwrite" replaces the table if it already exists.
        df.write.mode("overwrite").saveAsTable(full_table_name)
        print(f"✓ Created / replaced table {full_table_name}")
    except Exception as e:
        failed_tables.append(full_table_name)
        print(f"✗ Error creating table {full_table_name} from {csv_path}: {e}")

if failed_tables:
    print(
        f"\n✗ Finished with errors: {len(failed_tables)} table(s) were not created: "
        + ", ".join(failed_tables)
    )
else:
    print("\nAll done! Tables are ready in Unity Catalog.")

In [0]:
def prepare_source_table_schema(table_name):
    """Add the ``en_route``, ``delivered`` and ``cookies`` columns if missing.

    Each column that is not yet present on ``table_name`` is added with
    ``ALTER TABLE ... ADD COLUMN``. For the two BOOLEAN flags the existing
    rows are then set to ``False``; ``cookies`` (INT) is left NULL. Columns
    that already exist are not modified.

    Args:
        table_name: Name of the table to alter. It is interpolated into the
            SQL statements as-is, so it must come from a trusted source.

    Raises:
        Exception: Any error raised while reading or altering the table is
            printed and then re-raised unchanged.
    """
    # (column name, SQL type, literal used to back-fill existing rows or None)
    required_columns = (
        ("en_route", "BOOLEAN", "False"),
        ("delivered", "BOOLEAN", "False"),
        # No back-fill needed: rows that existed before ADD COLUMN already
        # read as NULL, so an UPDATE to NULL would only rewrite the table.
        ("cookies", "INT", None),
    )

    print(f"Ensuring schema columns exist for {table_name}...")
    try:
        existing_columns = set(spark.table(table_name).columns)

        for column, sql_type, backfill in required_columns:
            if column in existing_columns:
                continue
            spark.sql(f"ALTER TABLE {table_name} ADD COLUMN {column} {sql_type}")
            if backfill is not None:
                spark.sql(f"UPDATE {table_name} SET {column} = {backfill}")

        print("✓ Schema meets requirements.")

    except Exception as e:
        print(f"Error modifying source table. Error: {e}")
        # Bare raise keeps the original exception and traceback intact.
        raise

# Target the gift_requests table in the same catalog and schema the CSV load
# step writes to, rather than a hard-coded location that only matches when the
# notebook is run against one specific catalog/schema.
source_table_name = f"{TARGET_CATALOG}.{TARGET_SCHEMA}.gift_requests"
prepare_source_table_schema(source_table_name)

# Set the delta.enableChangeDataFeed table property on the source table.
spark.sql(f"ALTER TABLE {source_table_name} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")