## 02_bronze_copy_into_zip_time_series
## Loads Zip_time_series parts into Bronze using COPY INTO (incremental simulation).
## Audit cols are added during insert into final bronze table.

In [0]:
from pyspark.sql import functions as F

# Catalog and schema names used to qualify the Bronze tables.
CAT = "zillow"
BRONZE = "zillow_bronze"

# Folder holding the zip_time_series part files, and the four parts listed in
# part-number order (zip_part_1.csv .. zip_part_4.csv).
BASE = "/Volumes/zillow/zillow_medallion/raw-converted_format/incremental/zip_time_series"
ZIP_PARTS = [f"{BASE}/zip_part_{part_no}.csv" for part_no in range(1, 5)]

# Fully qualified <catalog>.<schema>.<table> names for staging and target.
stg = ".".join((CAT, BRONZE, "zip_ts_stg"))
tgt = ".".join((CAT, BRONZE, "zip_ts_bronze"))

# Echo the resolved part paths so the run output shows what will be loaded.
print("Parts:", ZIP_PARTS)

# COMMAND ----------
# 1) Staging table: infer the column layout once from the first part file,
#    then persist that layout as an empty Delta table (limit(0) keeps the
#    schema but writes no rows). Any previous staging table is dropped first.
df0 = (
    spark.read
    .option("header","true")
    .option("inferSchema","true")
    .csv(ZIP_PARTS[0])
)
spark.sql(f"DROP TABLE IF EXISTS {stg}")
df0.limit(0).write.saveAsTable(stg, format="delta", mode="overwrite")

# 2) Target Bronze table: the staging columns plus four typed-NULL audit
#    columns. "WHERE 1=0" selects no rows, so the CTAS only defines the schema;
#    IF NOT EXISTS leaves an already existing target table untouched.
target_ddl = f"""
CREATE TABLE IF NOT EXISTS {tgt}
USING DELTA
AS SELECT
  s.*,
  CAST(NULL AS TIMESTAMP) AS load_dt,
  CAST(NULL AS STRING) AS source_path,
  CAST(NULL AS STRING) AS ingest_mode,
  CAST(NULL AS INT) AS part_id
FROM {stg} s
WHERE 1=0
"""
spark.sql(target_ddl)

# Attach a human-readable description to the target table.
target_comment_sql = f"COMMENT ON TABLE {tgt} IS 'Zip time series Bronze. Loaded via COPY INTO per part arrival; audit cols added at insert.'"
spark.sql(target_comment_sql)

# COMMAND ----------
# 3) Load each part (COPY INTO -> STG, then INSERT into TGT with audit cols)
# Every part is staged on its own so the audit columns (load_dt, source_path,
# ingest_mode, part_id) can be stamped per part when its rows are appended to
# the target table.
for i, part_path in enumerate(ZIP_PARTS, start=1):
    print(f"\n--- Loading part {i}: {part_path}")

    # Re-run guard: the INSERT below is a plain append and the staging table is
    # recreated on every run, so without this check a second run of the
    # notebook would append every part again and duplicate the Bronze rows.
    # A part counts as loaded once its source_path is recorded in the target.
    already_loaded = spark.sql(
        f"SELECT 1 FROM {tgt} WHERE source_path = '{part_path}' LIMIT 1"
    ).collect()
    if already_loaded:
        print("Part already present in target; skipping.")
        continue

    # Clear staging so it only ever holds the rows of the current part
    spark.sql(f"TRUNCATE TABLE {stg}")

    # COPY INTO staging (requirement)
    spark.sql(f"""
    COPY INTO {stg}
    FROM '{part_path}'
    FILEFORMAT = CSV
    FORMAT_OPTIONS ('header'='true', 'inferSchema'='true')
    """)

    # Insert into final with audit cols; s.* must stay first because the
    # INSERT maps columns by position and the target was created as
    # <staging columns> followed by the four audit columns.
    spark.sql(f"""
    INSERT INTO {tgt}
    SELECT
      s.*,
      current_timestamp() AS load_dt,
      '{part_path}' AS source_path,
      'copy_into_csv' AS ingest_mode,
      {i} AS part_id
    FROM {stg} s
    """)

    # Staging holds exactly the rows just appended, so its count is the
    # number of rows inserted for this part.
    rows = spark.sql(f"SELECT COUNT(*) AS c FROM {stg}").collect()[0]["c"]
    print("Rows inserted:", rows)

# Quick evidence: row count per part in the target table
display(spark.sql(f"SELECT part_id, COUNT(*) cnt FROM {tgt} GROUP BY part_id ORDER BY part_id"))
