## 03_bronze_autoloader_county_json
## Ingest County JSON into Bronze using Auto Loader (cloudFiles).


In [0]:
from pyspark.sql import functions as F
import uuid

# Catalog and schema that hold the Bronze target table.
CAT = "zillow"
BRONZE = "zillow_bronze"

# Directory the JSON input files are loaded from.
JSON_DIR = "/Volumes/zillow/zillow_medallion/raw-converted_format/json"
# Root directory; the stream's "/schema" and "/chk" folders live beneath it.
CKPT = "/Volumes/zillow/zillow_medallion/checkpoints/autoloader_county_json"

# Three-part target table name: <CAT>.<BRONZE>.county_ts_json_bronze.
tgt = ".".join((CAT, BRONZE, "county_ts_json_bronze"))

# Random UUID (as text) created once per execution and stamped on every
# ingested row as ingest_run_id.
run_id = f"{uuid.uuid4()}"

# Streaming read of the JSON files under JSON_DIR through Auto Loader
# (the "cloudFiles" source); its schema location is <CKPT>/schema.
json_source = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", f"{CKPT}/schema")
    .load(JSON_DIR)
)

# Audit columns attached to every record, applied in this order:
#   load_dt       - timestamp taken when the row is processed
#   source_file   - path of the input file, read from _metadata.file_path
#   ingest_mode   - constant tag identifying this ingestion route
#   ingest_run_id - the per-execution run_id
audit_columns = (
    ("load_dt", F.current_timestamp()),
    ("source_file", F.col("_metadata.file_path")),
    ("ingest_mode", F.lit("autoloader_json")),
    ("ingest_run_id", F.lit(run_id)),
)

stream_df = json_source
for column_name, column_expr in audit_columns:
    # withColumn replaces a same-named incoming column rather than duplicating it.
    stream_df = stream_df.withColumn(column_name, column_expr)

# Append the stream to the Bronze Delta table `tgt`, tracking progress in the
# checkpoint at <CKPT>/chk. The availableNow trigger processes the files that
# are currently available and then stops, so this behaves like a batch run.
q = (
    stream_df.writeStream
    .format("delta")
    .option("checkpointLocation", CKPT + "/chk")
    # No schema is supplied to Auto Loader, so it infers one and may add new
    # columns on a later run. Without mergeSchema the Delta append would then
    # reject the wider schema and the ingest would stay broken until someone
    # altered the table by hand.
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .outputMode("append")
    .toTable(tgt)
)

# Block until the run has finished so the statements that follow see the
# newly written rows.
q.awaitTermination()

# Attach a human-readable description to the target table.
comment_sql = (
    f"COMMENT ON TABLE {tgt} IS "
    "'County time series ingested as JSON using Auto Loader into Bronze.'"
)
spark.sql(comment_sql)

# Show the table's total row count as a quick check after the load.
row_count_sql = f"SELECT COUNT(*) AS rows FROM {tgt}"
row_count_df = spark.sql(row_count_sql)
display(row_count_df)