In [1]:
# Purpose: Build Gold-layer tables (aggregated features) from fema_nfip_claims_silver.
# Aligns with CSV-based Silver schema (county_code, zip, claim_id, total_paid, etc.).

from pyspark.sql.functions import sum as _sum, countDistinct, avg

# Input: Silver-layer claims table that the cells below read with spark.table().
SILVER_TBL = "fema_nfip_claims_silver"

# Outputs: Gold-layer Delta tables written by this notebook.
GOLD_GEO_TBL = "fema_nfip_geo_year_gold"        # aggregates per state / county_code / loss_year
GOLD_CLAIMS_TBL = "fema_nfip_claims_fact_gold"  # per-claim column subset of the Silver table


StatementMeta(, 4100e43e-5170-4d1a-9076-31c898cb6d86, 3, Finished, Available, Finished)

In [2]:
# Load the Silver claims table; `silver` is reused by the claims-fact cell below.
silver = spark.table(SILVER_TBL)

# --- 1) County-year aggregates (for risk features) ---
# One output row per (state, county_code, loss_year) combination.
geo_keys = ["state", "county_code", "loss_year"]

# Aggregate expressions, listed in the order the columns appear in the Gold table.
geo_metrics = [
    # Paid amounts summed over every claim row in the group.
    _sum("total_paid").alias("paid_total"),
    _sum("paid_building").alias("paid_building_total"),
    _sum("paid_contents").alias("paid_contents_total"),
    _sum("paid_icc").alias("paid_icc_total"),
    # Distinct claim ids, so a claim_id repeated in Silver is counted once.
    countDistinct("claim_id").alias("claims_count"),
    # Mean coordinates of the claims in the group.
    avg("latitude").alias("avg_latitude"),
    avg("longitude").alias("avg_longitude"),
]

gold_geo = silver.groupBy(*geo_keys).agg(*geo_metrics)

# Persist as a Delta table, replacing whatever a previous run wrote.
(
    gold_geo.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(GOLD_GEO_TBL)
)

StatementMeta(, 4100e43e-5170-4d1a-9076-31c898cb6d86, 4, Finished, Available, Finished)

In [3]:
# --- 2) Claims fact table (for drill-down and AI agent use) ---
# Plain projection of the Silver table: no filtering or aggregation is applied,
# only the columns listed here are carried over, in this order.
claims_fact_columns = [
    # identifiers and time
    "claim_id", "loss_date", "loss_year",
    # geography
    "state", "county_code", "zip",
    # paid amounts
    "total_paid", "paid_building", "paid_contents", "paid_icc",
    # coordinates
    "latitude", "longitude",
    # descriptive attributes (camelCase names are kept exactly as they are in Silver)
    "ratedFloodZone", "floodEvent", "floodZoneCurrent", "occupancyType",
]

claims_fact = silver.select(*claims_fact_columns)

# Persist as a Delta table, replacing whatever a previous run wrote.
(
    claims_fact.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(GOLD_CLAIMS_TBL)
)


StatementMeta(, 4100e43e-5170-4d1a-9076-31c898cb6d86, 5, Finished, Available, Finished)

In [4]:
# --- 3) Create helpful SQL views for agents / Power BI ---

# Risk-feature view over the county-year Gold table. The CASE guards the
# division: groups with no counted claims report 0 for avg_paid_per_claim.
spark.sql(f"""
CREATE OR REPLACE VIEW vw_nfip_risk_features AS
SELECT
  state,
  county_code,
  loss_year,
  paid_total,
  claims_count,
  CASE WHEN claims_count > 0 THEN paid_total / claims_count ELSE 0 END AS avg_paid_per_claim
FROM {GOLD_GEO_TBL}
""")

# Pass-through view exposing every column of the claims fact table.
spark.sql(f"""
CREATE OR REPLACE VIEW vw_nfip_claims AS
SELECT * FROM {GOLD_CLAIMS_TBL}
""")

# Row-count report. The labels are built from the table-name constants so the
# printed names always match the tables that were actually written.
print("Gold tables created successfully.")
print(f"Rows in {GOLD_GEO_TBL}:", spark.table(GOLD_GEO_TBL).count())
print(f"Rows in {GOLD_CLAIMS_TBL}:", spark.table(GOLD_CLAIMS_TBL).count())


StatementMeta(, 4100e43e-5170-4d1a-9076-31c898cb6d86, 6, Finished, Available, Finished)

Gold tables created successfully.
Rows in nfip_geo_year_gold: 58529
Rows in nfip_claims_fact_gold: 2718200
