# Gold Layer – Affordability Index

This notebook builds the **Gold layer** table by:
- computing 12-month (year-over-year) percentage changes for wages, CPI, and unemployment,
- joining Silver tables,
- computing a first version of the **Affordability Index**:

> Affordability = wage_growth - (inflation_rate + rent_change + unemployment_rate_change)

The table is built in stages: a base of **wages + CPI** is created first, and it is then extended with **rent** (yearly, joined by year) and **unemployment** (monthly, joined by date).


In [0]:
# =========================================================
# 3_gold_affordability_index.ipynb
# Build Gold Affordability Index table from Silver layer
# =========================================================

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# -------------------------------------------
# 1. Load Silver tables
# -------------------------------------------

# Silver inputs, each read by its fully qualified table name.
wages_silver = spark.table("mycatalog.default.wages_silver")
cpi_silver = spark.table("mycatalog.default.cpi_silver")
unemp_silver = spark.table("mycatalog.default.unemployment_silver")
rent_silver = spark.table("mycatalog.default.rent_silver")

# Row counts as a quick sanity check (each count triggers a Spark job).
print("Rows:")
for label, frame in (
    ("  wages_silver:", wages_silver),
    ("  cpi_silver  :", cpi_silver),
    ("  unemp_silver:", unemp_silver),
    ("  rent_silver :", rent_silver),
):
    print(label, frame.count())

# Preview the first rows of the rent, wage and CPI inputs.
for preview in (rent_silver, wages_silver, cpi_silver):
    display(preview.limit(5))


In [0]:
# -------------------------------------------
# 2. Compute 12-month percentage changes
#    for wages, CPI, unemployment
# -------------------------------------------

# Window over time per geo (also used by the CPI and unemployment cells below).
# NOTE(review): lag(..., 12) is a *row* offset, so the result is a true
# 12-month change only if every geo has exactly one row per month with no
# gaps or duplicates — TODO confirm against the Silver layer.
w = Window.partitionBy("geo").orderBy("date")

# WAGES: 12-month % change in avg_wage
wages_with_growth = (
    wages_silver
    .withColumn("prev_wage_12m", F.lag("avg_wage", 12).over(w))
    .withColumn(
        "wage_growth",
        # Divide only when the baseline exists and is non-zero. A NULL baseline
        # (first 12 rows per geo) makes the condition NULL, and a when() with no
        # otherwise() yields NULL, so both cases give NULL growth. This also
        # avoids a divide-by-zero error when ANSI mode is enabled.
        F.when(
            F.col("prev_wage_12m") != 0,
            100.0 * (F.col("avg_wage") - F.col("prev_wage_12m")) / F.col("prev_wage_12m"),
        )
    )
    .drop("prev_wage_12m")
)

display(wages_with_growth.orderBy("date").limit(30))


In [0]:
# CPI: 12-month % change in cpi_value
# Uses the per-geo, date-ordered window `w` defined in the wages cell.
# NOTE(review): lag(..., 12) is a row offset; assumes one row per geo per
# month with no gaps — TODO confirm.
cpi_with_inflation = (
    cpi_silver
    .withColumn("prev_cpi_12m", F.lag("cpi_value", 12).over(w))
    .withColumn(
        "inflation_rate",
        # Divide only when the baseline exists and is non-zero; a NULL or zero
        # baseline yields NULL (when() without otherwise()), which also avoids
        # a divide-by-zero error when ANSI mode is enabled.
        F.when(
            F.col("prev_cpi_12m") != 0,
            100.0 * (F.col("cpi_value") - F.col("prev_cpi_12m")) / F.col("prev_cpi_12m"),
        )
    )
    .drop("prev_cpi_12m")
)

display(cpi_with_inflation.orderBy("date").limit(15))


In [0]:
# UNEMPLOYMENT: 12-month % change in unemployment_rate
# This is a *relative* change of the rate (e.g. 4 -> 5 gives 25.0), not a
# difference in percentage points.
# Uses the per-geo, date-ordered window `w` defined in the wages cell.
# NOTE(review): lag(..., 12) is a row offset; assumes one row per geo per
# month with no gaps — TODO confirm.
unemp_with_change = (
    unemp_silver
    .withColumn("prev_unemp_12m", F.lag("unemployment_rate", 12).over(w))
    .withColumn(
        "unemployment_rate_change",
        # Divide only when the baseline exists and is non-zero; a NULL or zero
        # baseline yields NULL (when() without otherwise()), which also avoids
        # a divide-by-zero error when ANSI mode is enabled.
        F.when(
            F.col("prev_unemp_12m") != 0,
            100.0 * (F.col("unemployment_rate") - F.col("prev_unemp_12m")) / F.col("prev_unemp_12m"),
        )
    )
    .drop("prev_unemp_12m")
)

display(unemp_with_change.orderBy("date").limit(15))


In [0]:
# -------------------------------------------
# 3. Build base Gold (wages + CPI)
#    inner join to keep overlapping dates
# -------------------------------------------

# Keep only the join keys plus the level and growth column from each side.
wages_for_gold = wages_with_growth.select("date", "geo", "avg_wage", "wage_growth")
cpi_for_gold = cpi_with_inflation.select("date", "geo", "cpi_value", "inflation_rate")

# Inner join: only (date, geo) pairs present in both inputs survive.
gold_base = wages_for_gold.join(cpi_for_gold, on=["date", "geo"], how="inner")

print("gold_base rows:", gold_base.count())
display(gold_base.orderBy("date").limit(15))


In [0]:
# -------------------------------------------
# 4. Prepare yearly rent_change and add YEAR
# -------------------------------------------

# rent_silver already has: date (year-01-01), geo, rent_value, rent_change_percentage
# Project straight to the year-keyed shape used for the join in step 5.
rent_yearly = rent_silver.select(
    F.year("date").alias("year"),
    "geo",
    "rent_value",
    "rent_change_percentage",
)

display(rent_yearly.orderBy("year").limit(10))


In [0]:
# -------------------------------------------
# 5. Add YEAR to gold_base and join rent (LEFT)
# -------------------------------------------

# Derive the calendar year of each monthly row so it can be matched to yearly rent.
gold_base_y = gold_base.withColumn("year", F.year("date"))

# Left join: every wage/CPI month is kept; rent columns are NULL for
# (year, geo) pairs that have no rent row.
rent_join_keys = ["year", "geo"]
gold_with_rent = gold_base_y.join(rent_yearly, on=rent_join_keys, how="left")

print("gold_with_rent rows:", gold_with_rent.count())
display(gold_with_rent.orderBy("date").limit(15))


In [0]:
# Restrict to the period where rent exists.
# Note: step 6 builds on gold_with_rent_nonnull, so this cell must be run.
has_rent_change = F.col("rent_change_percentage").isNotNull()
gold_with_rent_nonnull = gold_with_rent.where(has_rent_change)

print("gold_with_rent_nonnull rows:", gold_with_rent_nonnull.count())
display(gold_with_rent_nonnull.orderBy("date").limit(15))


In [0]:
# -------------------------------------------
# 6. Join unemployment change (monthly, by date)
# -------------------------------------------

# Only the join keys and the change column are needed from the unemployment side.
unemp_change_cols = unemp_with_change.select("date", "geo", "unemployment_rate_change")

# Left join: rows without a matching unemployment record are kept with a NULL change.
gold_full = gold_with_rent_nonnull.join(
    unemp_change_cols,
    on=["date", "geo"],
    how="left",
)

print("gold_full rows:", gold_full.count())
display(gold_full.orderBy("date").limit(15))


In [0]:
# -------------------------------------------
# 7. Compute Affordability Index
#    Affordability = wage_growth - (inflation_rate + rent_change + unemp_change)
# -------------------------------------------

# Sum of the three terms subtracted from wage growth.
cost_pressure = (
    F.col("inflation_rate")
    + F.col("rent_change_percentage")
    + F.col("unemployment_rate_change")
)

# Index rounded to 2 decimals. If any input is NULL (e.g. the first 12 rows
# per geo, or a month with no unemployment match) the index is NULL as well.
affordability_index = F.round(F.col("wage_growth") - cost_pressure, 2)

# Final Gold column set and order.
gold_columns = [
    "date",
    "geo",
    "avg_wage",
    "wage_growth",
    "cpi_value",
    "inflation_rate",
    "rent_value",
    "rent_change_percentage",
    "unemployment_rate_change",
    "affordability_index",
]

gold_final = (
    gold_full
    .withColumn("affordability_index", affordability_index)
    .select(*gold_columns)
    .orderBy("date")
)

display(gold_final.limit(20))


In [0]:
# -------------------------------------------
# 8. Save Gold table to Unity Catalog
# -------------------------------------------

gold_table_name = "mycatalog.gold.affordability_gold"

# Overwrite the Delta table, allowing the stored schema to be replaced as well.
(
    gold_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(gold_table_name)
)

print(f"✅ Gold table created: {gold_table_name}")

# Read the table back to confirm what was actually persisted.
saved_gold = spark.table(gold_table_name)
print("Row count:", saved_gold.count())
display(saved_gold.orderBy("date").limit(20))


In [0]:
# -------------------------------------------
# 9. Quick visualization: Affordability Index over time
# -------------------------------------------

# Read the table through gold_table_name (set in step 8) so the chart always
# follows the table that was actually written, even if the name changes.
# geo is kept next to date so that series for different geos can be separated
# (e.g. as the chart's grouping key) instead of being merged into one line.
display(
    spark.table(gold_table_name)
         .select("date", "geo", "affordability_index")
         .orderBy("date", "geo")
)
