# Gold Layer – Affordability Index

This notebook builds the **Gold layer** table by:
- computing month-over-month percentage changes for wages and CPI (wage growth and inflation rate),
- joining Silver tables,
- computing a first version of the **Affordability Index**:

> Affordability = wage_growth - (inflation_rate + rent_change)

This first version uses **wages + CPI only**: `rent_change` is fixed at 0, so the index reduces to `wage_growth - inflation_rate`. Later versions will extend it to include **rent** and **unemployment**.


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Ensure the target Gold schema is present before anything is written to it.
spark.sql("CREATE SCHEMA IF NOT EXISTS mycatalog.gold")

# Read both Silver inputs.
wages_silver = spark.table("mycatalog.default.wages_silver")
cpi_silver = spark.table("mycatalog.default.cpi_silver")

# Sanity check: report the row count of each input ...
wages_row_count = wages_silver.count()
print("Rows in wages_silver:", wages_row_count)
cpi_row_count = cpi_silver.count()
print("Rows in cpi_silver:", cpi_row_count)

# ... then preview five rows of each, wages first.
for silver_df in (wages_silver, cpi_silver):
    display(silver_df.limit(5))


In [0]:
# Window by geo, ordered by date: lag() reads the previous row of the same geo.
w = Window.partitionBy("geo").orderBy("date")


def with_pct_change(df, value_col, out_col, window=w):
    """Return `df` with `out_col` appended: the % change of `value_col` vs. the previous row.

    "Previous row" is the row immediately before the current one in `window`
    (by default: same `geo`, ordered by `date`).

    Args:
        df: Input DataFrame; must contain `value_col` and the window's columns.
        value_col: Name of the numeric column to difference.
        out_col: Name of the percentage-change column to add.
        window: Window spec used to locate the previous row.

    Returns:
        `df` plus `out_col`. `out_col` is NULL for the first row of each
        partition (no previous value) and when the previous value is 0
        (percentage change is undefined).

    NOTE(review): the result is a true month-over-month change only if every
    geo has exactly one row per consecutive month -- TODO confirm against the
    Silver tables.
    """
    prev_col = f"prev_{value_col}"
    prev = F.col(prev_col)
    return (
        df
        .withColumn(prev_col, F.lag(value_col).over(window))
        .withColumn(
            out_col,
            # Guard the denominator: without the zero check the division raises
            # a divide-by-zero error when ANSI SQL mode is enabled.
            F.when(prev.isNull() | (prev == 0), None)
             .otherwise(100.0 * (F.col(value_col) - prev) / prev)
        )
        .drop(prev_col)
    )


# Wage growth: month-over-month % change
wages_with_growth = with_pct_change(wages_silver, "avg_wage", "wage_growth")

print("Sample wage_growth:")
display(wages_with_growth.orderBy("date").limit(10))

# Inflation rate: month-over-month % change in CPI
cpi_with_inflation = with_pct_change(cpi_silver, "cpi_value", "inflation_rate")

print("Sample inflation_rate:")
display(cpi_with_inflation.orderBy("date").limit(10))


In [0]:
# Keep only the join keys plus the single metric each side contributes.
join_keys = ["date", "geo"]
wage_side = wages_with_growth.select(*join_keys, "wage_growth")
cpi_side = cpi_with_inflation.select(*join_keys, "inflation_rate")

# Inner join: a (date, geo) pair survives only when both sides have a row for it.
gold_base = wage_side.join(cpi_side, on=join_keys, how="inner")

gold_base_row_count = gold_base.count()
print("Rows in gold_base:", gold_base_row_count)
display(gold_base.orderBy("date").limit(10))


In [0]:
# Only rows where both growth metrics are known can be scored.
scorable_rows = (
    gold_base
    .filter(F.col("wage_growth").isNotNull())
    .filter(F.col("inflation_rate").isNotNull())
)

# Placeholder columns for now: rent_change is 0 so the affordability formula
# still works; unemployment_rate_change is an all-NULL double column.
with_placeholders = (
    scorable_rows
    .withColumn("rent_change", F.lit(0.0).cast("double"))
    .withColumn("unemployment_rate_change", F.lit(None).cast("double"))
)

# Affordability = wage_growth - (inflation_rate + rent_change), rounded to 2 decimals.
cost_change = F.col("inflation_rate") + F.col("rent_change")
gold_with_index = with_placeholders.withColumn(
    "affordability_index",
    F.round(F.col("wage_growth") - cost_change, 2),
)

print("Sample Gold rows:")
display(gold_with_index.orderBy("date").limit(20))
gold_with_index.printSchema()


In [0]:
gold_table = "mycatalog.gold.affordability_gold"

# Replace the Gold table's contents with the freshly computed rows (Delta format).
gold_with_index.write.saveAsTable(gold_table, format="delta", mode="overwrite")

print(f"✅ Gold table created: {gold_table}")

# Read the table back to confirm what was actually persisted.
persisted_gold = spark.table(gold_table)
display(persisted_gold.orderBy("date").limit(20))


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Affordability index over time, read from the Gold table.
SELECT
  g.date,
  g.affordability_index
FROM mycatalog.gold.affordability_gold AS g
ORDER BY g.date ASC;


Databricks visualization. Run in Databricks to view.

In [0]:
# Load the Gold table and collect it to the driver as a pandas DataFrame
# (used for correlation and heatmap work).
gold_df = spark.read.table("mycatalog.gold.affordability_gold")
pdf = gold_df.toPandas()

pdf.head()


In [0]:
%sql
-- Wage growth, inflation rate and the resulting index side by side,
-- restricted to rows where both input metrics are present.
SELECT
  g.date,
  g.wage_growth,
  g.inflation_rate,
  g.affordability_index
FROM mycatalog.gold.affordability_gold AS g
WHERE g.wage_growth IS NOT NULL
  AND g.inflation_rate IS NOT NULL
ORDER BY g.date ASC;
