# This notebook reads the wages data from the Bronze table, cleans it, and saves it to the Silver table.

In [0]:
# COMMAND ----------
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Three-part names of the target (Silver) and source (Bronze) tables.
wages_silver_table = "mycatalog.default.wages_silver"
wages_bronze_table = "mycatalog.default.wages_bronze"

# Load the Bronze wages table as a DataFrame.
print("Reading from Bronze table: " + wages_bronze_table)
wages_bronze_df = spark.table(wages_bronze_table)

# Inspect the column names/types, then preview up to 10 rows.
print("Schema for wages_bronze:")
wages_bronze_df.printSchema()

print("Sample rows from wages_bronze:")
display(wages_bronze_df.limit(10))


In [0]:
%python
# Sanity checks: total row count plus the distinct GEO and NAICS values,
# so that filtering can be written against the values actually present.

print(f"Row count in wages_bronze: {wages_bronze_df.count()}")

print("Distinct GEO values:")
display(wages_bronze_df.select(F.col("GEO")).distinct())

print("Distinct NAICS values:")
display(
    wages_bronze_df
    .select(F.col("North_American_Industry_Classification_System__NAICS_"))
    .distinct()
)

In [0]:
# COMMAND ----------
# Narrow the Bronze data down to the rows and columns needed downstream:
#   - rows:    trimmed GEO contains "Alberta" AND NAICS starts with
#              "Industrial aggregate"
#   - columns: REF_DATE, GEO, VALUE (renamed to date_raw, geo, avg_wage_raw)

wages_filtered_df = (
    wages_bronze_df
    .where(
        # Both conditions must hold; equivalent to two chained filters.
        (F.trim(F.col("GEO")).like("%Alberta%"))
        & (
            F.col("North_American_Industry_Classification_System__NAICS_")
            .like("Industrial aggregate%")
        )
    )
    .select(
        F.col("REF_DATE").alias("date_raw"),
        F.col("GEO").alias("geo"),
        F.col("VALUE").alias("avg_wage_raw"),
    )
)

print(
    f"Rows after Alberta + Industrial aggregate filter: "
    f"{wages_filtered_df.count()}"
)

display(wages_filtered_df.limit(10))


In [0]:
# COMMAND ----------
# Convert the raw columns to their final types:
#   - date_raw     -> date      (cast to string, parsed with "yyyy-MM")
#   - avg_wage_raw -> avg_wage  (cast to double)
# The raw columns are not carried forward; the result has the columns
# geo, date, avg_wage in that order.

wages_clean_df = wages_filtered_df.select(
    F.col("geo"),
    F.to_date(F.col("date_raw").cast("string"), "yyyy-MM").alias("date"),
    F.col("avg_wage_raw").cast("double").alias("avg_wage"),
)

print("Cleaned wages data (date + avg_wage):")
display(wages_clean_df.orderBy("date").limit(20))


In [0]:
# COMMAND ----------
# Compute 12-month wage_growth (%)
#
# wage_growth = (avg_wage - avg_wage 12 rows earlier) / (avg_wage 12 rows
# earlier) * 100, rounded to 2 decimals.
#
# The window is partitioned by geo so that:
#   - each geography's series is lagged independently (rows of different
#     geo values are never compared with each other), and
#   - Spark does not have to move the whole dataset into a single partition,
#     which is what an un-partitioned ordered window does.
# With a single geo value the result is the same as an un-partitioned window.
#
# NOTE(review): lag(12) counts rows, not calendar months. It equals a
# year-over-year comparison only if each geo has exactly one row per month
# with no gaps -- confirm against the filtered data.
# Rows without a value 12 rows earlier get a null wage_growth.

w = Window.partitionBy("geo").orderBy("date")

wages_with_growth_df = (
    wages_clean_df
    # Temporary helper column holding the wage from 12 rows earlier.
    .withColumn(
        "avg_wage_lag_12",
        F.lag("avg_wage", 12).over(w)
    )
    .withColumn(
        "wage_growth",
        F.round(
            (F.col("avg_wage") - F.col("avg_wage_lag_12"))
            / F.col("avg_wage_lag_12") * 100,
            2
        )
    )
    # The helper column is not part of the output.
    .drop("avg_wage_lag_12")
)

print("Sample with 12-month wage growth:")
display(wages_with_growth_df.orderBy("date").limit(25))


In [0]:
# COMMAND ----------
# Final Silver layout: pin the exact column set and order to be written.

wages_silver_df = wages_with_growth_df.select(
    F.col("date"),
    F.col("geo"),
    F.col("avg_wage"),
    F.col("wage_growth"),
)

print(f"Row count in wages_silver (before write): {wages_silver_df.count()}")

display(wages_silver_df.orderBy("date").limit(20))


In [0]:
%python
# Write the Silver table, replacing both its data and its schema.
#
# No separate "ALTER TABLE ... ADD COLUMNS (wage_growth DOUBLE)" is issued
# before the write: that statement fails when the table does not exist yet
# and fails again on every re-run once wage_growth is already present.
# It is also unnecessary, because mode("overwrite") together with
# overwriteSchema=true replaces the table schema with the DataFrame's schema
# (date, geo, avg_wage, wage_growth).
(
    wages_silver_df
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(wages_silver_table)
)

print(f"✅ Successfully updated Silver table: {wages_silver_table}")

# Verify: read the table back and preview the earliest rows by date.
wages_silver_check = spark.table(wages_silver_table)
display(wages_silver_check.orderBy("date").limit(20))