#This notebook reads the CPI and wages data from the Bronze tables, cleans it, and saves it to the Silver tables.

## 2.1 Clean CPI (Inflation)

- **Source:** `mycatalog.default.cpi_bronze`
- **Transforms:**
    - Filter for `GEO` = "Alberta"
    - Filter for `Products_and_product_groups` = "All-items"
    - Select and rename key columns
    - Cast data to correct types (date, double)
- **Output:** `mycatalog.default.cpi_silver`

In [0]:
from pyspark.sql.functions import col, to_date

# Fully qualified names of the input (Bronze) and output (Silver) tables.
bronze_table = "mycatalog.default.cpi_bronze"
silver_table = "mycatalog.default.cpi_silver"

print(f"Reading from Bronze table: {bronze_table}")

bronze_df = spark.table(bronze_table)

print("Transforming CPI data...")

# Keep only the rows for Alberta in the "All-items" product group.
alberta_all_items = (col("GEO") == "Alberta") & (
    col("Products_and_product_groups") == "All-items"
)

# Project the three columns of interest, renaming and casting in one pass:
#   REF_DATE -> date       (parsed with the "yyyy-MM" pattern)
#   GEO      -> geo
#   VALUE    -> cpi_value  (cast to double)
silver_df = bronze_df.where(alberta_all_items).select(
    to_date(col("REF_DATE"), "yyyy-MM").alias("date"),
    col("GEO").alias("geo"),
    col("VALUE").cast("double").alias("cpi_value"),
)

print(f"Writing to Silver table: {silver_table}")
# Overwrite mode replaces the existing contents of the Silver table on each run.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

print("✅ Successfully created Silver table: cpi_silver")
# Read the table back so the displayed rows are what was actually persisted.
display(spark.table(silver_table))

## 2.2 Clean Wages

- **Source:** `mycatalog.default.wages_bronze`
- **Transforms:**
    - Filter for `GEO` = "Alberta"
    - Filter for `North_American_Industry_Classification_System_NAICS` = "Industrial aggregate excluding unclassified businesses"
    - Select and rename key columns
    - Cast data to correct types (date, double)
- **Output:** `mycatalog.default.wages_silver`

In [0]:
# Inspect the distinct values of the two columns the Wages cleaning step
# filters on.
# The Wages Bronze table is read explicitly here: when the notebook is run top
# to bottom, `bronze_df` still refers to the CPI Bronze table from section 2.1
# at this point, and it is only rebound to the Wages table in the next cell.
wages_preview_df = spark.table("mycatalog.default.wages_bronze")
display(wages_preview_df.select("GEO").distinct())
display(wages_preview_df.select("North_American_Industry_Classification_System_NAICS").distinct())

In [0]:
from pyspark.sql.functions import col, to_date

# Fully qualified names of the input (Bronze) and output (Silver) tables.
bronze_table = "mycatalog.default.wages_bronze"
silver_table = "mycatalog.default.wages_silver"

print(f"Reading from Bronze table: {bronze_table}")

bronze_df = spark.table(bronze_table)

print("Transforming Wages data...")

# Column holding the industry classification, and the single value to keep.
industry_column_name = "North_American_Industry_Classification_System_NAICS"
industry_filter_value = "Industrial aggregate excluding unclassified businesses"

# Keep only the Alberta rows for the selected industry aggregate.
alberta_industry_rows = (col("GEO") == "Alberta") & (
    col(industry_column_name) == industry_filter_value
)

# Project the three columns of interest, renaming and casting in one pass:
#   REF_DATE -> date      (parsed with the "yyyy-MM" pattern)
#   GEO      -> geo
#   VALUE    -> avg_wage  (cast to double)
silver_df = bronze_df.where(alberta_industry_rows).select(
    to_date(col("REF_DATE"), "yyyy-MM").alias("date"),
    col("GEO").alias("geo"),
    col("VALUE").cast("double").alias("avg_wage"),
)

print(f"Writing to Silver table: {silver_table}")
# Overwrite mode replaces the existing contents of the Silver table on each run.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

print("✅ Successfully created Silver table: wages_silver")
# Read the table back so the displayed rows are what was actually persisted.
display(spark.table(silver_table))