# Gold Layer – Dimensional Modeling (PySpark)

This notebook builds Gold layer dimension and fact tables
using curated Silver tables, following a Star Schema design.


In [0]:
# Echo the session object so the cell output shows that a SparkSession is
# available before any table is read.
spark


<pyspark.sql.connect.session.SparkSession at 0xff728e1e1730>

In [0]:
# Select the project catalog first, then the schema inside it, so that the
# unqualified table names used in the rest of the notebook resolve there.
catalog_stmt = "USE CATALOG pyspark_dataware_house_project"
schema_stmt = "USE SCHEMA default"

spark.sql(catalog_stmt)
spark.sql(schema_stmt)


DataFrame[]

In [0]:
from pyspark.sql.functions import coalesce, col, lit, row_number, when
from pyspark.sql.window import Window


In [0]:
# Silver inputs sourced from the CRM system: customers, products, sales lines.
silver_cust_df, silver_prd_df, silver_sales_df = (
    spark.table(table_name)
    for table_name in (
        "silver_crm_cust_info",
        "silver_crm_prd_info",
        "silver_crm_sale_details",
    )
)

# Silver inputs sourced from the ERP system: customer extras, locations,
# product categories.
erp_cust_df, erp_loc_df, erp_cat_df = (
    spark.table(table_name)
    for table_name in (
        "silver_erp_cust_az12",
        "silver_erp_loc_a101",
        "silver_erp_px_cat_g1v2",
    )
)


In [0]:
# Ordering used to number customers for the surrogate key. There is no
# partitionBy, so the numbering runs across the whole DataFrame.
cust_window = Window.orderBy(col("cst_id"))


In [0]:
# Customer dimension.
#
# Starts from the CRM customer table ("ci") and left-joins two ERP tables on
# the customer key: "ca" (silver_erp_cust_az12) supplies gen and bdate, "la"
# (silver_erp_loc_a101) supplies cntry. Left joins keep every CRM customer even
# when the ERP side has no matching row.
#
# customer_key is a surrogate key generated with row_number() over cust_window.
gold_dim_customers_df = (
    silver_cust_df.alias("ci")
    .join(
        erp_cust_df.alias("ca"),
        col("ci.cst_key") == col("ca.cid"),
        "left",
    )
    .join(
        erp_loc_df.alias("la"),
        col("ci.cst_key") == col("la.cid"),
        "left",
    )
    .select(
        row_number().over(cust_window).alias("customer_key"),
        col("ci.cst_id").alias("customer_id"),
        col("ci.cst_key").alias("customer_number"),
        col("ci.cst_firstname").alias("first_name"),
        col("ci.cst_lastname").alias("last_name"),
        col("la.cntry").alias("country"),
        col("ci.cst_marital_status").alias("marital_status"),
        # Gender: the CRM value wins whenever it is a real value. Otherwise
        # (CRM is "n/a" or NULL, since a NULL comparison is not true) use the
        # ERP value, and fall back to the explicit "n/a" sentinel so a customer
        # unknown in both systems never ends up with a NULL gender.
        when(col("ci.cst_gndr") != "n/a", col("ci.cst_gndr"))
        .otherwise(coalesce(col("ca.gen"), lit("n/a")))
        .alias("gender"),
        col("ca.bdate").alias("birthdate"),
        col("ci.cst_create_date").alias("create_date"),
    )
)




In [0]:
# Persist the customer dimension, replacing any previous contents of the table.
gold_dim_customers_df.write.saveAsTable("gold_dim_customers", mode="overwrite")




In [0]:
# Ordering used to number products for the surrogate key: start date first,
# then product key. There is no partitionBy, so the numbering runs across the
# whole DataFrame.
prd_window = Window.orderBy(col("prd_start_dt"), col("prd_key"))


In [0]:
# Output columns of the product dimension. "pn" is the CRM product table and
# "pc" is the ERP category table; product_key is a surrogate key generated
# with row_number() over prd_window.
product_dim_columns = [
    row_number().over(prd_window).alias("product_key"),
    col("pn.prd_id").alias("product_id"),
    col("pn.prd_key").alias("product_number"),
    col("pn.prd_nm").alias("product_name"),
    col("pn.cat_id").alias("category_id"),
    col("pc.cat").alias("category"),
    col("pc.subcat").alias("subcategory"),
    col("pc.maintenance"),
    col("pn.prd_cost").alias("cost"),
    col("pn.prd_line").alias("product_line"),
    col("pn.prd_start_dt").alias("start_date"),
]

# Product dimension: enrich products with their category (left join, so
# products without a category row are kept) and keep only rows whose end date
# is NULL.
gold_dim_products_df = (
    silver_prd_df.alias("pn")
    .join(
        erp_cat_df.alias("pc"),
        on=col("pn.cat_id") == col("pc.id"),
        how="left",
    )
    .where(col("pn.prd_end_dt").isNull())
    .select(*product_dim_columns)
)




In [0]:
# Persist the product dimension, replacing any previous contents of the table.
gold_dim_products_df.write.saveAsTable("gold_dim_products", mode="overwrite")




In [0]:
# Sales fact.
#
# The surrogate keys are looked up from the persisted Gold dimension tables
# rather than from the in-memory gold_dim_*_df DataFrames. Those DataFrames are
# lazy, so joining them here would re-run their joins and row_number() windows.
# Besides the extra cost, any tie in the window ordering could number rows
# differently from what was saved, leaving the fact with keys that do not
# match the dimension tables.
# Only the key and the natural-key column needed for the lookup are read.
dim_products_df = spark.table("gold_dim_products").select(
    "product_key", "product_number"
)
dim_customers_df = spark.table("gold_dim_customers").select(
    "customer_key", "customer_id"
)

# Left joins keep every sales line; a line whose product or customer is not in
# the dimension gets a NULL key instead of being dropped.
gold_fact_sales_df = (
    silver_sales_df.alias("sd")
    .join(
        dim_products_df.alias("pr"),
        col("sd.sls_prd_key") == col("pr.product_number"),
        "left",
    )
    .join(
        dim_customers_df.alias("cu"),
        col("sd.sls_cust_id") == col("cu.customer_id"),
        "left",
    )
    .select(
        col("sd.sls_ord_num").alias("order_number"),
        col("pr.product_key"),
        col("cu.customer_key"),
        col("sd.sls_order_dt").alias("order_date"),
        col("sd.sls_ship_dt").alias("shipping_date"),
        col("sd.sls_due_dt").alias("due_date"),
        col("sd.sls_sales").alias("sales_amount"),
        col("sd.sls_quantity").alias("quantity"),
        col("sd.sls_price").alias("price"),
    )
)




In [0]:
# Persist the sales fact, replacing any previous contents of the table.
gold_fact_sales_df.write.saveAsTable("gold_fact_sales", mode="overwrite")




## Gold Layer Completion

- Built Star Schema (Dimensions & Fact)
- Applied business-ready joins and enrichments
- Gold tables ready for analytics and BI reporting
