In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("StagingToDimension").getOrCreate()

# Load source data (Product Table) from staging tables
product_df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/tables/production/Production_product.csv")

# Load lookup tables for enrichment
product_model_df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/tables/production/Production_productmodel.csv")
product_descr_df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/tables/production/Production_productDescription.csv")


In [0]:
product_df = product_df.withColumnRenamed("Name", "product_name")
product_df = product_df.withColumnRenamed("Name", "product_name")


In [0]:
product_df.show(2)


+---------+---------------+-------------+--------+-----------------+-----+----------------+------------+------------+---------+----+-------------------+---------------------+------+-----------------+-----------+-----+-----+--------------------+--------------+--------------------+-----------+----------------+--------------------+--------------------+
|ProductID|   product_name|ProductNumber|MakeFlag|FinishedGoodsFlag|Color|SafetyStockLevel|ReorderPoint|StandardCost|ListPrice|Size|SizeUnitMeasureCode|WeightUnitMeasureCode|Weight|DaysToManufacture|ProductLine|Class|Style|ProductSubcategoryID|ProductModelID|       SellStartDate|SellEndDate|DiscontinuedDate|             rowguid|        ModifiedDate|
+---------+---------------+-------------+--------+-----------------+-----+----------------+------------+------------+---------+----+-------------------+---------------------+------+-----------------+-----------+-----+-----+--------------------+--------------+--------------------+-----------+----

In [0]:
product_model_df.show(2)

+--------------+------------+------------------+------------+--------------------+--------------------+
|ProductModelID|        Name|CatalogDescription|Instructions|             rowguid|        ModifiedDate|
+--------------+------------+------------------+------------+--------------------+--------------------+
|             1|Classic Vest|              NULL|        NULL|29321D47-1E4C-4AA...|2013-04-30 00:00:...|
|             2| Cycling Cap|              NULL|        NULL|474FB654-3C96-4CB...|2011-05-01 00:00:...|
+--------------+------------+------------------+------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
product_descr_df.show(2)

+--------------------+--------------------+--------------------+--------------------+
|ProductDescriptionID|         Description|             rowguid|        ModifiedDate|
+--------------------+--------------------+--------------------+--------------------+
|                   3|     Chromoly steel.|301EED3A-1A82-485...|2013-04-30 00:00:...|
|                   4|Aluminum alloy cu...|DFEBA528-DA11-465...|2013-04-30 00:00:...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
# Convert necessary columns to appropriate data types for joining and transformations
product_df = product_df.withColumn("ProductID", col("ProductID").cast("int"))
product_df = product_df.withColumn("ReorderPoint", col("ReorderPoint").cast("int"))
product_df = product_df.withColumn("StandardCost", col("StandardCost").cast("float"))

product_model_df = product_model_df.withColumn("ProductModelID", col("ProductModelID").cast("int"))
product_descr_df = product_descr_df.withColumn("ProductDescriptionID", col("ProductDescriptionID").cast("int"))

In [0]:
# Cast columns for joining if needed (e.g., ProductModelID as Integer)
product_df = product_df.withColumn("ProductModelID", col("ProductModelID").cast("int"))
product_model_df = product_model_df.withColumn("ProductModelID", col("ProductModelID").cast("int"))

# Step 1: Join product_df with product_model_df on ProductModelID
enriched_df = product_df.join(
    product_model_df.select("ProductModelID", "Name", "CatalogDescription"), 
    on="ProductModelID", 
    how="left"
)

In [0]:
print("Columns in enriched_df:", enriched_df.columns)

Columns in enriched_df: ['ProductModelID', 'ProductID', 'product_name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag', 'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost', 'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WeightUnitMeasureCode', 'Weight', 'DaysToManufacture', 'ProductLine', 'Class', 'Style', 'ProductSubcategoryID', 'SellStartDate', 'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate', 'Name', 'CatalogDescription']


In [0]:

# Step 2: Select and rename columns for the dimension table
# Select and rename columns for the dimension table
dim_product_sale_df = enriched_df.select(
    col("ProductID").alias("product_id"),
    col("Name").alias("product_name"),                     # Using 'Name' as product_name
    col("CatalogDescription").alias("product_description"),  # Using 'CatalogDescription' as product_description
    col("ReorderPoint").alias("reorder_point"),
    col("StandardCost").alias("standard_cost")
)



In [0]:

# Write the final enriched data to the dimension table (saving as a CSV for illustration)
dim_product_sale_df.write.format("csv").mode("overwrite").option("header", "true").save("dbfs:/FileStore/tables/dimension/dim_product_sale")

# Show a sample of the final dimension table
dim_product_sale_df.show(5)

# Stop the Spark session
#spark.stop()

+----------+------------+-------------------+-------------+-------------+
|product_id|product_name|product_description|reorder_point|standard_cost|
+----------+------------+-------------------+-------------+-------------+
|         1|        null|               null|          750|          0.0|
|         2|        null|               null|          750|          0.0|
|         3|        null|               null|          600|          0.0|
|         4|        null|               null|          600|          0.0|
|       316|        null|               null|          600|          0.0|
+----------+------------+-------------------+-------------+-------------+
only showing top 5 rows

