Task 3

Overview: Creation of the gold layer. Create a standard star schema based on the
specifications below:
- Consider only type 2 dimension fields when calculating history for the dim.
- Each row should have effective start and effective end dates, which will
represent the time the row was active. Base them on the silver layer
consume_datetime;
- Make sure the referential integrity between the fact and dimensions is kept
- No duplicates should be kept, and history should be only for specified SCD2
fields.
- Use xxhash64 function for hashing, where applicable.
- Dimension details:
dim_customers
Field name Description Dimension Type
cust_sk surrogate_key - hashed cust_nk
cust_nk natural key (customer_id)
cust_first_name 1
cust_last_name 1
cust_address_country_id 2
cust_address_state_province 2
cust_address_city 2
cust_address_postal_code 2
cust_address_street_address 2
cust_phone_number 1
cust_email 1
account_mgr_id 1
date_of_birth 1
marital_status 2
Gender 1
effective_from effective from date for type
2 dimensions based on the
consume_datetime in silver
layer
effective_to effective to date for type 2
dimensions based on the
consume_datetime in silver
layer
Inserted_datetime when row was inserted
Updated_datetime when row was last updated
dim_products
Field name Description Dimension Type
product_sk surrogate_key - hashed product_id
product_nk natural key (product_id)
product_name 1
category_name 2
weight_class 1
product_status 2
list_price 2
min_price 1
effective_from effective from date for type
2 dimensions based on the
consume_timestamp in
silver layer
effective_to effective to date for type 2
dimensions based on the
consume_timestamp in
silver layer
Inserted_datetime when row was inserted
Updated_datetime when row was last updated
Fact orders
Field name Description
order_sk surrogate_key - hashed order_nk
customer_sk dimension surrogate key
product_sk dimension surrogate key
order_nk natural key - Concatenation of
order_id,line_item_id,customer_id,product_id
split by pipes
customer_nk dimension natural key
product_nk dimension natural key
order_id
line_item_id
order_date
order_mode
order_status
unit_price
quantity
Inserted_datetime when row was inserted

In [0]:
#################
##### Gold ######
#################


###################
## dim_customers ##
###################

from functools import reduce
from operator import or_

from pyspark.sql import Window
from pyspark.sql.functions import (
    coalesce,
    col,
    current_timestamp,
    lag,
    last,
    lead,
    lit,
    row_number,
    xxhash64,
)

# Build the gold dim_customers table as an SCD2 dimension from the silver
# customers table.
#
# One output row is produced per (customer, distinct run of SCD2 attribute
# values). Type 1 attributes are overwritten with the customer's latest value
# on every version. effective_from / effective_to are derived from the silver
# consume_datetime.
# NOTE(review): assumes consume_datetime orders the versions of a customer;
# rows of one customer that share the same consume_datetime but differ in
# their attributes are ordered arbitrarily -- confirm against the silver load.

# Attributes whose change opens a new dimension version (type 2 in the spec).
CUST_SCD2_COLS = [
    "cust_address_country_id",
    "cust_address_state_province",
    "cust_address_city",
    "cust_address_postal_code",
    "cust_address_street_address",
    "marital_status",
]

# Attributes that are overwritten in place (type 1 in the spec).
CUST_SCD1_COLS = [
    "cust_first_name",
    "cust_last_name",
    "cust_phone_number",
    "cust_email",
    "account_mgr_id",
    "date_of_birth",
    "gender",
]

# Load silver customers
silver_customers_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_customers_exam")

# Project to the gold column names and remove exact duplicate rows.
dim_customers_df = silver_customers_df.select(
    col("customer_id").alias("cust_nk"),
    col("cust_first_name"),
    col("cust_last_name"),
    col("cust_address_country_id"),
    col("cust_address_state_province"),
    col("cust_address_city"),
    col("cust_address_postal_code"),
    col("cust_address_street_address"),
    col("phone_number").alias("cust_phone_number"),
    col("cust_email"),
    "account_mgr_id",
    "date_of_birth",
    "marital_status",
    "gender",
    "consume_datetime",
).dropDuplicates()

# Per-customer timeline, oldest row first.
cust_history_w = Window.partitionBy("cust_nk").orderBy("consume_datetime")
# Same timeline with a frame spanning the whole partition, so last() returns
# the value from the most recent row.
cust_full_history_w = cust_history_w.rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)

# Type 1: every version of a customer carries the most recent value.
for scd1_col in CUST_SCD1_COLS:
    dim_customers_df = dim_customers_df.withColumn(
        scd1_col, last(scd1_col).over(cust_full_history_w)
    )

# Type 2: a row starts a new version when it is the customer's first row or
# when any SCD2 attribute differs from the previous row. eqNullSafe treats
# NULL == NULL as "no change" and NULL vs. value as a change.
cust_scd2_changed = reduce(
    or_,
    [
        ~col(scd2_col).eqNullSafe(lag(scd2_col).over(cust_history_w))
        for scd2_col in CUST_SCD2_COLS
    ],
)
dim_customers_df = (
    dim_customers_df
    .withColumn("_row_seq", row_number().over(cust_history_w))
    .withColumn("_scd2_changed", cust_scd2_changed)
    .filter((col("_row_seq") == 1) | col("_scd2_changed"))
)

# Add SCD2 fields. A version ends where the next kept version begins; the
# current version gets the open-ended sentinel. The sentinel is cast to the
# type of consume_datetime so effective_from and effective_to are comparable.
cust_open_end = lit("9999-12-31").cast(
    dim_customers_df.schema["consume_datetime"].dataType
)
dim_customers_df = (
    dim_customers_df
    .withColumn("effective_from", col("consume_datetime"))
    .withColumn(
        "effective_to",
        coalesce(lead("consume_datetime").over(cust_history_w), cust_open_end),
    )
)

# Add surrogate key. Per the spec it is the hash of the natural key, so all
# versions of one customer share the same cust_sk.
dim_customers_df = dim_customers_df.withColumn("cust_sk", xxhash64("cust_nk"))

# Add timestamps
dim_customers_df = dim_customers_df.withColumn("inserted_datetime", current_timestamp())
dim_customers_df = dim_customers_df.withColumn("updated_datetime", current_timestamp())

# Final column order; consume_datetime and the helper columns are left out.
dim_customers_df = dim_customers_df.select(
    "cust_sk",
    "cust_nk",
    "cust_first_name",
    "cust_last_name",
    "cust_address_country_id",
    "cust_address_state_province",
    "cust_address_city",
    "cust_address_postal_code",
    "cust_address_street_address",
    "cust_phone_number",
    "cust_email",
    "account_mgr_id",
    "date_of_birth",
    "marital_status",
    "gender",
    "effective_from",
    "effective_to",
    "inserted_datetime",
    "updated_datetime",
)

# Save as gold table. overwriteSchema lets the overwrite replace a table that
# was previously written with a different effective_to type.
(
    dim_customers_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_customers_exam")
)


In [0]:
#################
##### Gold ######
#################

###################
## dim_products ##
###################

from functools import reduce
from operator import or_

from pyspark.sql import Window
from pyspark.sql.functions import (
    coalesce,
    col,
    current_timestamp,
    lag,
    last,
    lead,
    lit,
    row_number,
    xxhash64,
)

# Build the gold dim_products table as an SCD2 dimension from the silver
# products table.
#
# One output row is produced per (product, distinct run of SCD2 attribute
# values). Type 1 attributes are overwritten with the product's latest value
# on every version. effective_from / effective_to are derived from the silver
# consume_datetime.
# NOTE(review): assumes consume_datetime orders the versions of a product;
# rows of one product that share the same consume_datetime but differ in
# their attributes are ordered arbitrarily -- confirm against the silver load.

# Attributes whose change opens a new dimension version (type 2 in the spec).
PRODUCT_SCD2_COLS = ["category_name", "product_status", "list_price"]

# Attributes that are overwritten in place (type 1 in the spec).
PRODUCT_SCD1_COLS = ["product_name", "weight_class", "min_price"]

# Load silver products
silver_products_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_products_exam")

# Project to the gold column names and remove exact duplicate rows.
dim_products_df = silver_products_df.select(
    col("product_id").alias("product_nk"),
    col("product_name"),
    col("category_name"),
    col("weight_class"),
    col("product_status"),
    col("list_price"),
    col("min_price"),
    "consume_datetime",
).dropDuplicates()

# Per-product timeline, oldest row first.
product_history_w = Window.partitionBy("product_nk").orderBy("consume_datetime")
# Same timeline with a frame spanning the whole partition, so last() returns
# the value from the most recent row.
product_full_history_w = product_history_w.rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing
)

# Type 1: every version of a product carries the most recent value.
for scd1_col in PRODUCT_SCD1_COLS:
    dim_products_df = dim_products_df.withColumn(
        scd1_col, last(scd1_col).over(product_full_history_w)
    )

# Type 2: a row starts a new version when it is the product's first row or
# when any SCD2 attribute differs from the previous row. eqNullSafe treats
# NULL == NULL as "no change" and NULL vs. value as a change.
product_scd2_changed = reduce(
    or_,
    [
        ~col(scd2_col).eqNullSafe(lag(scd2_col).over(product_history_w))
        for scd2_col in PRODUCT_SCD2_COLS
    ],
)
dim_products_df = (
    dim_products_df
    .withColumn("_row_seq", row_number().over(product_history_w))
    .withColumn("_scd2_changed", product_scd2_changed)
    .filter((col("_row_seq") == 1) | col("_scd2_changed"))
)

# Add SCD2 fields. A version ends where the next kept version begins; the
# current version gets the open-ended sentinel. The sentinel is cast to the
# type of consume_datetime so effective_from and effective_to are comparable.
product_open_end = lit("9999-12-31").cast(
    dim_products_df.schema["consume_datetime"].dataType
)
dim_products_df = (
    dim_products_df
    .withColumn("effective_from", col("consume_datetime"))
    .withColumn(
        "effective_to",
        coalesce(lead("consume_datetime").over(product_history_w), product_open_end),
    )
)

# Add surrogate key. Per the spec it is the hash of the natural key, so all
# versions of one product share the same product_sk.
dim_products_df = dim_products_df.withColumn("product_sk", xxhash64("product_nk"))

# Add timestamps
dim_products_df = dim_products_df.withColumn("inserted_datetime", current_timestamp())
dim_products_df = dim_products_df.withColumn("updated_datetime", current_timestamp())

# Final column order. consume_datetime is not part of the specified
# dim_products layout, so it is left out together with the helper columns.
dim_products_df = dim_products_df.select(
    "product_sk",
    "product_nk",
    "product_name",
    "category_name",
    "weight_class",
    "product_status",
    "list_price",
    "min_price",
    "effective_from",
    "effective_to",
    "inserted_datetime",
    "updated_datetime",
)

# Save as gold table. overwriteSchema lets the overwrite replace a table that
# was previously written with the extra consume_datetime column or a
# different effective_to type.
(
    dim_products_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_products_exam")
)

In [0]:
#################
##### Gold ######
#################

###################
### fact_orders ###
###################

from pyspark.sql.functions import col, lit, xxhash64, current_timestamp, concat_ws

# Build the gold fact_orders table: one row per order line item, carrying the
# surrogate keys of the customer and product dimensions.

# Load silver order_items table containing line_item_id, product_id
silver_order_items_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_order_items_exam")

# Load silver orders table containing customer_id, order details
silver_orders_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_orders_exam")

# Load gold dims
dim_customers_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_customers_exam")
dim_products_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_gold_dim_products_exam")

# Reduce each dimension to one (surrogate key, natural key) pair per member.
# A dimension may hold several rows for the same natural key (e.g. SCD2
# versions); joining them un-deduplicated would multiply the fact rows.
# NOTE(review): relies on the surrogate key being the same on every row of a
# natural key (it is written as a hash of the natural key) -- confirm if the
# dimension key definition changes.
customer_keys_df = dim_customers_df.select(
    col("cust_sk").alias("customer_sk"),
    "cust_nk",
).distinct()
product_keys_df = dim_products_df.select("product_sk", "product_nk").distinct()

# Join order_items with orders to get customer_id and order details
fact_orders_df = silver_order_items_df.join(
    silver_orders_df.drop("inserted_datetime"),  # Drop to avoid ambiguity, fresh timestamp will be added
    on="order_id",
    how="left"
)

# Build order_nk (natural key): order_id|line_item_id|customer_id|product_id
fact_orders_df = fact_orders_df.withColumn(
    "order_nk",
    concat_ws("|",
              col("order_id").cast("string"),
              col("line_item_id").cast("string"),
              col("customer_id").cast("string"),
              col("product_id").cast("string"))
)

# Build surrogate key
fact_orders_df = fact_orders_df.withColumn("order_sk", xxhash64("order_nk"))

# Look up the dimension surrogate keys on the natural keys. Left joins keep
# line items whose customer or product is missing from the dimension (their
# surrogate key is NULL).
fact_orders_df = fact_orders_df.join(
    customer_keys_df,
    col("customer_id") == col("cust_nk"),
    how="left"
)
fact_orders_df = fact_orders_df.join(
    product_keys_df,
    col("product_id") == col("product_nk"),
    how="left"
)

# Select the final columns as listed in the specification (the customer key
# is named customer_sk there), remove exact duplicate rows, then stamp the
# load time last so it does not defeat the de-duplication.
fact_orders_df = (
    fact_orders_df.select(
        "order_sk",
        "customer_sk",
        "product_sk",
        "order_nk",
        col("customer_id").alias("customer_nk"),
        col("product_id").alias("product_nk"),
        "order_id",
        "line_item_id",
        "order_date",
        "order_mode",
        "order_status",
        "unit_price",
        "quantity",
    )
    .dropDuplicates()
    .withColumn("inserted_datetime", current_timestamp())
)

# Save as gold fact table. overwriteSchema lets the overwrite replace a table
# that was previously written with the cust_sk column name.
(
    fact_orders_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_gold_fact_orders_exam")
)
