#Silver Layer ETL
With the new modeling done, time to ingest the original data into the new tables.

In [0]:
# Imports
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, weekofyear, monotonically_increasing_id

# Cleaning
Before distributing the data from the bronze layer into the new silver layer tables, we'll check if the data needs cleaning

In [0]:
# For later...
df_bronze = spark.table("bronze.raw_data")

In [0]:
# We will work with data frames for ease of use before converting them to delta tables
df_silver = spark.table("bronze.raw_data")

In [0]:
# Check for duplicates
duplicates = (
    df_silver
    .groupBy(df_silver.columns)
    .count()
    .filter("count > 1")
)
display(duplicates)

# Remove duplicates 
#df_silver = df_silver.dropDuplicates()     

ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,count


In [0]:
# Check for NAs
nas = df_silver.select([col(c).isNull().alias(c) for c in df_silver.columns]).groupBy().sum()
display(nas)

# Remove NAs
# Add '#' when no NAs are left
df_silver = df_silver.dropna()

In [0]:
# Total amount of rows should be consistent going forward
display(df_silver.count())

2216

Checking the different columns

In [0]:
display(df_silver.select("Education").distinct())
# Seems fine.

Education
Graduation
PhD
Master
Basic
2n Cycle


In [0]:
display(df_silver.select("Marital_Status").distinct())

Marital_Status
Single
Together
Married
Divorced
Widow
Alone
Absurd
YOLO


Out of the gate, I believe "Absurd" and "YOLO" do not belong to a cleaned dataset, and should be made part of a "Unknown" category for the time being. Meanwhile, "Alone" clearly belongs to the category "Single".

In [0]:
df_silver = df_silver.replace({"Absurd": "Unknown", "YOLO": "Unknown"}, subset=["Marital_Status"])

In [0]:
df_silver = df_silver.replace({"Alone": "Single"}, subset=["Marital_Status"])

In [0]:
display(df_silver.select("Marital_Status").distinct())

Marital_Status
Single
Together
Married
Divorced
Widow
Unknown


# Data Ingestion
From the cleaned dataframe into the new tables

In [0]:
# Branch 2
df_customer = (
    df_silver
    .select(
        "Year_Birth",
        "Education",
        "Marital_Status",
        "Kidhome",
        "Teenhome",
        "Income",
        "Complain"
    )
    .dropDuplicates()   
    )


In [0]:
df_customer.count()

2014

In [0]:
display(df_customer.select("Marital_Status").distinct())

Marital_Status
Single
Together
Married
Divorced
Widow
Unknown


In [0]:
# Checking the schema for conflicts
print(spark.table("bronze.raw_data").printSchema())
print(spark.table("silver.dim_customer").printSchema())

root
 |-- ID: long (nullable = true)
 |-- Year_Birth: long (nullable = true)
 |-- Education: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income: long (nullable = true)
 |-- Kidhome: long (nullable = true)
 |-- Teenhome: long (nullable = true)
 |-- Dt_Customer: date (nullable = true)
 |-- Recency: long (nullable = true)
 |-- MntWines: long (nullable = true)
 |-- MntFruits: long (nullable = true)
 |-- MntMeatProducts: long (nullable = true)
 |-- MntFishProducts: long (nullable = true)
 |-- MntSweetProducts: long (nullable = true)
 |-- MntGoldProds: long (nullable = true)
 |-- NumDealsPurchases: long (nullable = true)
 |-- NumWebPurchases: long (nullable = true)
 |-- NumCatalogPurchases: long (nullable = true)
 |-- NumStorePurchases: long (nullable = true)
 |-- NumWebVisitsMonth: long (nullable = true)
 |-- AcceptedCmp3: long (nullable = true)
 |-- AcceptedCmp4: long (nullable = true)
 |-- AcceptedCmp5: long (nullable = true)
 |-- AcceptedCmp1: long (nullab

In [0]:
# Saving DIM_Customer as a delta table
df_customer.write.format("delta") \
.mode("overwrite") \
.saveAsTable("silver.dim_customer") 

Seems its going to be a large dimensional table, since there are so many permutations of categorical values

### DIM_Promo

In [0]:
df_promo = (
    df_silver
    .select(
        "AcceptedCmp1",
        "AcceptedCmp2",
        "AcceptedCmp3",
        "AcceptedCmp4",
        "AcceptedCmp5",
        "Response"
    )
    .dropDuplicates()
)


In [0]:
# Checking the schema for conflicts
print(spark.table("bronze.raw_data").printSchema())
print(spark.table("silver.dim_promo").printSchema())

root
 |-- ID: long (nullable = true)
 |-- Year_Birth: long (nullable = true)
 |-- Education: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income: long (nullable = true)
 |-- Kidhome: long (nullable = true)
 |-- Teenhome: long (nullable = true)
 |-- Dt_Customer: date (nullable = true)
 |-- Recency: long (nullable = true)
 |-- MntWines: long (nullable = true)
 |-- MntFruits: long (nullable = true)
 |-- MntMeatProducts: long (nullable = true)
 |-- MntFishProducts: long (nullable = true)
 |-- MntSweetProducts: long (nullable = true)
 |-- MntGoldProds: long (nullable = true)
 |-- NumDealsPurchases: long (nullable = true)
 |-- NumWebPurchases: long (nullable = true)
 |-- NumCatalogPurchases: long (nullable = true)
 |-- NumStorePurchases: long (nullable = true)
 |-- NumWebVisitsMonth: long (nullable = true)
 |-- AcceptedCmp3: long (nullable = true)
 |-- AcceptedCmp4: long (nullable = true)
 |-- AcceptedCmp5: long (nullable = true)
 |-- AcceptedCmp1: long (nullab

In [0]:
display(df_promo.count())

35

In [0]:
# Saving DIM_Promo as a delta table
df_promo.write.format("delta") \
.mode("overwrite") \
.saveAsTable("silver.dim_promo") 

### DIM_Calendar

In [0]:
df_date = (
    df_silver
    .select(col("Dt_Customer").alias("full_date"))
    .dropDuplicates()  
    .withColumn("year", year("full_date"))
    .withColumn("month", month("full_date"))
    .withColumn("day", dayofmonth("full_date"))
    .withColumn("week_of_year", weekofyear("full_date"))
)


In [0]:
display(df_date.count())

662

For clariry, this means there are 662 distinct dates in the dataset, allowing us to avoid storing duplicate dates and reducing computational overhead.

In [0]:
# Checking the schema for conflicts
#print(spark.table("bronze.raw_data").printSchema())
print(df_date.printSchema())
print(spark.table("silver.dim_calendar").printSchema())

root
 |-- full_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)

None
root
 |-- id_date: long (nullable = false)
 |-- full_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)

None


In [0]:
# Saving DIM_Calendar as a delta table
df_date.write.format("delta") \
.mode("overwrite") \
.saveAsTable("silver.dim_calendar") 

### FACT_Sales

Only at the end do we ingest the data into the fact table because it will reference all of the other dimensional tables!

In [0]:
%sql
CREATE OR REPLACE TABLE silver.fact_sales
USING DELTA
AS
SELECT
    c.id_customer        AS fk_customer,
    p.id_promo           AS fk_promo,
    d.id_date            AS fk_date,

    b.MntWines,
    b.MntFruits,
    b.MntMeatProducts,
    b.MntFishProducts,
    b.MntSweetProducts,
    b.MntGoldProds,

    b.NumDealsPurchases,
    b.NumWebPurchases,
    b.NumCatalogPurchases,
    b.NumStorePurchases,
    b.NumWebVisitsMonth,
    b.Recency

FROM bronze.raw_data b

LEFT JOIN silver.dim_customer c
    ON  b.Income          = c.Income
    AND b.Kidhome         = c.Kidhome
    AND b.Teenhome        = c.Teenhome
    AND b.Education       = c.Education
    AND b.Marital_Status  = c.Marital_Status
    AND b.Complain        = c.Complain

LEFT JOIN silver.DIM_Promo p
    ON  b.AcceptedCmp1 = p.AcceptedCmp1
    AND b.AcceptedCmp2 = p.AcceptedCmp2
    AND b.AcceptedCmp3 = p.AcceptedCmp3
    AND b.AcceptedCmp4 = p.AcceptedCmp4
    AND b.AcceptedCmp5 = p.AcceptedCmp5
    AND b.Response     = p.Response

LEFT JOIN silver.DIM_Calendar d
    ON b.Dt_Customer = d.full_date;

num_affected_rows,num_inserted_rows


In [0]:
display(spark.table("silver.fact_sales").count())

2242

Interesting...
There should not be more than  2216 rows after cleaning the data.

In [0]:
display(spark.table("silver.fact_sales").limit(15))

fk_customer,fk_promo,fk_date,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Recency
1750.0,6,512,635,88,546,172,88,88,3,8,10,4,7,58
1179.0,9,554,11,1,6,2,1,6,2,1,1,2,5,38
665.0,9,233,426,49,127,111,21,42,1,8,2,10,4,26
58.0,9,244,11,4,20,10,3,5,2,2,0,4,6,26
1585.0,9,229,173,43,118,46,27,15,5,5,3,6,5,94
1288.0,9,146,520,42,98,0,42,14,2,6,4,10,6,16
1996.0,9,398,235,65,164,50,49,27,4,7,3,7,6,34
613.0,9,604,76,10,56,3,1,23,2,4,0,4,8,32
326.0,6,247,14,0,24,3,3,2,1,3,0,2,9,19
1941.0,5,548,28,0,6,1,1,13,1,1,0,0,20,68


In [0]:
# Check for duplicates in the fact table
duplicates = (
    spark.table("silver.fact_sales")
    .groupBy(spark.table("silver.fact_sales").columns)
    .count()
    .filter("count > 1")
)
display(duplicates)

# Remove duplicates from the fact table
df_fact_sales = spark.table("silver.fact_sales").dropDuplicates()
df_fact_sales.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver.fact_sales")

fk_customer,fk_promo,fk_date,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Recency,count
217,22,544,1006,22,115,59,68,45,1,7,6,12,3,23,2
1427,9,182,84,5,38,150,12,28,2,4,1,6,7,20,2
79,9,164,384,0,102,21,32,5,3,6,2,9,4,0,2
182,9,558,270,3,27,39,6,99,7,7,1,5,8,69,2
418,16,641,684,100,801,21,66,0,1,6,9,10,2,96,2
1778,9,147,8,4,10,2,2,4,2,3,0,3,5,34,2
235,9,122,112,17,44,34,22,89,1,2,5,3,3,56,2
678,9,260,55,0,6,2,0,4,2,1,1,3,5,99,2
1693,12,16,437,8,206,160,49,42,2,7,10,5,6,53,2
1180,9,358,9,0,6,3,1,3,1,0,0,3,5,4,2


Not ideal, but we can clean the dataset once again by removing the 176 duplicates.

In [0]:
display(spark.table("silver.fact_sales").count())

2060