# Silver Layer ETL
With the new data model in place, it is time to ingest the original data into the new tables.

In [0]:
# Imports
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id

In [0]:
%sql
-- Preview 10 rows of the raw bronze table before any transformation.
SELECT *
FROM bronze.raw_data
LIMIT 10

In [0]:
%skip
%sql
-- Disabled cell (%skip): SQL variant of the customer dimension build.
-- Creates or replaces silver.dim_customer from bronze.raw_data, casting Income
-- to INT and keeping only rows whose Income is not NULL.
-- NOTE(review): a later Python cell writes "silver.DIM_Customer" with a different
-- column set (Year_Birth, Complain; no ID, DtCustomer or Recency). If table names
-- resolve case-insensitively, both cells target the same table -- confirm which
-- definition is the intended one before re-enabling this cell.
CREATE OR REPLACE TABLE silver.dim_customer
AS
SELECT
    ID,
    CAST(Income AS INT)            AS Income,
    Education,
    Marital_Status,
    Kidhome,
    Teenhome,
    DtCustomer,
    Recency
FROM bronze.raw_data
WHERE Income IS NOT NULL;

# Cleaning
Before distributing the data from the bronze layer into the new silver-layer tables, we check whether the data needs cleaning.

In [0]:
# Load the bronze source table; the cleaning checks and the silver build read from it.
BRONZE_TABLE = "bronze.raw_data"
df_silver = spark.table(BRONZE_TABLE)

In [0]:
# Duplicate check: group rows by every column and keep only the groups
# that contain more than one identical row.
all_columns = df_silver.columns
row_counts = df_silver.groupBy(*all_columns).count()
duplicates = row_counts.where("count > 1")
display(duplicates)

# Remove duplicates (currently disabled)
#df_bronze = df_silver.dropDuplicates()

In [0]:
# Check for NAs: count the null values in every column.
# isNull() yields a boolean column, and groupBy().sum() with no arguments only
# aggregates numeric columns, so each flag is cast to int (1 = null, 0 = not null)
# before summing; without the cast no per-column counts are produced.
nas = (
    df_silver
    .select([col(c).isNull().cast("int").alias(c) for c in df_silver.columns])
    .groupBy()
    .sum()
)
display(nas)

# Remove NAs (currently disabled)
#df_bronze = df_silver.dropna()

# Data Ingestion
Load the data from the cleaned DataFrame into the new silver-layer tables.

In [0]:
%skip
# Disabled cell (%skip): draft selection of the sales fact columns.
# NOTE(review): `df_bronze` is only assigned in commented-out lines in the visible
# part of this notebook, so this cell would presumably fail with NameError if
# re-enabled as-is -- confirm where `df_bronze` is meant to come from.
# NOTE(review): `.shape` is a pandas attribute; this assumes `df_bronze` is a pandas
# DataFrame that already carries the fk_* key columns -- TODO confirm.
df_sales = pd.DataFrame()  # placeholder; overwritten by the assignment just below
df_sales = df_bronze[[
  'ID','fk_customer','fk_promo',
  'fk_date','MntWines','MntFruits',
  'MntMeatProducts','MntFishProducts',
  'MntSweetProducts','MntGoldProds','NumDealsPurchases','NumWebPurchases',
  'NumCatalogPurchases','NumStorePurchases',
  'NumWebVisitsMonth','Recency'
]]
# Display the shape of the selection as the cell output.
df_sales.shape

In [0]:
# Columns that make up the customer dimension.
customer_columns = [
    "Year_Birth",
    "Education",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "Income",
    "Complain",
]

# Project the customer attributes out of the source DataFrame.
# NOTE: de-duplication (.dropDuplicates()) and the `id_customer` surrogate key
# (monotonically_increasing_id()) are currently disabled, so the dimension
# receives one row per source row and has no key column.
df_customer = df_silver.select(*customer_columns)

# Save DIM_Customer as a Delta table, overwriting any existing contents.
(
    df_customer.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.DIM_Customer")
)

In [0]:
%sql
-- Spot-check 10 rows of the freshly written customer dimension.
SELECT *
FROM silver.DIM_Customer
LIMIT 10