In [0]:
%sql
-- List the databases (schemas) visible to this session.
SHOW DATABASES

databaseName
default
globalretail_bronze
globalretail_silver
retaildb


In [0]:
# Make globalretail_silver the current schema so the unqualified table names
# used by the statements that follow resolve inside it.
spark.sql("USE globalretail_silver")

# DDL for the curated customer table. Because of IF NOT EXISTS, an already
# existing silver_customers table is left untouched.
silver_customers_ddl = """
CREATE TABLE IF NOT EXISTS silver_customers (
    customer_id STRING,
    name STRING,
    email STRING,
    country STRING,
    customer_type STRING,
    registration_date DATE,
    age INT,
    gender STRING,
    total_purchases INT,
    customer_segment STRING,
    days_since_registration INT,
    last_updated TIMESTAMP
)
"""
spark.sql(silver_customers_ddl)

DataFrame[]

In [0]:
%sql
-- List the tables and temporary views in the current schema.
SHOW TABLES

database,tableName,isTemporary
globalretail_silver,silver_customers,False
,_sqldf,True


In [0]:
# Watermark for the incremental load: the newest last_updated value that is
# already stored in silver_customers.
# NOTE(review): last_updated is a processing-time value, while the later
# filter compares it with the bronze ingestion_timestamp -- confirm that no
# bronze row can become visible with an older ingestion_timestamp.
last_processed_df = spark.sql(
    "select max(last_updated) as last_processed from silver_customers"
)
watermark_row = last_processed_df.first()

# max() over an empty table comes back as None; in that case fall back to
# the sentinel date '1900-01-01'.
last_processed_timestamp = (
    '1900-01-01'
    if watermark_row['last_processed'] is None
    else watermark_row['last_processed']
)

In [0]:
# Register a session-scoped temporary view holding only the bronze customer
# rows whose ingestion_timestamp is later than the silver watermark.
# The watermark is interpolated into the statement text as a quoted literal.
bronze_incremental_view_sql = f"""
          CREATE OR REPLACE TEMPORARY VIEW bronze_incremental_customers AS
          SELECT * FROM globalretail_bronze.bronze_customer WHERE ingestion_timestamp > '{last_processed_timestamp}'
          """
spark.sql(bronze_incremental_view_sql)


DataFrame[]

In [0]:
%sql
-- Preview up to ten rows of the incremental bronze batch.
SELECT *
FROM bronze_incremental_customers
LIMIT 10

customer_id,name,email,country,customer_type,registration_date,age,gender,total_purchases,ingestion_timestamp
1,Customer 1,customer1@example.com,Australia,Regular,2011-05-15,22,Male,191,2025-04-08T13:21:45.393Z
2,Customer 2,customer2@example.com,France,Premium,2018-11-27,52,Other,145,2025-04-08T13:21:45.393Z
3,Customer 3,customer3@example.com,Canada,Premium,2015-10-01,32,Other,691,2025-04-08T13:21:45.393Z
4,Customer 4,customer4@example.com,USA,Premium,2011-01-19,70,Other,644,2025-04-08T13:21:45.393Z
5,Customer 5,customer5@example.com,Germany,Regular,2021-08-26,66,Other,508,2025-04-08T13:21:45.393Z
6,Customer 6,customer6@example.com,France,Premium,2015-03-02,20,Male,704,2025-04-08T13:21:45.393Z
7,Customer 7,customer7@example.com,China,Premium,2018-05-24,24,Female,892,2025-04-08T13:21:45.393Z
8,Customer 8,customer8@example.com,China,Regular,2023-10-02,26,Male,488,2025-04-08T13:21:45.393Z
9,Customer 9,customer9@example.com,Japan,Premium,2014-10-05,36,Other,30,2025-04-08T13:21:45.393Z
10,Customer 10,customer10@example.com,Brazil,Premium,2017-08-30,30,Male,959,2025-04-08T13:21:45.393Z


In [0]:
# Build the cleaned and enriched customer rows for this incremental batch.
#
# Data-quality rules (a row failing any of them is dropped; a NULL in the
# tested column also fails the predicate):
#   * email must be present (IS NOT NULL)
#   * age must be between 18 and 100, inclusive
#   * total_purchases must not be negative
#
# Derived columns:
#   * customer_segment        'High Value' when total_purchases > 10000,
#                             'Medium Value' when > 5000, else 'Low Value'
#   * days_since_registration days from registration_date to CURRENT_DATE()
#   * last_updated            CURRENT_TIMESTAMP() when the view is queried
#
# NOTE(review): the upper age bound used to be 120 while the stated rule was
# 100; the filter now follows the stated rule -- confirm 100 is the intended
# limit.
#
# Only the most recently ingested valid row per customer_id is kept. A batch
# that contains the same customer more than once would otherwise hand several
# source rows for one key to the downstream MERGE, which either fails (key
# already in the target) or inserts duplicates (key not yet in the target).
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW silver_incremental_customers AS
WITH valid_customers AS (
    SELECT
        customer_id,
        name,
        email,
        country,
        customer_type,
        registration_date,
        age,
        gender,
        total_purchases,
        ROW_NUMBER() OVER (
            PARTITION BY customer_id
            ORDER BY ingestion_timestamp DESC
        ) AS recency_rank
    FROM bronze_incremental_customers
    WHERE
        age BETWEEN 18 AND 100
        AND email IS NOT NULL
        AND total_purchases >= 0
)
SELECT
    customer_id,
    name,
    email,
    country,
    customer_type,
    registration_date,
    age,
    gender,
    total_purchases,
    CASE
        WHEN total_purchases > 10000 THEN 'High Value'
        WHEN total_purchases > 5000 THEN 'Medium Value'
        ELSE 'Low Value'
    END AS customer_segment,
    DATEDIFF(CURRENT_DATE(), registration_date) AS days_since_registration,
    CURRENT_TIMESTAMP() AS last_updated
FROM valid_customers
WHERE recency_rank = 1
""")

DataFrame[]

In [0]:
# Render the transformed incremental batch for a visual check before merging.
display(spark.table("silver_incremental_customers"))

customer_id,name,email,country,customer_type,registration_date,age,gender,total_purchases,customer_segment,days_since_registration,last_updated
1,Customer 1,customer1@example.com,Australia,Regular,2011-05-15,22,Male,191,Low Value,5077,2025-04-08T15:53:16.356Z
2,Customer 2,customer2@example.com,France,Premium,2018-11-27,52,Other,145,Low Value,2324,2025-04-08T15:53:16.356Z
3,Customer 3,customer3@example.com,Canada,Premium,2015-10-01,32,Other,691,Low Value,3477,2025-04-08T15:53:16.356Z
4,Customer 4,customer4@example.com,USA,Premium,2011-01-19,70,Other,644,Low Value,5193,2025-04-08T15:53:16.356Z
5,Customer 5,customer5@example.com,Germany,Regular,2021-08-26,66,Other,508,Low Value,1321,2025-04-08T15:53:16.356Z
6,Customer 6,customer6@example.com,France,Premium,2015-03-02,20,Male,704,Low Value,3690,2025-04-08T15:53:16.356Z
7,Customer 7,customer7@example.com,China,Premium,2018-05-24,24,Female,892,Low Value,2511,2025-04-08T15:53:16.356Z
8,Customer 8,customer8@example.com,China,Regular,2023-10-02,26,Male,488,Low Value,554,2025-04-08T15:53:16.356Z
9,Customer 9,customer9@example.com,Japan,Premium,2014-10-05,36,Other,30,Low Value,3838,2025-04-08T15:53:16.356Z
10,Customer 10,customer10@example.com,Brazil,Premium,2017-08-30,30,Male,959,Low Value,2778,2025-04-08T15:53:16.356Z


In [0]:
# Upsert the incremental batch into silver_customers, keyed on customer_id:
# a row whose key already exists in the target is overwritten with all source
# columns, and a row with a new key is inserted.
merge_customers_sql = """
MERGE INTO silver_customers target
USING silver_incremental_customers source
ON target.customer_id = source.customer_id
WHEN MATCHED THEN
    UPDATE SET *
WHEN NOT MATCHED THEN
    INSERT *
"""
spark.sql(merge_customers_sql)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]