Task 2
Overview: Standardize and load the bronze data into silver layer.
- Perform the following transformations where applicable:
o All fields to be named in snake_case, replace all dots with
underscores.
o Phone numbers to be separated by dashes (e.g. 123-321-567)
o All date and datetime fields to be loaded with the proper data type
(date & timestamp)
- For target schema use your buddy group schema
- Name the silver tables: <name>_silver_<orders/customers/etc.>. For
example: kirilovl_silver_orders
- In case of conflicts with already existing tables in your schema, add _exam
suffix
- Keep all rows, but no historization is required
- Include bronze ingest_datetime and rename it consume_datetime
- Exclude batch_id from source
- Add inserted_datetime to track the time when the row was loaded.
- Add deduplication to exclude duplicate rows on all business columns, keep
the first consumed record.

In [0]:
###################
##### Silver ######
###################

from pyspark.sql.functions import regexp_replace, col, concat, lit, substring, length, current_timestamp, to_date, row_number
from pyspark.sql.window import Window


###################
# Customers table #
###################

# Customers: bronze -> silver.
# Standardises column names, normalises phone numbers, casts date_of_birth to
# a date, adds load metadata and removes rows that are duplicated on every
# business column (keeping the first consumed one), then overwrites the
# silver Delta table.

# Load bronze customers table
bronze_customers_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_customers_exam")

# Rename all columns to snake_case in one projection: lower-case the name and
# replace '.' with '_' (e.g. CUST_ADDRESS.CITY -> cust_address_city)
bronze_customers_df = bronze_customers_df.toDF(
    *[col_name.lower().replace('.', '_') for col_name in bronze_customers_df.columns]
)

# Carry the bronze ingest timestamp over as consume_datetime and exclude batch_id
silver_customers_df = (
    bronze_customers_df
    .withColumnRenamed("ingest_datetime", "consume_datetime")
    .drop("batch_id")
)

# phone_number conversion, built from column expressions so that no temporary
# columns have to be added and dropped again
# 1. Remove all non-digit characters except +
phone_digits = regexp_replace(col("phone_number"), "[^\\d+]", "")
# 2. First two characters are taken as the country code (assumes always '+1')
phone_country_code = substring(phone_digits, 1, 2)
# 3. The next (up to) ten characters are the rest of the phone number
phone_local_part = substring(phone_digits, 3, 10)
# 4. Format as XXX-XXX-XXXX; values that do not contain ten consecutive digits
#    are left as they are by the regex
phone_local_dashed = regexp_replace(phone_local_part, "(\\d{3})(\\d{3})(\\d{4})", "$1-$2-$3")
# 5. Combine country code and formatted number, e.g. "+1 317-123-4104"
silver_customers_df = silver_customers_df.withColumn(
    "phone_number",
    concat(phone_country_code, lit(" "), phone_local_dashed),
)

# Cast date_of_birth to date type
# NOTE(review): no explicit format is passed, so the bronze strings must be in
# a format to_date parses by default - confirm against the source data
silver_customers_df = silver_customers_df.withColumn("date_of_birth", to_date(col("date_of_birth")))

# Add inserted_datetime to track when the row was loaded
silver_customers_df = silver_customers_df.withColumn("inserted_datetime", current_timestamp())

# Deduplicate on ALL business columns, keeping the first consumed record.
# Partitioning on customer_id alone would also drop rows that share an id but
# differ in other attributes, which contradicts the "keep all rows" rule.
# Technical/metadata columns are not part of the duplicate key.
technical_cols = {"consume_datetime", "inserted_datetime", "_rescued_data"}
business_cols = [c for c in silver_customers_df.columns if c not in technical_cols]
window_spec = Window.partitionBy(*business_cols).orderBy("consume_datetime")
silver_customers_df = (
    silver_customers_df
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Save to silver table (full overwrite, no historization)
(silver_customers_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_customers_exam")
)


In [0]:
# Print the bronze customers schema (column names and data types) to see
# which columns the silver load above has to rename
(
    spark.sql("""
    DESCRIBE de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_customers_exam
""")
    .show(truncate=False)
)



+---------------------------+---------+-------+
|col_name                   |data_type|comment|
+---------------------------+---------+-------+
|CUSTOMER_ID                |string   |NULL   |
|CUST_FIRST_NAME            |string   |NULL   |
|CUST_LAST_NAME             |string   |NULL   |
|CUST_ADDRESS.COUNTRY_ID    |string   |NULL   |
|CUST_ADDRESS.STATE_PROVINCE|string   |NULL   |
|CUST_ADDRESS.CITY          |string   |NULL   |
|CUST_ADDRESS.POSTAL_CODE   |string   |NULL   |
|CUST_ADDRESS.STREET_ADDRESS|string   |NULL   |
|PHONE_NUMBER               |string   |NULL   |
|CUST_EMAIL                 |string   |NULL   |
|ACCOUNT_MGR_ID             |string   |NULL   |
|DATE_OF_BIRTH              |string   |NULL   |
|MARITAL_STATUS             |string   |NULL   |
|GENDER                     |string   |NULL   |
|_rescued_data              |string   |NULL   |
|ingest_datetime            |timestamp|NULL   |
|batch_id                   |int      |NULL   |
+---------------------------+---------+-------+

In [0]:
# Eyeball the phone_number values written to the silver customers table to
# verify the formatting applied during the load
(
    spark.sql("""
    SELECT phone_number FROM de_pyspark_training_catalog.buddy_group_1.amanolov_silver_customers_exam
""")
    .show(truncate=False)
)

+---------------+
|phone_number   |
+---------------+
|+1 317-123-4104|
|+1 317-123-4111|
|+1 319-123-4301|
|+1 745-123-4306|
|+1 414-123-4307|
|+1 414-123-4308|
|+1 608-123-4309|
|+1 608-123-4318|
|+1 414-123-4323|
|+1 414-123-4324|
|+1 414-123-4328|
|+1 608-123-4332|
|+1 608-123-4344|
|+1 414-123-4347|
|+1 414-123-4348|
|+1 414-123-4350|
|+1 745-123-4367|
|+1 414-123-4369|
|+1 715-123-4372|
|+1 414-123-4373|
+---------------+
only showing top 20 rows


In [0]:
###################
##### Silver ######
###################

###################
# Orders table #
###################

from pyspark.sql.functions import current_timestamp, col, row_number
from pyspark.sql.window import Window

# Orders: bronze -> silver.
# Standardises column names, parses order_date into a timestamp, adds load
# metadata and removes rows that are duplicated on every business column
# (keeping the first consumed one), then overwrites the silver Delta table.

# Load Bronze Orders
bronze_orders_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_orders_exam")

# Rename all columns to snake_case in one projection: lower-case the name and
# replace '.' with '_'
bronze_orders_df = bronze_orders_df.toDF(
    *[col_name.lower().replace('.', '_') for col_name in bronze_orders_df.columns]
)

# Carry the bronze ingest timestamp over as consume_datetime and exclude batch_id
silver_orders_df = (
    bronze_orders_df
    .withColumnRenamed("ingest_datetime", "consume_datetime")
    .drop("batch_id")
)

# Register temp view for SQL workaround
silver_orders_df.createOrReplaceTempView("temp_silver_orders")

# Parse the order_date using SQL workaround:
# dots in the raw value are replaced with colons, then the value is tried
# against a date-time pattern first and a date-only pattern second; the first
# pattern that parses wins, and the result is NULL when neither matches.
silver_orders_df = spark.sql("""
    SELECT *,
        coalesce(
            try_to_timestamp(regexp_replace(order_date, '\\\\.', ':'), 'dd-MMM-yy hh:mm:ss a'),
            try_to_timestamp(regexp_replace(order_date, '\\\\.', ':'), 'dd-MMM-yy')
        ) AS clean_order_date
    FROM temp_silver_orders
""")

# Replace the old string order_date with the parsed one
silver_orders_df = silver_orders_df.drop("order_date").withColumnRenamed("clean_order_date", "order_date")

# Add inserted_datetime to track when the row was loaded
silver_orders_df = silver_orders_df.withColumn("inserted_datetime", current_timestamp())

# Deduplicate on ALL business columns, keeping the first consumed record.
# Partitioning on order_id alone would also drop rows that share an id but
# differ in other attributes, which contradicts the "keep all rows" rule.
# Technical/metadata columns are not part of the duplicate key.
technical_cols = {"consume_datetime", "inserted_datetime", "_rescued_data"}
business_cols = [c for c in silver_orders_df.columns if c not in technical_cols]
window_spec = Window.partitionBy(*business_cols).orderBy("consume_datetime")
silver_orders_df = (
    silver_orders_df
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Save with schema overwrite (order_date changes type and column position)
(silver_orders_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_orders_exam")
)


In [0]:
# Print the bronze orders schema (column names and data types) to see
# which columns the silver load above has to rename
(
    spark.sql("""
    DESCRIBE de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_orders_exam
""")
    .show(truncate=False)
)

+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|ORDER_ID       |string   |NULL   |
|ORDER_DATE     |string   |NULL   |
|ORDER_MODE     |string   |NULL   |
|CUSTOMER_ID    |string   |NULL   |
|ORDER_STATUS   |string   |NULL   |
|ORDER_TOTAL    |string   |NULL   |
|SALES_REP_ID   |string   |NULL   |
|PROMOTION_ID   |string   |NULL   |
|_rescued_data  |string   |NULL   |
|ingest_datetime|timestamp|NULL   |
|batch_id       |int      |NULL   |
+---------------+---------+-------+



In [0]:
###################
##### Silver ######
###################

###################
# Order_items table #
###################

from pyspark.sql.functions import current_timestamp, col, row_number
from pyspark.sql.window import Window

# Order items: bronze -> silver.
# Standardises column names, adds load metadata and removes rows that are
# duplicated on every business column (keeping the first consumed one), then
# overwrites the silver Delta table.

# Load bronze order_items table
bronze_order_items_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_order_items_exam")

# Rename all columns to snake_case in one projection: lower-case the name and
# replace '.' with '_'
bronze_order_items_df = bronze_order_items_df.toDF(
    *[col_name.lower().replace('.', '_') for col_name in bronze_order_items_df.columns]
)

# Carry the bronze ingest timestamp over as consume_datetime, exclude batch_id
# and add inserted_datetime to track when the row was loaded
silver_order_items_df = (
    bronze_order_items_df
    .withColumnRenamed("ingest_datetime", "consume_datetime")
    .drop("batch_id")
    .withColumn("inserted_datetime", current_timestamp())
)

# Deduplicate on ALL business columns, keeping the first consumed record.
# Partitioning on (order_id, line_item_id) alone would also drop rows that
# share those keys but differ in other attributes, which contradicts the
# "keep all rows" rule. Technical/metadata columns are not part of the key.
technical_cols = {"consume_datetime", "inserted_datetime", "_rescued_data"}
business_cols = [c for c in silver_order_items_df.columns if c not in technical_cols]
window_spec = Window.partitionBy(*business_cols).orderBy("consume_datetime")
silver_order_items_df = (
    silver_order_items_df
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Save to silver table (full overwrite, no historization)
(silver_order_items_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_order_items_exam")
)


In [0]:
# Print the bronze order_items schema (column names and data types)
(
    spark.sql("""
    DESCRIBE de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_order_items_exam
""")
    .show(truncate=False)
)

+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|ORDER_ID       |string   |NULL   |
|LINE_ITEM_ID   |string   |NULL   |
|PRODUCT_ID     |string   |NULL   |
|UNIT_PRICE     |string   |NULL   |
|QUANTITY       |string   |NULL   |
|_rescued_data  |string   |NULL   |
|ingest_datetime|timestamp|NULL   |
|batch_id       |int      |NULL   |
+---------------+---------+-------+



In [0]:
###################
##### Silver ######
###################

###################
# products table #
###################

from pyspark.sql.functions import current_timestamp, col, row_number
from pyspark.sql.window import Window

# Products: bronze -> silver.
# Standardises column names, adds load metadata and removes rows that are
# duplicated on every business column (keeping the first consumed one), then
# overwrites the silver Delta table.

# Load bronze products table
bronze_products_df = spark.table("de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_products_exam")

# Rename all columns to snake_case in one projection: lower-case the name and
# replace '.' with '_'
bronze_products_df = bronze_products_df.toDF(
    *[col_name.lower().replace('.', '_') for col_name in bronze_products_df.columns]
)

# Carry the bronze ingest timestamp over as consume_datetime, exclude batch_id
# and add inserted_datetime to track when the row was loaded
silver_products_df = (
    bronze_products_df
    .withColumnRenamed("ingest_datetime", "consume_datetime")
    .drop("batch_id")
    .withColumn("inserted_datetime", current_timestamp())
)

# Deduplicate on ALL business columns, keeping the first consumed record.
# Partitioning on product_id alone would also drop rows that share an id but
# differ in other attributes, which contradicts the "keep all rows" rule.
# Technical/metadata columns are not part of the duplicate key.
technical_cols = {"consume_datetime", "inserted_datetime", "_rescued_data"}
business_cols = [c for c in silver_products_df.columns if c not in technical_cols]
window_spec = Window.partitionBy(*business_cols).orderBy("consume_datetime")
silver_products_df = (
    silver_products_df
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Save to silver table (full overwrite, no historization)
(silver_products_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("de_pyspark_training_catalog.buddy_group_1.amanolov_silver_products_exam")
)

In [0]:
# Print the bronze products schema (column names and data types)
(
    spark.sql("""
    DESCRIBE de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_products_exam
""")
    .show(truncate=False)
)

+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|PRODUCT_ID     |string   |NULL   |
|PRODUCT_NAME   |string   |NULL   |
|CATEGORY_NAME  |string   |NULL   |
|WEIGHT_CLASS   |string   |NULL   |
|PRODUCT_STATUS |string   |NULL   |
|LIST_PRICE     |string   |NULL   |
|MIN_PRICE      |string   |NULL   |
|_rescued_data  |string   |NULL   |
|ingest_datetime|timestamp|NULL   |
|batch_id       |int      |NULL   |
+---------------+---------+-------+

