# silver.ipynb
Cleans and transforms the raw data from the bronze layer and writes the result to the silver layer.

In [1]:
from pyspark.sql import SparkSession
from spark_config import spark_config_bronze, spark_config_silver, spark_config_minio

In [2]:
# Tear down whatever session is still around from an earlier run of this notebook
SparkSession.builder.getOrCreate().stop()

# Begin configuring a fresh session: app name "silver", master at spark://spark-master:7077
builder = SparkSession.builder.appName("silver").master("spark://spark-master:7077")

# Apply the shared helpers imported from spark_config, in order:
# bronze settings, silver settings, then minio settings
spark_config_bronze(builder)
spark_config_silver(builder)
spark_config_minio(builder)

# Build (or fetch) the session with everything configured above
spark = builder.getOrCreate()

In [3]:
# Ensure the silver schema exists, located at s3a://silver/data_platform_example
create_schema_sql = """
    CREATE SCHEMA IF NOT EXISTS silver.data_platform_example
    LOCATION 's3a://silver/data_platform_example'
"""
spark.sql(create_schema_sql)

DataFrame[]

In [4]:
# Ensure the silver page_load Iceberg table exists with the expected six columns
create_page_load_sql = """
CREATE TABLE IF NOT EXISTS silver.data_platform_example.page_load (
    event_name STRING,
    event_version STRING,
    event_ts TIMESTAMP,
    page STRING,
    user_name STRING,
    browser STRING
)
USING ICEBERG
LOCATION 's3a://silver/data_platform_example/page_load'
"""
spark.sql(create_page_load_sql)

25/02/27 07:18:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


DataFrame[]

In [5]:
# Rename and typecast bronze columns into the silver schema:
# the metadata.* and payload.* fields are projected onto flat column names,
# and metadata.timestamp is cast to TIMESTAMP as event_ts.
# The query is a plain (non-f) string because it has no placeholders to interpolate.
# ORDER BY event_ts makes the preview printed by show() chronological.
df = spark.sql("""
SELECT
    metadata.name AS event_name,
    metadata.version AS event_version,
    CAST(metadata.timestamp AS TIMESTAMP) AS event_ts,
    payload.page AS page,
    payload.user_name AS user_name,
    payload.browser AS browser
FROM bronze.data_platform_example.page_load_v1
ORDER BY event_ts
""")
# Preview the first rows, then report the total row count read from bronze
df.show()
print(f"Number of rows {df.count()}")

                                                                                

+----------+-------------+--------------------+---------+------------------+-------+
|event_name|event_version|            event_ts|     page|         user_name|browser|
+----------+-------------+--------------------+---------+------------------+-------+
| page_load|           v1|2025-02-20 07:19:...|    /home|     Cassidy Moody|Firefox|
| page_load|           v1|2025-02-20 07:19:...|    /home|     Cassidy Moody|Firefox|
| page_load|           v1|2025-02-20 07:50:...|    /home|      Steve Walker|Firefox|
| page_load|           v1|2025-02-20 07:50:...|    /home|      Steve Walker|Firefox|
| page_load|           v1|2025-02-20 08:03:...|    /home|     Denise Turner|Firefox|
| page_load|           v1|2025-02-20 08:03:...|    /home|     Denise Turner|Firefox|
| page_load|           v1|2025-02-20 08:13:...|    /home|Crystal Richardson| Chrome|
| page_load|           v1|2025-02-20 08:13:...|    /home|Crystal Richardson| Chrome|
| page_load|           v1|2025-02-20 08:26:...|/products|     Tyr



Number of rows 2000


                                                                                

In [6]:
# Collapse fully identical rows: a bronze job that processes the same file twice
# leaves duplicate rows behind
df = df.dropDuplicates()

# Preview the deduplicated rows and report how many remain
df.show()
row_count = df.count()
print(f"Number of rows {row_count}")

                                                                                

+----------+-------------+--------------------+---------+------------------+-------+
|event_name|event_version|            event_ts|     page|         user_name|browser|
+----------+-------------+--------------------+---------+------------------+-------+
| page_load|           v1|2025-02-26 14:51:...|    /cart|        Amy Davies|Firefox|
| page_load|           v1|2025-02-24 08:17:...| /contact|   Andrew Stafford|   Edge|
| page_load|           v1|2025-02-23 05:42:...|   /about|     Cassidy Moody|Firefox|
| page_load|           v1|2025-02-22 04:32:...|    /home|      Steve Walker|Firefox|
| page_load|           v1|2025-02-21 04:33:...|   /about|     Cassidy Moody| Chrome|
| page_load|           v1|2025-02-22 18:53:...|    /home|     Tyrone Turner|   Edge|
| page_load|           v1|2025-02-26 19:00:...|/checkout|   Andrew Stafford| Chrome|
| page_load|           v1|2025-02-24 03:47:...|    /home|      James Thomas| Safari|
| page_load|           v1|2025-02-26 21:19:...|/products| Matthew

In [7]:
# Save df to the silver page_load table in Iceberg format, using overwrite mode
(
    df.write
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable("silver.data_platform_example.page_load")
)

In [8]:
# Shut down the Spark session now that the notebook's work is finished
spark.stop()