# bronze.ipynb
Ingests data from the raw folder into bronze Iceberg tables.

In [4]:
from pyspark.sql import SparkSession
from spark_config import configure_spark_session

In [5]:
# A kernel that already ran this notebook may still hold a live session.
# Grab it (or a throwaway one) and stop it so the settings below take effect.
SparkSession.builder.getOrCreate().stop()

# Start from a fresh builder: application name plus the standalone master URL
builder = (
    SparkSession.builder
    .appName("bronze")
    .master("spark://spark-master:7077")
)

# Shared project settings (nessie catalog, minio access) are applied to the
# builder by the helper imported from spark_config
configure_spark_session(builder)

# Everything below in the notebook uses this session
spark = builder.getOrCreate()

In [6]:
# Make sure the bronze schema exists in the "example" catalog, with its
# location set to the s3a://bronze/ bucket. IF NOT EXISTS keeps the statement
# safe to re-run. The SQL has nothing to interpolate, so it is a plain string
# rather than an f-string.
spark.sql("""
    CREATE SCHEMA IF NOT EXISTS example.bronze
    LOCATION 's3a://bronze/'
""")

DataFrame[]

In [7]:
# Read the page_load v1 events from the raw bucket as JSON.
# recursiveFileLookup tells the reader to also pick up files in nested folders.
df = (
    spark.read
    .option("recursiveFileLookup", "true")
    .json("s3a://raw/page_load/v1/")
)

# Show the resulting schema and a sample of rows before anything is written
df.printSchema()
df.show()

25/04/03 00:43:07 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

root
 |-- metadata: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- browser: string (nullable = true)
 |    |-- page: string (nullable = true)
 |    |-- user_name: string (nullable = true)

+--------------------+--------------------+
|            metadata|             payload|
+--------------------+--------------------+
|{page_load, 2025-...|{Firefox, /home, ...|
|{page_load, 2025-...|{Firefox, /home, ...|
|{page_load, 2025-...|{Chrome, /product...|
|{page_load, 2025-...|{Chrome, /cart, G...|
|{page_load, 2025-...|{Chrome, /home, C...|
|{page_load, 2025-...|{Firefox, /home, ...|
|{page_load, 2025-...|{Firefox, /contac...|
|{page_load, 2025-...|{Chrome, /about, ...|
|{page_load, 2025-...|{Edge, /products,...|
|{page_load, 2025-...|{Edge, /home, Pau...|
|{page_load, 2025-...|{Safari, /product...|
|{page_load, 2025-...|{Safari, /produ

In [8]:
# "example" is the name of our catalog, as configured at the top of the file
# "bronze" is the name of the schema, used for domain separation
# "page_load_v1" is the name of the iceberg table itself
#
table_name = "example.bronze.page_load_v1"

table_exists = spark.catalog.tableExists(table_name)

# Both branches write through the same iceberg writer; only the save mode differs
iceberg_writer = df.write.format('iceberg')

if table_exists:
    # Incremental run: keep what is already stored and add this batch to it,
    # rather than overwriting the whole table.
    # NOTE(review): df is everything read from the raw folder, so appending on
    # every run may re-add rows that were ingested before -- confirm the raw
    # folder only holds new files between runs.
    print("Table exists, appending to existing table")
    iceberg_writer.mode('append').saveAsTable(table_name)
else:
    # Very first run: there is no table to append to yet, so this write creates it
    print("Table does not exist, creating new table")
    iceberg_writer.saveAsTable(table_name)


Table does not exist, creating new table


                                                                                

In [9]:
# Shut down the Spark session now that the bronze write has finished
spark.stop()