This notebook ingests raw data from the source system and stores it in the Bronze layer.


In [0]:
# ----------------------------------------------------------------------
# Volume locations used by every cell below:
#   bronze_path - destination folder for the Delta outputs
#   raw_path    - folder holding the source CSV files
# Both end with "/" because file names are appended by plain concatenation.
# ----------------------------------------------------------------------
bronze_path = "/Volumes/workspace/default/bronze/"
raw_path = "/Volumes/workspace/default/raw_data/"

#----------------------------------------------------------------------
# Import necessary PySpark modules for schema definition and transformations
#----------------------------------------------------------------------
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType
from pyspark.sql.functions import current_timestamp, to_timestamp


In [0]:


# 1. Customers: raw CSV -> Bronze Delta

# --------------------------------------------------------------------------------
# Explicit schema (avoids schema inference). Every customer column is kept as a
# nullable string.
# --------------------------------------------------------------------------------
customer_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "customer_id",
            "customer_unique_id",
            "customer_zip_code_prefix",
            "customer_city",
            "customer_state",
        )
    ]
)

# --------------------------------------------------------------------------------
# Read the raw CSV with the schema above and add a "created_at" ingestion
# timestamp column for lineage and tracking.
# --------------------------------------------------------------------------------
df_customer = (
    spark.read
    .option("header", True)
    .schema(customer_schema)
    .csv(raw_path + "olist_customers_dataset.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_customer.display()

# --------------------------------------------------------------------------------
# Write to the Bronze path in Delta format.
# "overwrite" keeps test runs repeatable; it has to become "append" for prod.
# Partitioning is skipped since the customer dataset is small.
# --------------------------------------------------------------------------------
(
    df_customer.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "customers")
)


In [0]:
# 2. Products: raw CSV -> Bronze Delta

# --------------------------------------------------------------------------------
# Explicit schema (avoids schema inference): two string columns followed by
# integer measurements, all nullable.
# --------------------------------------------------------------------------------
product_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("product_id", StringType()),
            ("product_category_name", StringType()),
            ("product_name_length", IntegerType()),
            ("product_description_length", IntegerType()),
            ("product_photos_qty", IntegerType()),
            ("product_weight_g", IntegerType()),
            ("product_length_cm", IntegerType()),
            ("product_height_cm", IntegerType()),
            ("product_width_cm", IntegerType()),
        )
    ]
)

# --------------------------------------------------------------------------------
# Read the raw CSV with the schema above and add a "created_at" ingestion
# timestamp column for lineage and tracking.
# --------------------------------------------------------------------------------
df_product = (
    spark.read
    .option("header", True)
    .schema(product_schema)
    .csv(raw_path + "olist_products_dataset.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_product.display()

# --------------------------------------------------------------------------------
# Write to Bronze in Delta format, replacing any previous output.
# --------------------------------------------------------------------------------
(
    df_product.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "products")
)


In [0]:
# 3. Sellers: raw CSV -> Bronze Delta

# Explicit schema: every seller column is a nullable string.
seller_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "seller_id",
            "seller_zip_code_prefix",
            "seller_city",
            "seller_state",
        )
    ]
)

# Read the raw CSV with the schema above and add the "created_at"
# ingestion timestamp column.
df_seller = (
    spark.read
    .option("header", True)
    .schema(seller_schema)
    .csv(raw_path + "olist_sellers_dataset.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_seller.display()

# Write to Bronze in Delta format, replacing any previous output.
(
    df_seller.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "sellers")
)


In [0]:
# 4. Orders: raw CSV -> Bronze Delta

# Explicit schema: three string columns followed by five timestamp columns,
# all nullable. Field order matches the list concatenation below.
order_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("order_id", "customer_id", "order_status")
    ]
    + [
        StructField(column_name, TimestampType(), True)
        for column_name in (
            "order_purchase_timestamp",
            "order_approved_at",
            "order_delivered_carrier_date",
            "order_delivered_customer_date",
            "order_estimated_delivery_date",
        )
    ]
)

# Read the raw CSV with the schema above and add the "created_at"
# ingestion timestamp column.
df_orders = (
    spark.read
    .option("header", True)
    .schema(order_schema)
    .csv(raw_path + "olist_orders_dataset.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_orders.display()

# Write to the Bronze layer in Delta format, replacing any previous output.
(
    df_orders.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "orders")
)


In [0]:
# 5. Load Order reviews Data into Bronze as Delta file

# 1. Define schema for the dataset (explicit, so no schema inference is needed)
order_reviews_schema = StructType([
    StructField("review_id", StringType(), True),
    StructField("order_id", StringType(), True),
    StructField("review_score", IntegerType(), True),
    StructField("review_comment_title", StringType(), True),
    StructField("review_comment_message", StringType(), True),
    StructField("review_creation_date", TimestampType(), True),
    StructField("review_answer_timestamp", TimestampType(), True)
])

# 2. Read CSV into DataFrame
# The comment title/message columns are free text. With the default
# line-per-record parsing, a quoted field that contains a line break is cut
# into several bogus rows and the remaining columns shift or become null.
#   multiLine=True -> a quoted field may span several physical lines
#   escape='"'     -> a doubled quote ("") inside a quoted field is read as a
#                     literal quote (RFC 4180 style) instead of Spark's
#                     default backslash escape
# NOTE(review): assumes the source file quotes such fields RFC 4180 style -
# confirm against the raw file (compare row counts before/after).
df_order_reviews = (
    spark.read
    .option("header", True)
    .option("multiLine", True)
    .option("escape", '"')
    .schema(order_reviews_schema)
    .csv(raw_path + "olist_order_reviews_dataset.csv")
)

# 3. Add ingestion timestamp
df_order_reviews = df_order_reviews.withColumn("created_at", current_timestamp())

# Optional display
#df_order_reviews.display()

# 4. Write to Bronze layer in Delta format
# The folder name "orders_reviews" is kept unchanged so existing readers of
# the Bronze output keep working.
(
    df_order_reviews.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "orders_reviews")
)



In [0]:
# 6. Order items: raw CSV -> Bronze Delta

# Explicit schema; every field is nullable.
order_items_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("order_id", StringType()),
            ("order_item_id", IntegerType()),
            ("product_id", StringType()),
            ("seller_id", StringType()),
            ("shipping_limit_date", TimestampType()),
            ("price", DoubleType()),
            ("freight_value", DoubleType()),
        )
    ]
)

# Read the raw CSV with the schema above and add the "created_at"
# ingestion timestamp column.
df_order_items = (
    spark.read
    .option("header", True)
    .schema(order_items_schema)
    .csv(raw_path + "olist_order_items_dataset.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_order_items.display()

# Write to the Bronze layer in Delta format, replacing any previous output.
(
    df_order_items.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "orders_items")
)


In [0]:
# 7. Load Order Payments Data into Bronze as Delta file

# 1. Define schema for the dataset (explicit, so no schema inference is needed)
order_payments_schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("payment_sequential", IntegerType(), True),
    StructField("payment_type", StringType(), True),
    StructField("payment_installments", IntegerType(), True),
    StructField("payment_value", DoubleType(), True)
])

# 2. Read CSV into DataFrame
# Fix: this cell previously read "olist_order_items_dataset.csv" (copy-paste
# from the order-items cell). Because the schema is supplied explicitly, the
# item columns were forced positionally into the payment columns and written
# to the payments Bronze output. It now reads the payments file.
df_order_payments = spark.read \
    .option("header", True) \
    .schema(order_payments_schema) \
    .csv(raw_path + "olist_order_payments_dataset.csv")


# 3. Add ingestion timestamp
df_order_payments = df_order_payments.withColumn("created_at", current_timestamp())

# Optional display
#df_order_payments.display()

# 4. Write to Bronze layer in Delta format
# "overwrite" replaces the previously written (incorrect) contents on re-run.
df_order_payments.write \
    .format("delta") \
    .mode("overwrite") \
    .save(bronze_path + "order_payments")


In [0]:
# 8. Geolocation: raw CSV -> Bronze Delta

# Explicit schema; every field is nullable.
geolocation_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("geolocation_zip_code_prefix", StringType()),
            ("geolocation_lat", DoubleType()),
            ("geolocation_lng", DoubleType()),
            ("geolocation_city", StringType()),
            ("geolocation_state", StringType()),
        )
    ]
)

# Read the raw CSV with the schema above and add the "created_at"
# ingestion timestamp column.
df_geolocation = (
    spark.read
    .option("header", True)
    .schema(geolocation_schema)
    .csv(raw_path + "olist_geolocation_dataset.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_geolocation.display()

# Write to the Bronze layer in Delta format, replacing any previous output.
(
    df_geolocation.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "geolocation")
)


In [0]:
# 9. Product category name translation: raw CSV -> Bronze Delta

# Explicit schema: both columns are nullable strings.
product_category_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "product_category_name",
            "product_category_name_english",
        )
    ]
)

# Read the raw CSV with the schema above and add the "created_at"
# ingestion timestamp column.
df_product_category = (
    spark.read
    .option("header", True)
    .schema(product_category_schema)
    .csv(raw_path + "product_category_name_translation.csv")
    .withColumn("created_at", current_timestamp())
)

# Debug aid: df_product_category.display()

# Write to the Bronze layer in Delta format, replacing any previous output.
(
    df_product_category.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_path + "product_category")
)
