In [0]:
# Evaluate the Spark session handle so the cell displays it.
# NOTE(review): `spark` is never defined in this notebook; presumably it is
# injected by the notebook runtime — confirm before running elsewhere.
spark

## Bronze Layer – Raw Data Ingestion

The Bronze layer is responsible for ingesting raw e-commerce data from the source into the data lake without applying any transformations.  
This ensures that the original data is preserved for traceability, reprocessing, and auditing purposes.


In [0]:
# List the top-level entries of the olist-data container to confirm the
# expected layer folders are present before reading from raw/.
dbutils.fs.ls("abfss://olist-data@retailds.dfs.core.windows.net/")

[FileInfo(path='abfss://olist-data@retailds.dfs.core.windows.net/bronze/', name='bronze/', size=0, modificationTime=1766773696000),
 FileInfo(path='abfss://olist-data@retailds.dfs.core.windows.net/gold/', name='gold/', size=0, modificationTime=1766773719000),
 FileInfo(path='abfss://olist-data@retailds.dfs.core.windows.net/raw/', name='raw/', size=0, modificationTime=1766773686000),
 FileInfo(path='abfss://olist-data@retailds.dfs.core.windows.net/silver/', name='silver/', size=0, modificationTime=1766773707000)]

### Data Source
- Dataset: Olist Brazilian E-Commerce Dataset
- Storage: Azure Data Lake Storage Gen2
- Format: CSV files

In [0]:
# Load the orders CSV from raw/olist/ into `df`.
# The first line is treated as the header and column types are inferred by Spark.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("abfss://olist-data@retailds.dfs.core.windows.net/raw/olist/olist_orders_dataset.csv")
)

In [0]:
# Preview five rows of the orders DataFrame as a rendered table
df.limit(5).display()

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33Z,2017-10-02T11:07:15Z,2017-10-04T19:55:00Z,2017-10-10T21:25:13Z,2017-10-18T00:00:00Z
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37Z,2018-07-26T03:24:27Z,2018-07-26T14:31:00Z,2018-08-07T15:27:45Z,2018-08-13T00:00:00Z
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49Z,2018-08-08T08:55:23Z,2018-08-08T13:50:00Z,2018-08-17T18:06:29Z,2018-09-04T00:00:00Z
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06Z,2017-11-18T19:45:59Z,2017-11-22T13:39:59Z,2017-12-02T00:28:42Z,2017-12-15T00:00:00Z
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39Z,2018-02-13T22:20:29Z,2018-02-14T19:46:34Z,2018-02-16T18:17:02Z,2018-02-26T00:00:00Z


In [0]:
# Print the schema of the orders DataFrame to inspect the column types
# that resulted from reading the CSV with inferSchema enabled.
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [0]:
# Count the rows in the orders DataFrame read from the raw CSV.
# This figure can be compared with the row count of the Bronze orders data.
df.count()

99441

### Bronze Storage Strategy
The raw data is stored in Delta format in the Bronze layer to enable schema enforcement, versioning, and efficient downstream processing.


In [0]:
# Persist the orders DataFrame in Delta format under bronze/orders.
# "overwrite" mode plus overwriteSchema replaces both the existing data and its schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://olist-data@retailds.dfs.core.windows.net/bronze/orders")
)


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:190)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# Read the remaining CSV files from raw/olist/ and write each one, in Delta
# format, to a folder of the same name under bronze/.
tables = [
    "customers",
    "geolocation",
    "order_items",
    "order_payments",
    "order_reviews",
    "products",
    "sellers",
]
for table in tables:
    # Use a loop-local name rather than `df`: earlier cells bind `df` to the
    # orders DataFrame, and rebinding it here would leave `df` pointing at the
    # last table in the list, so re-running those cells would act on the wrong data.
    table_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(f"abfss://olist-data@retailds.dfs.core.windows.net/raw/olist/olist_{table}_dataset.csv")
    )

    # Replace any existing data and schema so the cell can be re-run safely.
    (
        table_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(f"abfss://olist-data@retailds.dfs.core.windows.net/bronze/{table}")
    )


In [0]:
# Count the rows in the Bronze orders data.
# The query targets the Delta folder written by this notebook instead of the
# metastore name `bronze.orders`: no cell here registers that table, so the
# name-based query only works when it has been created somewhere else.
spark.sql("""select count(*) from delta.`abfss://olist-data@retailds.dfs.core.windows.net/bronze/orders`""").display()

count(1)
99441


In [0]:
# Display the first 5 rows of the Bronze orders data.
# The query targets the Delta folder written by this notebook instead of the
# metastore name `bronze.orders`: no cell here registers that table, so the
# name-based query only works when it has been created somewhere else.
spark.sql("""select * from delta.`abfss://olist-data@retailds.dfs.core.windows.net/bronze/orders` limit 5""").display()

order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02T10:56:33Z,2017-10-02T11:07:15Z,2017-10-04T19:55:00Z,2017-10-10T21:25:13Z,2017-10-18T00:00:00Z
53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24T20:41:37Z,2018-07-26T03:24:27Z,2018-07-26T14:31:00Z,2018-08-07T15:27:45Z,2018-08-13T00:00:00Z
47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08T08:38:49Z,2018-08-08T08:55:23Z,2018-08-08T13:50:00Z,2018-08-17T18:06:29Z,2018-09-04T00:00:00Z
949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18T19:28:06Z,2017-11-18T19:45:59Z,2017-11-22T13:39:59Z,2017-12-02T00:28:42Z,2017-12-15T00:00:00Z
ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13T21:18:39Z,2018-02-13T22:20:29Z,2018-02-14T19:46:34Z,2018-02-16T18:17:02Z,2018-02-26T00:00:00Z
