In [0]:
# Sanity check before ingesting: list the mounted raw-data folder so the
# source CSV (name, size, modification time) shows up in the cell output.
raw_listing = dbutils.fs.ls("/mnt/raw-data")
display(raw_listing)


path,name,size,modificationTime
dbfs:/mnt/raw-data/dirty_cafe_sales.csv,dirty_cafe_sales.csv,516003,1761562490000


In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Bronze-layer schema: snake_case column names (underscores instead of spaces),
# every field typed as a nullable STRING.
bronze_columns = [
    "transaction_id",
    "item",
    "quantity",
    "price_per_unit",
    "total_spent",
    "payment_method",
    "location",
    "transaction_date",
]

# All fields share the same type and nullability, so build them from the
# name list rather than spelling out eight near-identical StructField lines.
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in bronze_columns]
)

print("✅ Schema defined")


✅ Schema defined


In [0]:
# Load the raw cafe-sales CSV into the Bronze DataFrame `df_raw`.
#
# Each stage gets its own name (csv_df -> renamed_df -> df_raw) so the cell is
# idempotent: re-running it never operates on an already-transformed frame.
csv_df = (
    spark.read
    .option("header", "true")  # first line of the file holds the column names
    .csv("/mnt/raw-data/dirty_cafe_sales.csv")
)

# Normalise header names: lower-case, spaces replaced with underscores.
# The loop variable is deliberately not called `col`, which is the conventional
# name of pyspark.sql.functions.col and easy to shadow by accident.
normalized_names = [name.lower().replace(" ", "_") for name in csv_df.columns]
renamed_df = csv_df.toDF(*normalized_names)

# Cast every column to string (Bronze layer standard) in ONE projection.
# Calling withColumn in a loop adds a projection per column and bloats the
# query plan; a single select is the form Spark recommends. The "string" type
# name is used so this cell does not depend on StringType being imported.
df_raw = renamed_df.select(
    [renamed_df[name].cast("string").alias(name) for name in normalized_names]
)

# Row count (triggers a full read of the CSV)
count = df_raw.count()
print(f"✅ Loaded {count} rows from CSV")

# Preview the first 10 rows
display(df_raw.limit(10))


✅ Loaded 10000 rows from CSV


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
txn_1961373,Coffee,2,2,4,Credit Card,Takeaway,08.09.2023
TXn_4977031,Cake,4,3,12,Cash,In-store,16.05.2023
TXN4271903,Cookie,4,1,ERROR,Credit Card,In-store,19.07.2023
TXN_7034554,Salad,2,5,10,UNKNOWN,UNKNOWN,27.04.2023
TXN3160411,Coffee,2,2,4,Digital Wallet,In-store,11.06.2023
TXN_260 2893,Smoothie,5,4,20,Credit Card,,31.03.2023
TXN_44,UNKNOWN,3,3,9,ERROR,Takeaway,06.10.2023
TXN_6699534,Sandwich,4,4,16,Cash,UNKNOWN,28.10.2023
TXN_4717867,,5,3,15,,Takeaway,28.07.2023
TXN_2064365,Sandwich,5,4,20,,In-store,31.12.2023


In [0]:
# Bronze Delta target: the catalog table name and the storage location
# the Delta files are written to.
bronze_table_name = "cafe_sales_bronze"
bronze_path = "/mnt/bronze/cafe_sales"

# Persist the raw DataFrame as a Delta table (Bronze layer).
# mode("overwrite") replaces any data already at the path, and
# overwriteSchema=true lets the stored schema be replaced along with it.
bronze_writer = (
    df_raw.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
bronze_writer.save(bronze_path)

print(f"✅ Data saved to Delta Table at: {bronze_path}")


✅ Data saved to Delta Table at: /mnt/bronze/cafe_sales


In [0]:
# Expose the Delta files at `bronze_path` as a named table so they can be
# queried with SQL. IF NOT EXISTS makes the statement safe to re-run: an
# already-registered table is left untouched.
register_bronze_sql = f"""
    CREATE TABLE IF NOT EXISTS {bronze_table_name}
    USING DELTA
    LOCATION '{bronze_path}'
"""
spark.sql(register_bronze_sql)

print(f"✅ Table '{bronze_table_name}' registered in catalog")


✅ Table 'cafe_sales_bronze' registered in catalog


In [0]:
# Round-trip check: reload the Bronze data straight from its storage path
# (not via the catalog) and confirm the row count and a sample of rows.
delta_reader = spark.read.format("delta")
df_bronze = delta_reader.load(bronze_path)

bronze_row_count = df_bronze.count()
print(f"Row count in Bronze: {bronze_row_count}")

bronze_preview = df_bronze.limit(10)
display(bronze_preview)


Row count in Bronze: 10000


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
txn_1961373,Coffee,2,2,4,Credit Card,Takeaway,08.09.2023
TXn_4977031,Cake,4,3,12,Cash,In-store,16.05.2023
TXN4271903,Cookie,4,1,ERROR,Credit Card,In-store,19.07.2023
TXN_7034554,Salad,2,5,10,UNKNOWN,UNKNOWN,27.04.2023
TXN3160411,Coffee,2,2,4,Digital Wallet,In-store,11.06.2023
TXN_260 2893,Smoothie,5,4,20,Credit Card,,31.03.2023
TXN_44,UNKNOWN,3,3,9,ERROR,Takeaway,06.10.2023
TXN_6699534,Sandwich,4,4,16,Cash,UNKNOWN,28.10.2023
TXN_4717867,,5,3,15,,Takeaway,28.07.2023
TXN_2064365,Sandwich,5,4,20,,In-store,31.12.2023


In [0]:
# Same preview, this time through the catalog: query the registered table by
# name with SQL and print the first 10 rows as plain text.
preview_sql = f"SELECT * FROM {bronze_table_name} LIMIT 10"
spark.sql(preview_sql).show()


+--------------+--------+--------+--------------+-----------+--------------+--------+----------------+
|transaction_id|    item|quantity|price_per_unit|total_spent|payment_method|location|transaction_date|
+--------------+--------+--------+--------------+-----------+--------------+--------+----------------+
|   txn_1961373|  Coffee|       2|             2|          4|   Credit Card|Takeaway|      08.09.2023|
|   TXn_4977031|    Cake|       4|             3|         12|          Cash|In-store|      16.05.2023|
|    TXN4271903|  Cookie|       4|             1|      ERROR|   Credit Card|In-store|      19.07.2023|
| \tTXN_7034554|   Salad|       2|             5|         10|       UNKNOWN| UNKNOWN|      27.04.2023|
|  TXN3160411\t|  Coffee|       2|             2|          4|Digital Wallet|In-store|      11.06.2023|
|  TXN_260 2893|Smoothie|       5|             4|         20|   Credit Card|    NULL|      31.03.2023|
|        TXN_44| UNKNOWN|       3|             3|          9|         ERR

In [0]:
# Refresh the catalog's cached entry for the Bronze table, then verify it.
# The table is addressed through `bronze_table_name` (the same variable used
# when it was registered and queried) rather than a repeated string literal,
# so renaming the table in one place cannot leave this cell pointing at a
# stale name.
spark.catalog.refreshTable(bronze_table_name)
print("✅ Catalog cache refreshed")

# Print the schema as resolved through the catalog. Reading it from the
# registered table (instead of a DataFrame loaded by path before the refresh)
# is what actually confirms the refreshed catalog entry exposes all columns.
spark.table(bronze_table_name).printSchema()


✅ Catalog cache refreshed
root
 |-- transaction_id: string (nullable = true)
 |-- item: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price_per_unit: string (nullable = true)
 |-- total_spent: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_date: string (nullable = true)

