In [0]:
# Load the Bronze2 cafe-sales Delta table from its storage path
bronze2_path = "/mnt/bronze2/cafe_sales"
df_bronze2 = (
    spark.read
    .format("delta")
    .load(bronze2_path)
)

# Report the number of rows loaded, then preview the first ten of them
print(f"Initial row count (Bronze2): {df_bronze2.count()}")
display(df_bronze2.limit(10))


Initial row count (Bronze2): 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
from pyspark.sql.functions import col, regexp_replace, length, upper

# Row count going into validation, used below to report how many rows were dropped
before_count = df_bronze2.count()

# A transaction_id is kept when it is non-null and, counting characters anywhere
# in the string, contains exactly 3 ASCII letters and exactly 7 digits.
# The position of those characters is not checked at this stage.
txn_id = col("transaction_id")
has_id = txn_id.isNotNull()
has_three_letters = length(regexp_replace(txn_id, "[^A-Za-z]", "")) == 3
has_seven_digits = length(regexp_replace(txn_id, "[^0-9]", "")) == 7

df_valid = df_bronze2.where(has_id & has_three_letters & has_seven_digits)

# Summarise the effect of the filter and preview the surviving rows
after_filter_count = df_valid.count()
print(f"✅ Usunięto {before_count - after_filter_count} rekordów bez dokładnie 3 liter i 7 cyfr")
print(f"Pozostało rekordów: {after_filter_count}")
display(df_valid.limit(10))


✅ Usunięto 0 rekordów bez dokładnie 3 liter i 7 cyfr
Pozostało rekordów: 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
from pyspark.sql.functions import concat, substring, lit

# Rebuild transaction_id in canonical form:
#   - prefix: everything except ASCII letters removed, first 3 characters, upper-cased
#   - number: everything except digits removed, first 7 characters
#   - result: prefix + "_" + number
raw_txn_id = col("transaction_id")
letters_only = regexp_replace(raw_txn_id, "[^A-Za-z]", "")
digits_only = regexp_replace(raw_txn_id, "[^0-9]", "")
id_prefix = upper(substring(letters_only, 1, 3))
id_number = substring(digits_only, 1, 7)

df_formatted = df_valid.withColumn(
    "transaction_id",
    concat(id_prefix, lit("_"), id_number),
)

print("✅ 1. Zamieniono pierwsze 3 znaki na wielkie litery")
print("✅ 2. Dodano underscore między literami a cyframi")
display(df_formatted.limit(10))


✅ 1. Zamieniono pierwsze 3 znaki na wielkie litery
✅ 2. Dodano underscore między literami a cyframi


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
# Last gate: transaction_id must be exactly three uppercase letters,
# an underscore, and seven digits (XXX_NNNNNNN)
matches_canonical_format = col("transaction_id").rlike("^[A-Z]{3}_[0-9]{7}$")
df_validated = df_formatted.where(matches_canonical_format)

# Rows removed here are measured against the count left by the earlier filter step
final_count = df_validated.count()
removed_count = after_filter_count - final_count
print(f"✅ 3. Ostateczna walidacja - usunięto {removed_count} niepoprawnych formatów")
print(f"Pozostało rekordów: {final_count}")
display(df_validated.limit(10))


✅ 3. Ostateczna walidacja - usunięto 0 niepoprawnych formatów
Pozostało rekordów: 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
from pyspark.sql.functions import when, regexp_replace

# Normalise the date separator: values that contain a dot (dd.mm.yyyy) have every
# dot replaced with a hyphen (dd-mm-yyyy); all other values pass through unchanged
date_col = col("transaction_date")
uses_dot_separator = date_col.contains(".")
dots_as_hyphens = regexp_replace(date_col, "\\.", "-")

df_date_fixed = df_validated.withColumn(
    "transaction_date",
    when(uses_dot_separator, dots_as_hyphens).otherwise(date_col),
)

print("✅ Format daty zamieniony z 'dd.mm.yyyy' na 'dd-mm-yyyy'")
display(df_date_fixed.limit(10))


✅ Format daty zamieniony z 'dd.mm.yyyy' na 'dd-mm-yyyy'


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
# Collapse 'Digital Wallet' and 'Credit Card' into the single label 'no-cash payment'.
# The comparison is done on the upper-cased value, so letter case in the source
# data does not matter; every other value is kept as it is.
payment_col = col("payment_method")
is_cashless = upper(payment_col).isin("DIGITAL WALLET", "CREDIT CARD")

df_payment_std = df_date_fixed.withColumn(
    "payment_method",
    when(is_cashless, "no-cash payment").otherwise(payment_col),
)

print("✅ Zamieniono 'Digital Wallet' i 'Credit Card' na 'no-cash payment'")
display(df_payment_std.limit(10))


✅ Zamieniono 'Digital Wallet' i 'Credit Card' na 'no-cash payment'


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
# Clear cached data before writing; kept from the original notebook, where it was
# added to avoid schema analysis issues.
# NOTE(review): clearCache() is not scoped to this table — confirm that nothing
# else in the session depends on cached data being present.
spark.catalog.clearCache()

# Overwrite the Bronze2 Delta table in place with the validated and standardized
# data. overwriteSchema allows the DataFrame's schema to replace the table's schema.
(
    df_payment_std.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(bronze2_path)
)

# Count the rows of the table as it is stored now. Calling df_payment_std.count()
# here would lazily re-execute the whole transformation plan instead of reporting
# what the write actually persisted.
written_row_count = spark.read.format("delta").load(bronze2_path).count()

print("✅ Bronze2 table updated with validated and standardized data")
print(f"Final row count: {written_row_count}")


✅ Bronze2 table updated with validated and standardized data
Final row count: 3555


In [0]:
# Refresh catalog and verify changes for Bronze2
from pyspark.sql.functions import length, col

# Ask the catalog to refresh its cached state for the Bronze2 table (looked up by name)
bronze2_table_name = "cafe_sales_bronze2"
spark.catalog.refreshTable(bronze2_table_name)
print("✅ Catalog cache refreshed for Bronze2")

# Load the Delta table again from its path so the preview reflects what is stored
df_verify = (
    spark.read
    .format("delta")
    .load(bronze2_path)
)

# Preview up to twenty rows of the stored, validated and standardized data
print("\n=== Sample validated and standardized data from Bronze2 ===")
display(df_verify.limit(20))


✅ Catalog cache refreshed for Bronze2

=== Sample validated and standardized data from Bronze2 ===


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023
