In [0]:
# Load the Bronze2 Delta table from its mount path
bronze2_path = "/mnt/bronze2/cafe_sales"
df_bronze2 = spark.read.load(bronze2_path, format="delta")

# Report how many rows were loaded, then preview the first 10 of them
bronze2_row_count = df_bronze2.count()
print(f"Initial row count (Bronze2): {bronze2_row_count}")
display(df_bronze2.limit(10))



Initial row count (Bronze2): 3556


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07.11.2023
TXN3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11.06.2023
TXN4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19.07.2023
TXN_1000555,Tea,1.0,1.5,1.5,Credit Card,In-store,19.10.2023
TXN_1002457,Cookie,5.0,1.0,5.0,Digital Wallet,Takeaway,29.09.2023
TXN_1004184,Smoothie,1.0,4.0,4.0,Credit Card,In-store,18.05.2023
TXN_1004563,Tea,5.0,1.5,7.5,Credit Card,In-store,28.10.2023
TXN_1005331,Coffee,1.0,2.0,2.0,Digital Wallet,Takeaway,04.11.2023
TXN_1005377,Cake,5.0,3.0,15.0,Digital Wallet,Takeaway,03.06.2023
TXN_1006942,Salad,1.0,5.0,5.0,Credit Card,In-store,30.11.2023


In [0]:
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StringType

# Trim leading/trailing whitespace from the text columns of the Bronze2 frame.
#
# A column is treated as text only when it is (a) not one of the known numeric
# measures AND (b) actually StringType in the schema. regexp_replace casts its
# input to string, so applying it to a non-string column (date, timestamp, ...)
# would silently change that column's type in the resulting frame.
numeric_columns = ["quantity", "price_per_unit", "total_spent"]
string_columns = [
    field.name
    for field in df_bronze2.schema.fields
    if field.name not in numeric_columns and isinstance(field.dataType, StringType)
]

# One projection instead of a chain of withColumn calls; column order is kept.
# The pattern only matches whitespace anchored at the start (^\s+) or the
# end (\s+$) of the value, so inner whitespace is left alone here.
df_trimmed = df_bronze2.select(
    [
        regexp_replace(col(name), r'^\s+|\s+$', '').alias(name)
        if name in string_columns
        else col(name)
        for name in df_bronze2.columns
    ]
)

print("✅ Removed leading/trailing whitespace from STRING columns")
display(df_trimmed.limit(10))


✅ Removed leading/trailing whitespace from STRING columns


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07.11.2023
TXN3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11.06.2023
TXN4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19.07.2023
TXN_1000555,Tea,1.0,1.5,1.5,Credit Card,In-store,19.10.2023
TXN_1002457,Cookie,5.0,1.0,5.0,Digital Wallet,Takeaway,29.09.2023
TXN_1004184,Smoothie,1.0,4.0,4.0,Credit Card,In-store,18.05.2023
TXN_1004563,Tea,5.0,1.5,7.5,Credit Card,In-store,28.10.2023
TXN_1005331,Coffee,1.0,2.0,2.0,Digital Wallet,Takeaway,04.11.2023
TXN_1005377,Cake,5.0,3.0,15.0,Digital Wallet,Takeaway,03.06.2023
TXN_1006942,Salad,1.0,5.0,5.0,Credit Card,In-store,30.11.2023


In [0]:
# Remove ALL whitespace (not only at the edges) from text columns.
#
# Excluded on purpose:
#   - payment_method: its values are shown with inner spaces (e.g. "Digital Wallet")
#   - the numeric measure columns
# Only columns whose Spark dtype is "string" are rewritten, because
# regexp_replace would otherwise cast a non-string column to string.
numeric_columns = ["quantity", "price_per_unit", "total_spent"]
columns_to_clean = [
    col_name
    for col_name, dtype in df_trimmed.dtypes
    if dtype == "string" and col_name not in ["payment_method"] + numeric_columns
]

# Single projection: rewrite the selected columns, pass the others through
# unchanged, and keep the original column order.
df_cleaned = df_trimmed.select(
    [
        regexp_replace(col(name), r'\s+', '').alias(name)  # drop every whitespace run
        if name in columns_to_clean
        else col(name)
        for name in df_trimmed.columns
    ]
)

print("✅ Removed internal whitespace from STRING columns (except payment_method)")
display(df_cleaned.limit(10))


✅ Removed internal whitespace from STRING columns (except payment_method)


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07.11.2023
TXN3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11.06.2023
TXN4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19.07.2023
TXN_1000555,Tea,1.0,1.5,1.5,Credit Card,In-store,19.10.2023
TXN_1002457,Cookie,5.0,1.0,5.0,Digital Wallet,Takeaway,29.09.2023
TXN_1004184,Smoothie,1.0,4.0,4.0,Credit Card,In-store,18.05.2023
TXN_1004563,Tea,5.0,1.5,7.5,Credit Card,In-store,28.10.2023
TXN_1005331,Coffee,1.0,2.0,2.0,Digital Wallet,Takeaway,04.11.2023
TXN_1005377,Cake,5.0,3.0,15.0,Digital Wallet,Takeaway,03.06.2023
TXN_1006942,Salad,1.0,5.0,5.0,Credit Card,In-store,30.11.2023


In [0]:
# Drop all cached data up front; intended to avoid schema analysis issues on write
spark.catalog.clearCache()

# Replace the Bronze2 Delta table (data and schema) with the cleaned frame
bronze2_writer = (
    df_cleaned.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
bronze2_writer.save(bronze2_path)

print("✅ Bronze2 table updated with whitespace-cleaned data")
final_row_count = df_cleaned.count()
print(f"Final row count: {final_row_count}")


✅ Bronze2 table updated with whitespace-cleaned data
Final row count: 3556


In [0]:
# Refresh catalog and verify changes for Bronze2
from pyspark.sql.functions import length, col

# Invalidate cached metadata for the Bronze2 catalog table
spark.catalog.refreshTable("cafe_sales_bronze2")
print("✅ Catalog cache refreshed for Bronze2")

# Reload the Delta table from its path to inspect what was written
df_verify = spark.read.load(bronze2_path, format="delta")

# Show the first 20 rows of the reloaded table
print("\n=== Sample cleaned data from Bronze2 ===")
display(df_verify.limit(20))


✅ Catalog cache refreshed for Bronze2

=== Sample cleaned data from Bronze2 ===


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07.11.2023
TXN3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11.06.2023
TXN4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19.07.2023
TXN_1000555,Tea,1.0,1.5,1.5,Credit Card,In-store,19.10.2023
TXN_1002457,Cookie,5.0,1.0,5.0,Digital Wallet,Takeaway,29.09.2023
TXN_1004184,Smoothie,1.0,4.0,4.0,Credit Card,In-store,18.05.2023
TXN_1004563,Tea,5.0,1.5,7.5,Credit Card,In-store,28.10.2023
TXN_1005331,Coffee,1.0,2.0,2.0,Digital Wallet,Takeaway,04.11.2023
TXN_1005377,Cake,5.0,3.0,15.0,Digital Wallet,Takeaway,03.06.2023
TXN_1006942,Salad,1.0,5.0,5.0,Credit Card,In-store,30.11.2023
