In [0]:
# ============================================================
# ETL Pipeline for Weekly Store Sales Data
# ============================================================

# 1. EXTRACT
# Read the source data from an existing table (not directly from a CSV file).
source_table = "workspace.default.final_df"
df = spark.table(source_table)

# 2. TRANSFORM
from pyspark.sql.functions import col, when, round as spark_round, mean as spark_mean

# Discard rows where Weekly_Sales or Store is null. This runs before the
# rename below, so the original (underscored) column name is used here.
required_cols = ["Weekly_Sales", "Store"]
df = df.dropna(subset=required_cols)

# Normalise column names by stripping underscores and spaces
# (e.g. "Weekly_Sales" becomes "WeeklySales").
renames = {
    name: name.replace("_", "").replace(" ", "")
    for name in df.columns
}
for current, cleaned in renames.items():
    if current != cleaned:
        df = df.withColumnRenamed(current, cleaned)

# Replace nulls in the MarkDown columns with 0. Columns that are absent from
# the DataFrame are skipped rather than created.
markdown_cols = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
present_markdowns = [name for name in markdown_cols if name in df.columns]
for name in present_markdowns:
    current = col(name)
    df = df.withColumn(name, when(current.isNull(), 0).otherwise(current))

# Feature Engineering
from functools import reduce
from operator import add

from pyspark.sql.functions import lit

# Average sales per day, assuming each row holds one week (7 days) of sales.
df = df.withColumn("DailySales", spark_round(col("WeeklySales") / 7, 2))

# Discount is the mean of the MarkDown columns, rounded to 2 decimals.
# Only columns that actually exist in the DataFrame are averaged: the null
# fill step treats MarkDown columns as optional, and referencing a missing
# column here would make the whole job fail during analysis.
available_markdowns = [name for name in markdown_cols if name in df.columns]
if available_markdowns:
    # reduce(add, ...) is used instead of the builtin sum() so the expression
    # keeps working even if `sum` has been shadowed by
    # pyspark.sql.functions.sum elsewhere in the notebook.
    markdown_total = reduce(add, (col(name) for name in available_markdowns))
    discount = spark_round(markdown_total / len(available_markdowns), 2)
else:
    # No MarkDown data at all: keep the column in the schema, but as null.
    discount = lit(None).cast("double")

df = df.withColumn("Discount", discount)

# Preview the first 10 rows. limit() keeps the preview a DataFrame, which is
# the input display() is documented to render; head() would instead collect
# a Python list of Row objects onto the driver.
display(df.limit(10))

# 3. LOAD
# Persist the result as a table so it can be queried with SQL. Any existing
# table of the same name is overwritten.
# NOTE(review): no format is set here, so the table format is the session
# default (presumably Delta on Databricks) — confirm.
target_table = "workspace.default.df_ready"
writer = df.write.mode("overwrite")
writer.saveAsTable(target_table)
