In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable


In [0]:
%run /Workspace/Users/amank0639@gmail.com/fmcg_repo/consolidated_pipeline/1_Setup/utilities

In [0]:
# Notebook parameters: target catalog and the data source (also used as the
# table name and the S3 sub-folder further down).
dbutils.widgets.text(name="Catalog", defaultValue="fmcg")
dbutils.widgets.text(name="Data_Source", defaultValue="products")

In [0]:
# Resolve the widget values once so later cells work with plain strings.
catalog = dbutils.widgets.get("Catalog")
data_source = dbutils.widgets.get("Data_Source")

In [0]:
# Landing folder for this data source's raw files.
base_path = f"s3://sportsbar-bucket/{data_source}"


In [0]:

# Bronze ingest: read the raw CSV files for this data source and tag every row
# with the read time plus the source file's name and size (from the _metadata
# column) so each record can be traced back to its file.
products_raw_df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(base_path)
    .select(
        "*",
        F.current_timestamp().alias("read_timestamp"),
        F.col("_metadata.file_name").alias("file_name"),
        F.col("_metadata.file_size").alias("file_size"),
    )
)
products_raw_df.printSchema()
display(products_raw_df)

# Persist as the bronze table (bronze_schema is presumably defined by the
# utilities notebook run above -- confirm).
# Change data feed is a Delta table property, so the writer option must carry
# the "delta." prefix, exactly as the silver and gold writes do; the bare
# "enableChangeDataFeed" key does not appear to be honoured.
(
    products_raw_df.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")
)


In [0]:
# Read the bronze table back so the cleaning steps start from persisted data.
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"
products_bronze_df = spark.read.table(bronze_table_name)
display(products_bronze_df)

In [0]:
# Cleaning rules applied to the bronze rows, in order:
#   1. keep one row per product_id
#   2. title-case category and correct the "protien" misspelling
#   3. variant = text after the first "(" in product_name, with ")" removed
#   4. correct the "protien" misspelling in product_name
#   5. product_id that is not purely digits becomes the sentinel "99999999"
#   6. product_name is renamed to product
typo_pattern = "(?i)protien"
typo_fix = "Protein"

category_clean = F.regexp_replace(F.initcap(F.col("category")), typo_pattern, typo_fix)
# Taken from product_name before its spelling fix is applied.
variant_clean = F.regexp_replace(F.split("product_name", r"\(")[1], r"\)", "")
product_name_clean = F.regexp_replace("product_name", typo_pattern, typo_fix)
product_id_clean = (
    F.when(F.col("product_id").rlike("^[0-9]+$"), F.col("product_id"))
    .otherwise("99999999")
)

cleaned_df = (
    products_bronze_df
    .dropDuplicates(subset=["product_id"])
    .withColumn("category", category_clean)
    .withColumn("variant", variant_clean)
    .withColumn("product_name", product_name_clean)
    .withColumn("product_id", product_id_clean)
    .withColumnRenamed("product_name", "product")
)
display(cleaned_df)

In [0]:
# Silver enrichment: derive the reporting division from the cleaned category
# and add product_code, a SHA-256 hash of the product name.
# Categories not listed below (including NULL) fall into "Other".
df_silver = (
    cleaned_df
    .withColumn(
        "division",
        F.when(F.col("category") == "Energy Bars", "Nutrition Bars")
        # The cleaning step has already rewritten "protien" to "Protein" in
        # category, so the lookup has to use the corrected spelling; matching
        # on the misspelt value can never succeed.
        .when(F.col("category") == "Protein Bars", "Nutrition Bars")
        .when(F.col("category") == "Granola & Cereals", "Breakfast Foods")
        .when(F.col("category") == "Recovery Dairy", "Dairy & Recovery")
        .when(F.col("category") == "Healthy Snacks", "Healthy Snacks")
        .when(F.col("category") == "Electrolyte Mix", "Hydration & Electrolytes")
        .otherwise("Other"),
    )
    .withColumn("product_code", F.sha2(F.col("product").cast("string"), 256))
)

display(df_silver)

In [0]:
# Persist the enriched rows as the silver table, replacing previous contents.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable(f"{catalog}.{silver_schema}.{data_source}")

In [0]:
# Reload silver from the persisted table and keep only the dimension columns.
df_silver = spark.table(f"{catalog}.{silver_schema}.{data_source}")
gold_columns = ["product_code", "product_id", "division", "category", "product", "variant"]
df_gold = df_silver.select(*gold_columns)


In [0]:
# Write the sb_dim_<data_source> gold table (full overwrite) and show the
# stored result.
sb_dim_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(sb_dim_table_name)
)

display(spark.table(sb_dim_table_name))

### Merging parent and child tables


In [0]:
# Target of the merge: the dim_<data_source> table, which must already exist.
delta_table = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_{data_source}")

# Source of the merge: the sb_dim_<data_source> rows without product_id.
# product_code is a hash of the product name, while upstream de-duplication is
# keyed on product_id, so two ids sharing a cleaned name yield the same
# product_code. MERGE fails when several source rows match one target row (and
# would insert duplicate keys on a first load), so keep one row per merge key.
df_gold = spark.sql(
    f"SELECT product_code,division,category,product,variant From {catalog}.{gold_schema}.sb_dim_{data_source}"
).dropDuplicates(["product_code"])


In [0]:
# Upsert the source rows into the target dimension on product_code: matched
# rows are updated and unmatched rows inserted, using the same column-to-source
# assignments in both branches.
merge_columns = ["product_code", "division", "category", "product", "variant"]
source_assignments = {column_name: f"source.{column_name}" for column_name in merge_columns}

(
    delta_table.alias("target")
    .merge(df_gold.alias("source"), "target.product_code=source.product_code")
    .whenMatchedUpdate(set=source_assignments)
    .whenNotMatchedInsert(values=source_assignments)
    .execute()
)

In [0]:
# Show the dimension table as stored after the merge.
merged_dim_df = spark.table(f"{catalog}.{gold_schema}.dim_{data_source}")
display(merged_dim_df)