In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
%run /Workspace/Users/amank0639@gmail.com/fmcg_repo/consolidated_pipeline/1_Setup/utilities

In [0]:
# Declare the notebook's text widgets together with their default values.
widget_defaults = (
    ("Data_Source", "gross_price"),
    ("Catalog", "fmcg"),
)
for widget_name, default_value in widget_defaults:
    dbutils.widgets.text(widget_name, default_value)

In [0]:
# Resolve the current widget values; both are used to build paths and table names below.
data_source, catalog = (
    dbutils.widgets.get("Data_Source"),
    dbutils.widgets.get("Catalog"),
)

In [0]:
# Glob pattern matching every .csv file directly under the selected data source's
# folder in the bucket; the folder name comes from the Data_Source widget.
base_path=f"s3://sportsbar-bucket/{data_source}/*.csv"

In [0]:
# Configure the CSV reader: first row is the header, column types are inferred.
csv_reader = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
)

# Load every file matched by base_path and stamp each row with the ingestion time.
loaded_df = csv_reader.load(base_path)
stamped_df = loaded_df.withColumn("read_timestamp", F.current_timestamp())

# Keep all data columns and append the source file's name and size from _metadata.
raw_df = stamped_df.select("*", "_metadata.file_name", "_metadata.file_size")
display(raw_df)

In [0]:
# Persist the raw rows as the bronze Delta table, replacing any previous contents.
# NOTE(review): bronze_schema is not defined in this notebook - presumably it comes
# from the utilities notebook pulled in via %run; confirm.
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"
(
    raw_df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(bronze_table_name)
)

### Silver Processing

In [0]:
# Re-read the bronze table so the silver steps start from what was actually persisted.
bronze_source_table = f"{catalog}.{bronze_schema}.{data_source}"
bronze_df = spark.read.table(bronze_source_table)

In [0]:
# Date layouts tried in order; the first one that parses wins, otherwise the result is null.
month_formats = ("yyyy/MM/dd", "yyyy-MM-dd", "dd/MM/yyyy", "dd-MM-yyyy")
parsed_month = F.coalesce(
    *[F.try_to_date(F.col("month"), date_format) for date_format in month_formats]
)

# Overwrite the "month" column with its parsed date value.
silver_df = bronze_df.withColumn("month", parsed_month)
display(silver_df)

In [0]:
# Step 1: any price containing a letter is replaced by the string "0".
price_col = F.col("gross_price")
letters_to_zero = F.when(price_col.rlike("[a-zA-Z]"), F.lit("0")).otherwise(price_col)
silver_df = silver_df.withColumn("gross_price", letters_to_zero)

# Step 2: cast to double and take the absolute value so negative prices become positive.
absolute_price = F.abs(F.col("gross_price").cast("double"))
silver_df = silver_df.withColumn("gross_price", absolute_price)
display(silver_df)


In [0]:
# Silver products table, used to look up product_code for each price row.
products_df = spark.read.table(f"{catalog}.{silver_schema}.products")

# Default (inner) join on product_id: price rows without a matching product are dropped.
price_rows = silver_df.alias("t1")
product_rows = products_df.alias("t2")
same_product = F.col("t1.product_id") == F.col("t2.product_id")

joined_df = (
    price_rows
    .join(product_rows, same_product)
    .select(
        F.col("t1.product_id"),
        F.col("t2.product_code"),
        F.col("t1.month"),
        F.col("t1.gross_price"),
        F.col("t1.read_timestamp"),
        F.col("t1.file_name"),
        F.col("t1.file_size"),
    )
)

display(joined_df)


In [0]:
# Persist the cleaned, product-enriched rows as the silver Delta table (full overwrite).
# NOTE(review): silver_schema is not defined in this notebook - presumably supplied by
# the utilities notebook pulled in via %run; confirm.
silver_table_name = f"{catalog}.{silver_schema}.{data_source}"
(
    joined_df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .saveAsTable(silver_table_name)
)

### Gold Processing

In [0]:
# Start the gold stage from the persisted silver table.
silver_df = spark.read.table(f"{catalog}.{silver_schema}.{data_source}")

month_col = F.col("month")
price_col = F.col("gross_price")

# Derived columns: calendar year of the month, and a 1/0 flag marking zero prices.
year_col = F.year(month_col).alias("year")
zero_flag = F.when(price_col == 0, 1).otherwise(0).alias("is_zero")

gold_df = silver_df.select(
    F.col("product_code"),
    month_col,
    price_col,
    year_col,
    zero_flag,
)

display(gold_df)

In [0]:

# Pick exactly one price per (product_code, year): prefer non-zero prices, then the
# latest month of that year.
#
# The trailing sort keys are tiebreakers. Without them, rows that share the same
# is_zero flag and calendar month (duplicate source rows, or two dates inside one
# month) are indistinguishable to the window ordering.
window1 = (
    Window
    .partitionBy(F.col("product_code"), F.col("year"))
    .orderBy(
        F.col("is_zero").asc(),
        F.month(F.col("month")).desc(),
        F.col("month").desc(),        # latest date within the same calendar month
        F.col("gross_price").desc(),  # makes the pick deterministic for same-date rows
    )
)

# row_number() (rather than rank()) guarantees a single winner per partition:
# rank() assigns 1 to every tied row, which would emit several rows for one
# (product_code, year).
gold_df = (
    gold_df
    .withColumn("rank", F.row_number().over(window1))
    .filter("rank=1")
    # The select below keeps only the output columns, so the helper "rank"
    # column needs no separate drop.
    .select(
        F.col("product_code"),
        F.col("gross_price").alias("price_inr"),
        F.col("year"),
    )
)

display(gold_df)

In [0]:
# Persist the yearly prices as the "sb_dim_" gold table (full overwrite); it is read
# back as the merge source in the next section.
# NOTE(review): gold_schema is not defined in this notebook - presumably supplied by
# the utilities notebook pulled in via %run; confirm.
gold_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(gold_table_name)
)

### Merging with Parent Table

In [0]:
# Merge target: the parent "dim_" table, looked up by name in the gold schema.
parent_table_name = f"{catalog}.{gold_schema}.dim_{data_source}"
deltaTable = DeltaTable.forName(spark, parent_table_name)

# Merge source: the "sb_dim_" table written by the gold stage above.
staging_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
gold_df = spark.read.table(staging_table_name)




In [0]:
# Upsert the staged yearly prices into the parent dimension table.
#
# The source holds one row per (product_code, year), so the match condition must
# use both columns. Matching on product_code alone lets several source rows (one
# per year) hit the same target row - which Delta rejects as an ambiguous merge -
# and would overwrite another year's price with the wrong year's value.
merge_condition = (
    "target.product_code = source.product_code AND target.year = source.year"
)

(
    deltaTable.alias("target")
    .merge(gold_df.alias("source"), merge_condition)
    # On a match the key columns are already equal, so only the price is refreshed.
    .whenMatchedUpdate(
        set={
            "price_inr": "source.price_inr",
        }
    )
    # New (product_code, year) combinations are inserted as-is.
    .whenNotMatchedInsert(
        values={
            "product_code": "source.product_code",
            "price_inr": "source.price_inr",
            "year": "source.year",
        }
    )
    .execute()
)