In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/amank0639@gmail.com/fmcg_repo/consolidated_pipeline/1_Setup/utilities

In [0]:
# Declare the notebook's two text widgets together with their default values.
for widget_name, default_value in (("Data_Source", "orders"), ("Catalog", "fmcg")):
    dbutils.widgets.text(widget_name, default_value)

In [0]:
# Pull the current widget values into the names used by the rest of the notebook.
catalog = dbutils.widgets.get("Catalog")
data_source = dbutils.widgets.get("Data_Source")

In [0]:
# Storage locations for this data source: the CSV read below loads from
# landing/, and a later cell moves the landing entries to processed/.
base_path = f"s3://sportsbar-bucket/{data_source}"
landing_path, processed_path = f"{base_path}/landing", f"{base_path}/processed"

for location in (base_path, landing_path, processed_path):
    print(location)

# Fully qualified table names, one per layer.
# bronze_schema / silver_schema / gold_schema are not defined in this notebook;
# presumably they come from the utilities notebook pulled in via %run.
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
silver_table = f"{catalog}.{silver_schema}.{data_source}"
gold_table = f"{catalog}.{gold_schema}.sb_fact_{data_source}"



In [0]:
# Load the CSV data under the landing path (header row, inferred column types),
# stamp each row with the read time and append the source file's name and size
# taken from the hidden _metadata column.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
df = (
    csv_reader
    .load(landing_path)
    .withColumn("read_timestamp", F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

display(df)

In [0]:
# Append the rows read from landing to the bronze Delta table.
bronze_writer = (
    df.write
    .format("delta")
    .mode("append")
    .option("delta.enableChangeDataFeed", "true")
)
bronze_writer.saveAsTable(bronze_table)

In [0]:
# Move every entry currently in the landing folder to the processed folder,
# keeping each entry's name.
# NOTE(review): the listing is taken here rather than when the CSVs were read,
# so an entry that arrives between the read and this cell would be moved as
# well — confirm whether that can happen for this pipeline.
files = dbutils.fs.ls(landing_path)

for landed in files:
    destination = f"{processed_path}/{landed.name}"
    # Third positional argument is the recurse flag of dbutils.fs.mv.
    dbutils.fs.mv(landed.path, destination, True)

In [0]:
# Re-read the bronze table and keep only the rows that carry an order quantity.
has_order_qty = F.col("order_qty").isNotNull()
bronze_df = spark.read.table(bronze_table).filter(has_order_qty)
display(bronze_df)

In [0]:
# Keep customer_id only when it consists entirely of digits; any other value
# falls through to the "999999" placeholder. The result is cast to string.
customer_id_col = F.col("customer_id")
is_all_digits = customer_id_col.rlike("^[0-9]+$")
cleaned_customer_id = (
    F.when(is_all_digits, customer_id_col)
    .otherwise("999999")
    .cast("string")
)

bronze_df = bronze_df.withColumn("customer_id", cleaned_customer_id)
display(bronze_df)

In [0]:
# Remove a leading run of letters followed by a comma and optional whitespace
# from order_placement_date (presumably a weekday-name prefix), leaving the
# rest of the value for the date parsing that follows.
leading_word_pattern = r"^[A-Za-z]+,\s*"
bronze_df = bronze_df.withColumn(
    "order_placement_date",
    F.regexp_replace(F.col("order_placement_date"), leading_word_pattern, ""),
)

display(bronze_df)

In [0]:
# Parse order_placement_date by trying each supported format in order; the
# first attempt that yields a date wins, and a value that matches none of the
# formats ends up NULL.
date_formats = ["yyyy/MM/dd", "dd-MM-yyyy", "dd/MM/yyyy", "MMMM dd, yyyy"]
parse_attempts = [
    F.try_to_date(F.col("order_placement_date"), date_format)
    for date_format in date_formats
]
bronze_df = bronze_df.withColumn("order_placement_date", F.coalesce(*parse_attempts))
display(bronze_df)

In [0]:
# Collapse rows that agree on every one of these columns into a single row.
dedup_columns = ["order_id", "order_placement_date", "customer_id", "product_id", "order_qty"]
bronze_df = bronze_df.dropDuplicates(dedup_columns)
display(bronze_df)

In [0]:
# Product reference data, joined to the orders further down to pick up product_code.
# The table name is built from the Catalog widget and silver_schema so this
# lookup follows the same catalog/schema as the other tables in the notebook
# instead of a hard-coded "fmcg.silver".
# NOTE(review): assumes silver_schema names the schema that holds `products`
# (it was "silver" in the hard-coded name) — confirm against the utilities notebook.
products_table = f"{catalog}.{silver_schema}.products"
products_df = spark.read.table(products_table)
display(products_df)

In [0]:
# Attach product_code to each order row. The join is inner, so an order whose
# product_id has no match in the products table is dropped.
orders_side = bronze_df.alias("t1")
products_side = products_df.alias("t2")
same_product = F.col("t1.product_id") == F.col("t2.product_id")

joined_df = (
    orders_side
    .join(products_side, on=same_product, how="inner")
    .select(F.col("t1.*"), F.col("t2.product_code"))
)
display(joined_df)

In [0]:
# Publish the cleaned orders to the silver table: create the table on the first
# run, otherwise upsert the rows on the key columns below.
silver_key_columns = ["order_id", "order_placement_date", "customer_id", "product_code"]

if not spark.catalog.tableExists(silver_table):
    (
        joined_df.write
        .format("delta")
        .mode("overwrite")
        .option("delta.enableChangeDataFeed", "true")
        .option("mergeSchema", "true")
        .saveAsTable(silver_table)
    )
else:
    # Keys are compared with the null-safe operator `<=>`. With plain `=`, a row
    # whose order_placement_date is NULL (no date format parsed) can never match
    # its existing copy, so it would be inserted again on every run because the
    # whole bronze table is re-read each time.
    # NOTE(review): as with `=`, the merge fails if several incoming rows match
    # the same silver row; the upstream dropDuplicates also keys on order_qty, so
    # rows differing only in quantity can still collide — confirm the intended key.
    silver_merge_condition = " AND ".join(
        f"silver.{key} <=> bronze.{key}" for key in silver_key_columns
    )

    deltaTable = DeltaTable.forName(spark, silver_table)
    (
        deltaTable.alias("silver")
        .merge(joined_df.alias("bronze"), silver_merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

### Gold processing

In [0]:
# Project the silver orders onto the gold fact layout, renaming the date,
# customer and quantity columns to their gold names.
gold_columns = [
    F.col("order_id"),
    F.col("order_placement_date").alias("date"),
    F.col("customer_id").alias("customer_code"),
    F.col("product_code"),
    F.col("product_id"),
    F.col("order_qty").alias("sold_quantity"),
]
gold_df = spark.read.table(silver_table).select(*gold_columns)
display(gold_df)

In [0]:
# Publish the gold fact rows: create the table on the first run, otherwise
# upsert the rows on the key columns below.
gold_key_columns = ["date", "order_id", "customer_code", "product_code"]

if not spark.catalog.tableExists(gold_table):
    (
        gold_df.write
        .format("delta")
        .mode("overwrite")
        .option("delta.enableChangeDataFeed", "true")
        .option("mergeSchema", "true")
        .saveAsTable(gold_table)
    )
else:
    # "target" is the existing gold table and "updates" the rows coming from
    # silver. Keys are compared with the null-safe operator `<=>`: with plain
    # `=`, a row whose date is NULL never matches its existing copy and would be
    # inserted again on every run, since the whole silver table is re-read.
    gold_merge_condition = " AND ".join(
        f"target.{key} <=> updates.{key}" for key in gold_key_columns
    )

    deltaTable = DeltaTable.forName(spark, gold_table)
    (
        deltaTable.alias("target")
        .merge(gold_df.alias("updates"), gold_merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )



### Merge with Parent table

In [0]:
# Read from the gold fact table only the columns used for the merge into the
# parent table.
child_columns = ["date", "product_code", "customer_code", "sold_quantity"]
df_child = spark.read.table(gold_table).select(*[F.col(name) for name in child_columns])

display(df_child)

In [0]:
# Roll the rows up to month level: every date is replaced by the first day of
# its month, then sold_quantity is summed per (date, product_code, customer_code).
month_start = F.date_trunc("month", F.col("date")).cast("date")
monthly_total = F.sum("sold_quantity").alias("sold_quantity")

df_child = (
    df_child
    .withColumn("date", month_start)
    .groupBy("date", "product_code", "customer_code")
    .agg(monthly_total)
)
display(df_child)



In [0]:
# Upsert the monthly child totals into the parent fact table.
# The table name is built from the Catalog widget and gold_schema so it follows
# the same catalog/schema as gold_table instead of a hard-coded "fmcg.gold".
# NOTE(review): assumes gold_schema names the schema that holds `fact_orders`
# (it was "gold" in the hard-coded name) — confirm against the utilities notebook.
parent_table = f"{catalog}.{gold_schema}.fact_orders"
parent_key_columns = ["date", "product_code", "customer_code"]

# Keys are compared with the null-safe operator `<=>`: with plain `=`, a group
# whose date is NULL never matches the row inserted by an earlier run, so it
# would be inserted again every time this notebook runs.
parent_merge_condition = " AND ".join(
    f"child.{key} <=> parent.{key}" for key in parent_key_columns
)

deltaTable = DeltaTable.forName(spark, parent_table)
(
    deltaTable.alias("parent")
    .merge(df_child.alias("child"), parent_merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)