In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable

In [0]:
%run /Workspace/consolidated_pipeline/1_setup/utilities

In [0]:
# Sanity check: echo the three schema names used for the table writes below.
# NOTE(review): these names are not assigned anywhere in this notebook; presumably
# they are defined by the utilities notebook pulled in via `%run` — confirm.
print(bronze_schema, silver_schema, gold_schema)

In [0]:
# Declare the notebook parameters as text widgets: (name, default value, label).
for widget_name, widget_default, widget_label in (
    ("catalog_name", "fmcg", "Catalog"),
    ("data_source", "gross_price", "Data Source"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

In [0]:
# Read the run parameters from the notebook widgets.
catalog_name = dbutils.widgets.get("catalog_name")
data_source = dbutils.widgets.get("data_source")
print(catalog_name, data_source)

# The landing folder is derived from `data_source` so that the files read here
# always correspond to the tables written below (which are also named after
# `data_source`). With the widget default this is the same path as before:
# s3://ag-sportsbar/gross_price/*.csv
base_path = f"s3://ag-sportsbar/{data_source}/*.csv"
print(base_path)

In [0]:
# Load every CSV under `base_path` (header row, inferred column types), stamp
# each row with the ingestion time, and append the source-file metadata fields.
file_metadata_columns = ["_metadata.file_name", "_metadata.file_size", "_metadata.file_path"]
csv_reader = spark.read.option("header", True).option("inferSchema", "true")

df = (
    csv_reader.csv(base_path)
    .withColumn("_ingested_at", F.current_timestamp())
    .select("*", *file_metadata_columns)
)
display(df.limit(5))

In [0]:
# Persist the raw load as the bronze Delta table, replacing any previous contents.
bronze_table_name = f"{catalog_name}.{bronze_schema}.{data_source}"
(
    df.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .saveAsTable(bronze_table_name)
)

### Silver

In [0]:
# Re-read the bronze table that was just written and preview a few rows.
df_bronze = spark.table(f"{catalog_name}.{bronze_schema}.{data_source}")
display(df_bronze.limit(5))

In [0]:
# List the distinct raw `month` values to see which date layouts are present.
raw_month_values = df_bronze.select("month").distinct()
raw_month_values.show()

In [0]:
# Candidate layouts for the raw `month` values, listed in the order they are
# tried. The parsing below is driven by this list so the two cannot drift apart
# (previously the list was unused and the patterns were repeated by hand).
date_formats = ["yyyy-MM-dd", "dd-MM-yyyy", "yyyy/MM/dd", "dd/MM/yyyy"]

# Try each layout in turn; `coalesce` keeps the first one that parses. A value
# that matches none of the layouts ends up as NULL.
df_silver = df_bronze.withColumn(
    "month",
    F.coalesce(*[F.try_to_date(F.col("month"), fmt) for fmt in date_formats]),
)
df_silver.select("month").distinct().show()

In [0]:
# Clean `gross_price`:
#   * values that look like a plain (optionally negative) decimal number are
#     converted to their absolute value as a double;
#   * anything else (text, blanks, NULL) is replaced by 0.
# The value is cast to double rather than int so fractional prices are kept.
# The earlier int cast truncated positive prices (12.75 -> 12) while negative
# ones kept their decimals, and values between -1 and 0 collapsed to 0.
price_raw = F.col("gross_price")
price_as_double = price_raw.cast("double")
looks_numeric = price_raw.rlike(r"^-?\d+(\.\d+)?$")

df_silver = df_silver.withColumn(
    "gross_price",
    F.when(looks_numeric, F.abs(price_as_double)).otherwise(F.lit(0.0)),
)
df_silver.show(10)

In [0]:
# Look up `product_code` for each price row from the silver products table.
# The inner join drops price rows whose `product_id` has no match there.
df_products = spark.table(f"{catalog_name}.{silver_schema}.products")
product_keys = df_products.select("product_id", "product_code")

# Columns carried forward into the silver table, in output order.
silver_columns = [
    "product_id",
    "product_code",
    "month",
    "gross_price",
    "_ingested_at",
    "file_name",
    "file_size",
    "file_path",
]

df_joined = df_silver.join(product_keys, on="product_id", how="inner").select(
    *silver_columns
)
df_joined.show(5)

In [0]:
# Persist the cleaned, product-enriched rows as the silver Delta table,
# replacing any previous contents.
silver_table_name = f"{catalog_name}.{silver_schema}.{data_source}"
(
    df_joined.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .saveAsTable(silver_table_name)
)

### Gold

In [0]:
# Re-read the silver table that was just written and preview a few rows.
df_silver = spark.table(f"{catalog_name}.{silver_schema}.{data_source}")
display(df_silver.limit(5))

In [0]:
# Derive two helper columns for the yearly price selection:
#   year    - calendar year of `month`
#   is_zero - 1 when the price is exactly 0, otherwise 0
year_of_month = F.year(F.col("month"))
zero_price_flag = F.when(F.col("gross_price") == 0, 1).otherwise(0)

df_gold = df_silver.withColumn("year", year_of_month).withColumn(
    "is_zero", zero_price_flag
)
display(df_gold.limit(5))

In [0]:
from pyspark.sql.window import Window

In [0]:
# Keep exactly one row per (product_code, year). Within each group, rows with a
# non-zero price sort first (is_zero ascending), then the most recent month
# first, so row 1 is the latest non-zero price when one exists.
W = Window.partitionBy("product_code", "year").orderBy(
    F.col("is_zero").asc(), F.col("month").desc()
)

ranked_prices = df_gold.withColumn("rank", F.row_number().over(W))
df_gold = ranked_prices.filter(F.col("rank") == 1).drop("rank")

In [0]:
# Render the de-duplicated gold rows for visual inspection before writing them.
display(df_gold)

In [0]:
# Persist the per-product, per-year prices as the gold Delta table for this
# data source, replacing any previous contents.
gold_table_name = f"{catalog_name}.{gold_schema}.sb_dim_{data_source}"
(
    df_gold.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .saveAsTable(gold_table_name)
)

### Merging Data Source with Parent

In [0]:
# Read the gold table back and reduce it to the three columns used by the
# merge, exposing `gross_price` under the name `price_inr`.
df_final = spark.table(f"{catalog_name}.{gold_schema}.sb_dim_{data_source}").select(
    "product_code",
    F.col("gross_price").alias("price_inr"),
    "year",
)
display(df_final.limit(5))

In [0]:
# Handle on the parent dimension table that receives the merge. The name is
# built from the same catalog / schema / data-source parameters as every other
# table in this notebook instead of being hard-coded; with the widget defaults
# and a gold schema named "gold" it resolves to fmcg.gold.dim_gross_price.
delta_table = DeltaTable.forName(
    spark, f"{catalog_name}.{gold_schema}.dim_{data_source}"
)
# Rows to upsert into the parent table.
df_child_table = df_final

In [0]:
# Upsert the child prices into the parent table.
# The source holds one row per (product_code, year), so both columns form the
# match key. Matching on product_code alone lets several source rows hit the
# same target row (which Delta rejects) and would overwrite every year's row
# for a product with a single year's price.
(
    delta_table.alias("target")
    .merge(
        source=df_child_table.alias("source"),
        condition=(
            "target.product_code = source.product_code "
            "AND target.year = source.year"
        ),
    )
    # Key columns already agree on a match, so only the price needs updating.
    .whenMatchedUpdate(set={"price_inr": "source.price_inr"})
    .whenNotMatchedInsert(
        values={
            "product_code": "source.product_code",
            "price_inr": "source.price_inr",
            "year": "source.year",
        }
    )
    .execute()
)