In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable

In [0]:
%run /Workspace/consolidated_pipeline/1_setup/utilities

In [0]:
# Echo the layer schema names as a quick sanity check before using them.
# NOTE(review): these names are presumably defined by the utilities notebook
# pulled in via %run earlier -- confirm.
layer_schemas = (bronze_schema, silver_schema, gold_schema)
print(*layer_schemas)

In [0]:
# Declare the notebook's text widgets as (name, default value, label) triples.
for widget_name, default_value, widget_label in (
    ("catalog_name", "fmcg", "Catalog"),
    ("data_source", "products", "Data Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Read the current widget values; they drive every table name below.
catalog_name, data_source = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog_name", "data_source")
)

# Glob matching every CSV file under the data source's folder in the bucket.
base_path = f"s3://ag-sportsbar/{data_source}/*.csv"
print(base_path)

In [0]:
# File-level metadata columns carried along with each ingested row.
source_file_columns = [
    "_metadata.file_name",
    "_metadata.file_size",
    "_metadata.file_path",
]

# Read the CSV files (first row is the header, column types are inferred).
raw_csv = (
    spark.read.option("header", "true").option("inferSchema", "true").csv(base_path)
)

# Stamp each row with the ingestion time, then append the file metadata.
df = raw_csv.withColumn("_ingested_at", F.current_timestamp()).select(
    "*", *source_file_columns
)
display(df.limit(5))

In [0]:
# Inspect the column names and the types inferred while reading the CSV files.
df.printSchema()

In [0]:
# Overwrite the bronze Delta table for this data source with the raw rows.
bronze_table = f"{catalog_name}.{bronze_schema}.{data_source}"
bronze_writer = (
    df.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
bronze_writer.saveAsTable(bronze_table)

### Silver

In [0]:
# Reload the bronze table so the silver stage starts from the persisted data.
bronze_table_name = f"{catalog_name}.{bronze_schema}.{data_source}"
df_bronze = spark.read.table(bronze_table_name)
display(df_bronze.limit(5))

In [0]:
# List every product_id that occurs more than once in bronze.
product_id_counts = df_bronze.groupBy("product_id").count()
product_id_counts.filter("count > 1").show()

In [0]:
# Keep a single row per product_id.
df_silver = df_bronze.dropDuplicates(subset=["product_id"])

# Re-run the duplicate check; an empty result confirms the keys are unique.
remaining_duplicates = (
    df_silver.groupBy("product_id").count().filter("count > 1")
)
remaining_duplicates.show()

In [0]:
# Inspect the distinct category values before cleaning them.
distinct_categories = df_silver.select("category").distinct()
distinct_categories.show()

In [0]:
# Normalise category casing (first letter of each word upper-cased).
# initcap already yields NULL for a NULL input, so no explicit
# when(isNull)/otherwise guard is needed around it.
df_silver = df_silver.withColumn("category", F.initcap(F.col("category")))

# Re-check the distinct values after normalisation.
df_silver.select("category").distinct().show()

In [0]:
# Correct the "Protien" misspelling (case-insensitive match) in both text
# columns that can contain it.
for text_column in ("product_name", "category"):
    df_silver = df_silver.withColumn(
        text_column,
        F.regexp_replace(F.col(text_column), "(?i)Protien", "Protein"),
    )
display(df_silver.limit(5))

In [0]:
# Add division column: map each category to its parent division. Categories
# that match none of the branches (including NULL) fall through to "Other".
df_silver = df_silver.withColumn(
    "division",
    F.when(F.col("category") == "Energy Bars", "Nutrition Bars")
    .when(F.col("category") == "Protein Bars", "Nutrition Bars")
    .when(F.col("category") == "Granola & Cereals", "Breakfast Foods")
    .when(F.col("category") == "Recovery Dairy", "Dairy & Recovery")
    .when(F.col("category") == "Healthy Snacks", "Healthy Snacks")
    .when(F.col("category") == "Electrolyte Mix", "Hydration & Electrolytes")
    .otherwise("Other"),
)

# Variant = the text inside the first pair of parentheses in the product name.
# The trailing 1 is the capture-group index, so this must be regexp_extract
# (regexp_replace expects a replacement string there, not a group number).
# regexp_extract returns an empty string when the name has no parentheses.
df_silver = df_silver.withColumn(
    "variant", F.regexp_extract(F.col("product_name"), r"\((.*?)\)", 1)
)

df_silver = (
    # product_code is the SHA-256 hash of the ORIGINAL product_id; it is computed
    # before product_id is sanitised below. The explicit string cast keeps sha2
    # valid even if schema inference typed the column as numeric.
    df_silver.withColumn(
        "product_code", F.sha2(F.col("product_id").cast("string"), 256)
    )
    .withColumn(
        "product_id",
        F.when(
            # rlike() matches the value against a regular expression; here it
            # accepts only ids made up entirely of digits.
            F.col("product_id").cast("string").rlike("^[0-9]+$"),
            F.col("product_id").cast("string"),
        ).otherwise(F.lit(999999).cast("string")),  # placeholder for invalid ids
    )
    .withColumnRenamed("product_name", "product")
)

In [0]:
# Review the silver rows after the division, variant and product_code columns were added.
display(df_silver)

In [0]:
# Final silver layout: business columns first, ingestion metadata last.
silver_columns = [
    "product_code",
    "division",
    "category",
    "product",
    "variant",
    "product_id",
    "_ingested_at",
    "file_name",
    "file_size",
]
df_silver = df_silver.select(*silver_columns)

In [0]:
# Review the silver rows in their final column order before writing them out.
display(df_silver)

In [0]:
# Overwrite the silver Delta table for this data source with the cleaned rows.
silver_table = f"{catalog_name}.{silver_schema}.{data_source}"
silver_writer = (
    df_silver.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
silver_writer.saveAsTable(silver_table)

### Gold

In [0]:
# Reload silver from the persisted table, then keep only the dimension columns.
silver_table_name = f"{catalog_name}.{silver_schema}.{data_source}"
df_silver = spark.sql(f"SELECT * FROM {silver_table_name}")

gold_columns = ["product_code", "division", "category", "product", "variant"]
df_gold = df_silver.select(*gold_columns)
display(df_gold.limit(5))

In [0]:
# Overwrite the gold dimension table (prefixed "sb_dim_") for this data source.
gold_table = f"{catalog_name}.{gold_schema}.sb_dim_{data_source}"
gold_writer = (
    df_gold.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
gold_writer.saveAsTable(gold_table)

### Merging Data Source with Parent

In [0]:
# Build the table names from the same widget/schema variables the gold write
# uses, so the merge always reads the child table that was just written rather
# than a hard-coded "fmcg.gold" location.
# NOTE(review): with the default widget values this resolves to the previous
# hard-coded names only if gold_schema == "gold" -- confirm in the utilities notebook.
gold_namespace = f"{catalog_name}.{gold_schema}"
parent_table_name = f"{gold_namespace}.dim_{data_source}"
child_table_name = f"{gold_namespace}.sb_dim_{data_source}"

# Parent dimension table that receives the merge.
delta_table = DeltaTable.forName(spark, parent_table_name)

# Child rows to merge into the parent.
df_child_products = spark.sql(
    f"SELECT product_code, division, category, product, variant FROM {child_table_name}"
)
df_child_products.show(5)

In [0]:
# Attribute columns copied from source to target on both update and insert.
attribute_columns = ["division", "category", "product", "variant"]
update_assignments = {
    column_name: f"source.{column_name}" for column_name in attribute_columns
}

# Inserts also need the key itself; updates do not, because the join
# condition already guarantees the keys are equal on a match.
insert_assignments = {"product_code": "source.product_code", **update_assignments}

# Upsert: update attributes of matching product codes, insert the rest.
upsert = (
    delta_table.alias("target")
    .merge(
        source=df_child_products.alias("source"),
        condition="target.product_code = source.product_code",
    )
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
)
upsert.execute()