In [0]:
from pyspark.sql import functions as F, types as T
from delta.tables import DeltaTable

In [0]:
%run /Workspace/consolidated_pipeline/1_setup/utilities

In [0]:
# Echo the schema names that the bronze/silver/gold table names below are built from.
# NOTE(review): these three names are not defined in this notebook — presumably
# they are provided by the utilities notebook run above; confirm.
print(bronze_schema, silver_schema, gold_schema)

In [0]:
# Notebook parameters: text widgets for the target catalog and the data source.
# dbutils.widgets.text takes (name, default value, label); only the label is
# shown to the user, the name is what dbutils.widgets.get() looks up.
# NOTE(review): the trailing "2 --> 3 (folder name) --> 1" note on the next line
# is kept from the original; its meaning is unclear — confirm or remove.
dbutils.widgets.text("catalog", "fmcg", "Catalog")  # 2 --> 3 (folder name) --> 1
dbutils.widgets.text("data_source", "customers", "Data Source")

In [0]:
# Resolve the run parameters from the widgets and derive the CSV input glob
# for the selected data source.
catalog, data_source = (
    dbutils.widgets.get("catalog"),
    dbutils.widgets.get("data_source"),
)
base_path = f"s3://ag-sportsbar/{data_source}/*.csv"

# Echo the resolved parameters and the input location.
print(catalog, data_source)
print(base_path)

In [0]:
# Read every CSV matched by base_path, using the first line as the header and
# letting Spark infer the column types.
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
raw_csv = csv_reader.csv(base_path)

# Add audit columns: the ingestion timestamp and the file each row came from.
df = raw_csv.withColumn("_ingested_at", F.current_timestamp()).withColumn(
    "_source_file", F.col("_metadata.file_path")
)

# Preview a small sample only.
display(df.limit(10))

In [0]:
# Persist the raw ingest as the bronze Delta table, overwriting previous contents.
# Change Data Feed (CDF) is enabled so that changes can be tracked at the row level.
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
bronze_writer = (
    df.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
bronze_writer.saveAsTable(bronze_table)

### Silver Processing

In [0]:
# Load the bronze table via the table reader instead of interpolating the
# widget-supplied names into a SQL string: same rows, no hand-built query text.
df_bronze = spark.read.table(f"{catalog}.{bronze_schema}.{data_source}")
df_bronze.show(10, truncate=False)

In [0]:
# Inspect the column names and types of the bronze table (types were inferred
# from the CSV at ingestion).
df_bronze.printSchema()

In [0]:
# Duplicate check: list every customer_id that appears on more than one bronze row.
rows_per_customer_id = df_bronze.groupBy("customer_id").count()
rows_per_customer_id.filter(F.col("count") > 1).show()

In [0]:
# Start the silver frame from bronze, keeping a single row per customer_id.
# NOTE(review): no ordering is applied before de-duplicating, so which of the
# duplicate rows survives is presumably arbitrary — confirm this is acceptable.
dedup_keys = ["customer_id"]
df_silver = df_bronze.dropDuplicates(dedup_keys)

# Optional sanity check (should list no rows):
# df_silver.groupBy("customer_id").count().filter("count > 1").show()

In [0]:
# Compare row counts to see how many rows the de-duplication removed.
rows_before = df_bronze.count()
rows_after = df_silver.count()
print(f"Rows before dropping the duplicates: {rows_before}")
print(f"Rows after dropping the duplicates: {rows_after}")

In [0]:
# Show rows whose customer_name differs from its trimmed form, i.e. names that
# carry leading or trailing spaces.
name_col = F.col("customer_name")
display(df_silver.filter(name_col != F.trim(name_col)))

In [0]:
# Strip leading and trailing spaces from customer_name.
trimmed_name = F.trim(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", trimmed_name)

In [0]:
# Repeat the padded-name check; once customer_name has been trimmed this is
# expected to display no rows.
has_padding = F.col("customer_name") != F.trim(F.col("customer_name"))
padded_rows = df_silver.filter(has_padding)
display(padded_rows)

In [0]:
# List the distinct customer names in full (no truncation) to spot casing
# inconsistencies.
distinct_names = df_silver.select("customer_name").distinct()
distinct_names.show(truncate=False)

In [0]:
# Normalise the casing of customer_name with initcap (first letter of each word
# upper-cased, the rest lower-cased).
title_cased_name = F.initcap(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", title_cased_name)

In [0]:
# Re-list the distinct customer names to review the result of the casing fix.
names_after_casing = df_silver.select("customer_name").distinct()
names_after_casing.show(truncate=False)

In [0]:
# List the distinct city values to find misspellings and other variants.
distinct_cities = df_silver.select("city").distinct()
distinct_cities.show()

In [0]:
# Known misspellings of each city, mapped to the canonical spelling.
city_mapping = {
    "Bengaluruu": "Bengaluru",
    "Bengalore": "Bengaluru",
    "Hyderabadd": "Hyderabad",
    "Hyderbad": "Hyderabad",
    "NewDelhi": "New Delhi",
    "NewDheli": "New Delhi",
    "NewDelhee": "New Delhi",
}

# The only city values kept after normalisation.
allowed = ["Bengaluru", "Hyderabad", "New Delhi"]

# First rewrite the known misspellings, then keep the city only when it is one
# of the allowed values. when() without otherwise() yields NULL for every
# unmatched row (including rows whose city is already NULL), which is exactly
# what the previous explicit NULL branches produced.
df_silver = df_silver.replace(city_mapping, subset="city").withColumn(
    "city",
    F.when(F.col("city").isin(allowed), F.col("city")),
)
df_silver.select("city").show(10)

In [0]:
# Re-list the distinct city values to review the result of the normalisation.
cities_after_fix = df_silver.select("city").distinct()
cities_after_fix.show()

In [0]:
# Show the rows that are left without a city.
city_missing = F.col("city").isNull()
df_silver.filter(city_missing).show()

In [0]:
# Customer names under review.
# NOTE(review): presumably these are the customers found with a NULL city by
# the preceding check — confirm.
null_customer_names = [
    "Sprintx Nutrition",
    "Zenathlete Foods",
    "Primefuel Nutrition",
    "Recovery Lane",
]

# Show every row belonging to one of those customers.
is_under_review = F.col("customer_name").isin(null_customer_names)
df_silver.filter(is_under_review).show()

In [0]:
# Business Confirmation Note: City corrections confirmed by business team
# Keyed by customer_id; the trailing comment names the customer.
customer_city_fix = {
    789403: "New Delhi",  # Sprintx Nutrition
    789420: "Bengaluru",  # Zenathlete Foods
    789521: "Hyderabad",  # Primefuel Nutrition
    789603: "Hyderabad",  # Recovery Lane
}

# Turn the corrections into a two-column lookup frame.
fix_rows = list(customer_city_fix.items())
df_fix = spark.createDataFrame(fix_rows, ["customer_id", "fixed_city"])

# Left join: every silver row is kept and gains a fixed_city column, which is
# NULL for customers without a correction.
df_silver = df_silver.join(df_fix, on="customer_id", how="left")
df_silver.show(5)

In [0]:
# Fill in the city from fixed_city. coalesce returns its first non-null
# argument, so an existing city is kept and fixed_city is used only when city
# is NULL. The helper column is dropped afterwards.
resolved_city = F.coalesce(F.col("city"), F.col("fixed_city"))
df_silver = df_silver.withColumn("city", resolved_city).drop("fixed_city")
df_silver.show()

In [0]:
# sanity check: re-display the rows of the customers listed in
# null_customer_names. The list is declared once, in the earlier cell that
# first inspects these customers, and reused here instead of being re-declared
# with identical contents.
df_silver.filter(F.col("customer_name").isin(null_customer_names)).show()

In [0]:
# Inspect the silver frame's column names and types at this point in the cleanup.
df_silver.printSchema()

In [0]:
# Store customer_id as a string, then print the schema to confirm the new type.
customer_id_as_string = F.col("customer_id").cast("string")
df_silver = df_silver.withColumn("customer_id", customer_id_as_string)
df_silver.printSchema()

In [0]:
# "customer" label: customer_name and city joined with "-", using the literal
# "unknown" when city is NULL. concat_ws(separator, *cols) concatenates the
# columns with the given separator.
city_or_unknown = F.coalesce(F.col("city"), F.lit("unknown"))
df_silver = df_silver.withColumn(
    "customer", F.concat_ws("-", "customer_name", city_or_unknown)
)

# Constant attributes, added in this order: every row gets the same value.
constant_columns = {
    "market": "India",
    "platform": "Sports Bar",
    "channel": "Acquisition",
}
for column_name, constant_value in constant_columns.items():
    df_silver = df_silver.withColumn(column_name, F.lit(constant_value))

df_silver.show(truncate=False)

In [0]:
# Persist the cleaned frame as the silver Delta table, overwriting previous
# contents, with Change Data Feed enabled and schema merging allowed.
silver_table = f"{catalog}.{silver_schema}.{data_source}"
silver_writer = (
    df_silver.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable(silver_table)

### Gold

In [0]:
# Re-load the silver table via the table reader instead of interpolating the
# widget-supplied names into a SQL string: same rows, no hand-built query text.
df_silver = spark.read.table(f"{catalog}.{silver_schema}.{data_source}")
display(df_silver.limit(5))

In [0]:
# Gold keeps only the columns listed here, in this order; everything else in
# silver (e.g. the ingestion audit columns) is left out.
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver.select(*gold_columns)

In [0]:
# Persist the gold dimension as a Delta table named sb_dim_<data_source>,
# overwriting previous contents, with Change Data Feed enabled and schema
# merging allowed.
gold_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
gold_writer = (
    df_gold.write.format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
gold_writer.saveAsTable(gold_table)

### Merging Data Source with Parent

In [0]:
# Merge target: the parent dimension table dim_<data_source> in the gold schema.
delta_table = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_{data_source}")

# Merge source: the sb_dim_<data_source> table written by this notebook. The
# name is built from the same catalog / schema / data_source values used when
# writing it, rather than being hard-coded, so non-default widget values read
# the table that was actually produced. customer_id is exposed as
# customer_code, the column the merge condition matches on.
df_child_customers = spark.table(
    f"{catalog}.{gold_schema}.sb_dim_{data_source}"
).select(
    F.col("customer_id").alias("customer_code"),
    "customer",
    "market",
    "platform",
    "channel",
)

In [0]:
# Upsert the child rows into the parent dimension, matching on customer_code:
# matched target rows are updated from the source, unmatched source rows are
# inserted.
merge_builder = delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code = source.customer_code",
)
merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()