In [0]:
%run /Workspace/Users/amank0639@gmail.com/fmcg_repo/consolidated_pipeline/1_Setup/utilities

In [0]:
# Declare the notebook parameters as (widget name, default value) pairs.
for widget_name, default_value in (
    ("catalog", "test_catalog"),
    ("Data Source", "test_source"),
):
    dbutils.widgets.text(widget_name, default_value)

In [0]:
# Read the current values of the notebook parameters.
catalog = dbutils.widgets.get("catalog")
data_source = dbutils.widgets.get("Data Source")

# Glob covering every CSV file under this data source's folder in the bucket.
base_path = f"s3://sportsbar-bucket/{data_source}/*.csv"

In [0]:
from pyspark.sql.types import StructType,StructField,StringType
from pyspark.sql.functions import col,current_timestamp,count,initcap,trim,when,udf,coalesce,concat,lit
# Every column is read as a nullable string; no casting is applied at read time.
customer_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("customer_id", "customer_name", "city")
    ]
)

# CSV reader configured with a header row and the explicit schema above.
csv_reader = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(customer_schema)
)

# Load the files, stamp the read time, and keep the source file name and size
# from the `_metadata` column alongside the data columns.
df = (
    csv_reader
    .load(base_path)
    .withColumn("read_timestamp", current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)
display(df)

In [0]:
# Overwrite the bronze Delta table with the freshly read data.
# NOTE(review): `bronze_schema` is not defined in this notebook; presumably it
# comes from the `utilities` notebook pulled in via %run — confirm.
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("enableChangeDataFeed", "true")
    .saveAsTable(bronze_table_name)
)

### Silver Processing

In [0]:
# Load the bronze table written above as the input for the silver stage.
customers_df = spark.table(f"{catalog}.{bronze_schema}.{data_source}")

# Quality checks
# 1. Check the schema.
customers_df.printSchema()

# 2. Check for duplicate data: customer_ids that occur more than once.
duplicate_cid = (
    customers_df
    .groupBy("customer_id")
    .agg(count("customer_id").alias("count"))
    .filter(col("count") > 1)
)
display(duplicate_cid)

# 3. Check the customer_name column: rows whose name differs from its trimmed form.
check_customer_name_col = customers_df.filter("customer_name!=trim(customer_name)")
# Show the result of this check. The previous code displayed `check_city_col`
# here, which is only assigned in step 4 and so failed on a fresh run.
display(check_customer_name_col)

# 4. Check the city column: list the distinct values present.
check_city_col = customers_df.select("city").distinct()
display(check_city_col)

# Known misspellings -> canonical city name.
city_mappings = {
    "Bengalore": "Bengaluru",
    "Hyderabadd": "Hyderabad",
    "Hyderbad": "Hyderabad",
    "NewDelhee": "New Delhi",
    "Bengaluruu": "Bengaluru",
    "NewDheli": "New Delhi",
}

# Canonical city names accepted after correction.
allowed = {"Bengaluru", "Hyderabad", "New Delhi"}


@udf
def getCity(city):
    """Return the canonical city name for `city`, or None if it is not recognised.

    The value is first corrected through `city_mappings` and only then checked
    against `allowed`. The previous implementation checked the raw value against
    `allowed` before mapping, so every misspelling became None and the mapping
    table was never used.

    Args:
        city: Raw city value from the source data; may be None.

    Returns:
        A member of `allowed`, or None when the (corrected) value is not in it.
    """
    corrected = city_mappings.get(city, city)
    return corrected if corrected in allowed else None

In [0]:
# Keep one row per customer_id, tidy the name (trim, then initcap), and pass
# the city through the getCity UDF.
deduplicated_df = customers_df.dropDuplicates(subset=["customer_id"])
cleaned_df = (
    deduplicated_df
    .withColumn("customer_name", initcap(trim(col("customer_name"))))
    .withColumn("city", getCity(col("city")))
)

display(cleaned_df)

# Manual city assignments for specific customer_ids, as (customer_id, fixed_city).
data = [
    ("789403", "New Delhi"),
    ("789420", "Bengaluru"),
    ("789521", "Hyderabad"),
    ("789603", "Hyderabad"),
]
customer_city_fix_df = spark.createDataFrame(
    data=data,
    schema=["customer_id", "fixed_city"],
)


In [0]:
# Left-join the manual fixes onto the cleaned customers and use the fixed city
# only where the cleaned city is null.
customers_side = cleaned_df.alias("t1")
fixes_side = customer_city_fix_df.alias("t2")
same_customer = col("t1.customer_id") == col("t2.customer_id")

joined_df = (
    customers_side
    .join(fixes_side, on=same_customer, how="left")
    .select(col("t1.*"), col("t2.fixed_city"))
    .withColumn("city", coalesce(col("city"), col("fixed_city")))
    .drop("fixed_city")
)

display(joined_df)

In [0]:
# Build the silver customer frame:
#   - `customer` is "<customer_name>-<city>", using "Unknown" when city is null;
#   - market / platform / channel are constant columns.
customer_silver_df = (
    joined_df
    .withColumn(
        "customer",
        concat(col("customer_name"), lit("-"), coalesce(col("city"), lit("Unknown"))),
    )
    .withColumns({
        "market": lit("India"),
        "platform": lit("Sports Bar"),
        "channel": lit("Acquisition"),
    })
)
# Display the frame built above. The previous code referenced `silver_df`,
# a name that is never defined in this notebook.
display(customer_silver_df)

In [0]:
# Overwrite the silver Delta table with the cleaned customer data.
# NOTE(review): `silver_schema` is not defined in this notebook; presumably it
# comes from the `utilities` notebook pulled in via %run — confirm.
silver_table_name = f"{catalog}.{silver_schema}.{data_source}"
(
    customer_silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .saveAsTable(silver_table_name)
)
    

### Gold Processing

In [0]:
# Columns carried from the silver table into the gold layer.
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
customer_gold_df = (
    spark.table(f"{catalog}.{silver_schema}.{data_source}")
    .select(*gold_columns)
)
display(customer_gold_df)

In [0]:
# Overwrite the `sb_dim_<data_source>` Delta table in the gold schema.
# NOTE(review): `gold_schema` is not defined in this notebook; presumably it
# comes from the `utilities` notebook pulled in via %run — confirm.
sb_dim_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
(
    customer_gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("enableChangeDataFeed", "true")
    .saveAsTable(sb_dim_table_name)
)

In [0]:
from delta.tables import DeltaTable

# Target of the merge: the existing `dim_<data_source>` table in the gold schema.
dim_table_name = f"{catalog}.{gold_schema}.dim_{data_source}"
customers_table = DeltaTable.forName(spark, dim_table_name)

# Source of the merge: the `sb_dim_<data_source>` table, with customer_id
# exposed as customer_code so it lines up with the merge key.
sb_customers_table = (
    spark.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
    .select(
        col("customer_id").alias("customer_code"),
        col("customer"),
        col("market"),
        col("platform"),
        col("channel"),
    )
)

# Upsert on customer_code: update rows that match, insert rows that do not.
(
    customers_table.alias("t1")
    .merge(sb_customers_table.alias("t2"), "t1.customer_code=t2.customer_code")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

display(spark.table(dim_table_name))
