In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable


In [0]:
%run /Workspace/Ecommerce/consolidate_pipeline/Setup/utilities

In [0]:
# Notebook parameters: the catalog to write into and the data source to process.
dbutils.widgets.text(name="catalog", defaultValue="fmcg", label="Catalog")
dbutils.widgets.text(name="data_source", defaultValue="customers", label="Data source")


In [0]:
# Pull both widget values into plain Python variables used by the rest of the notebook.
catalog, data_source = (
    dbutils.widgets.get(widget_key) for widget_key in ("catalog", "data_source")
)

print(catalog, data_source)

In [0]:
# S3 glob for this data source's landing files.
# Note: the pattern `*csv` matches any object name that ends in "csv".
base_path = 's3://child-sportsbar-cd/{}/*csv'.format(data_source)
print(base_path)

In [0]:
# Configure a CSV reader that treats the first row as a header and infers column types.
raw_reader = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
)

# Load every file matched by base_path, stamp each row with the read time,
# and keep the source file name/size from the `_metadata` column for lineage.
df = (
    raw_reader
    .load(base_path)
    .withColumn('read_timestamp', F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

# Preview only the first rows.
display(df.limit(10))

In [0]:
# Inspect the column types produced by inferSchema plus the added audit columns.
df.printSchema()

Write the raw data to the bronze table

In [0]:
# Persist the raw frame as a Delta table, replacing any previous contents.
# NOTE(review): `bronze_schema` is not defined in this notebook — presumably it comes
# from the %run'd utilities notebook; confirm.
(
    df.write
    .format('delta')
    .option("delta.enableChangeDataFeed", "true")
    .mode('overwrite')
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")
)


# Silver data processing

In [0]:
# Read the bronze table back by name. spark.table avoids interpolating the
# widget-supplied catalog/data_source values into a SQL statement and matches
# how the gold table is read later in this notebook.
# NOTE(review): `bronze_schema` is presumably defined by the %run'd utilities notebook; confirm.
df_bronze = spark.table(f"{catalog}.{bronze_schema}.{data_source}")
 

In [0]:
# Find customer_id values that occur more than once in bronze.
df_duplicates = (
    df_bronze
    .groupBy('customer_id')
    .count()
    .filter(F.col('count') > 1)
)

# A bare DataFrame expression only shows its repr (column names/types), not the rows;
# display() actually renders the duplicated ids for inspection.
display(df_duplicates)

In [0]:
# Keep a single row per customer_id. No ordering is applied before this call,
# so this cell does not control which of the duplicate rows is kept.
df_silver = df_bronze.dropDuplicates(['customer_id'])

In [0]:
# Strip leading/trailing whitespace from customer names.
# withColumn returns a new DataFrame rather than modifying in place, so the result
# must be assigned back — otherwise the trim is silently discarded.
df_silver = df_silver.withColumn('customer_name', F.trim(F.col('customer_name')))

In [0]:
# List the distinct city spellings present before normalisation.
(
    df_silver
    .select('city')
    .distinct()
    .show()
)

In [0]:
# Known misspellings -> canonical city name.
# NOTE(review): 'Hyderabadd' was listed twice in the original literal (a duplicate
# dict key, where the later entry silently wins). If a second Hyderabad misspelling
# was intended, add it here.
city_mapping = {
    'Bengaluruu': 'Bengaluru',
    'Bengalore': 'Bengaluru',

    'NewDheli': 'New Delhi',
    'NewDelhee': 'New Delhi',

    'Hyderabadd': 'Hyderabad',
}

In [0]:
# Canonical city names; anything else (including null) ends up as null.
allowed = ['Bengaluru', 'New Delhi', 'Hyderabad']

# First rewrite the known misspellings, then keep the value only when it is one of
# the allowed cities. A `when` without `otherwise` yields null for every
# non-matching row, and a null city never satisfies `isin`.
df_silver = (
    df_silver
    .replace(city_mapping, subset=["city"])
    .withColumn("city", F.when(F.col("city").isin(allowed), F.col("city")))
)

display(df_silver)

In [0]:
# Re-check the distinct city values after normalisation.
(
    df_silver
    .select('city')
    .distinct()
    .show()
)

In [0]:
# Title case: look at the current customer_name values before changing their casing.
(
    df_silver
    .select('customer_name')
    .distinct()
    .show()
)

In [0]:
# Capitalise the first letter of each word in customer_name.
# initcap returns null for a null input, so null names stay null.
df_silver = df_silver.withColumn(
    'customer_name',
    F.initcap(F.col('customer_name')),
)

In [0]:
# Confirm the casing change on customer_name.
(
    df_silver
    .select('customer_name')
    .distinct()
    .show()
)

In [0]:
# Rows that still have no city after normalisation: print how many, then list them in full.
missing_city = df_silver.filter(F.col('city').isNull())
print(missing_city.count())
missing_city.show(truncate=False)

In [0]:
# Customer names to look up in the next inspection step.
null_customers_name = [
    'Sprintx Nutrition',
    'Zenathlete Foods',
    'Primefuel Nutrition',
    'Recovery Lane',
]

In [0]:
# Show every row whose customer_name is in the list above, without truncating values.
(
    df_silver
    .filter(F.col('customer_name').isin(null_customers_name))
    .show(truncate=False)
)

In [0]:
# Manual city corrections keyed by customer_id.
customers_fix = {
    789403: "New Delhi",
    789420: "Bengaluru",
    789421: "Hyderabad",
    789521: "Hyderabad",
}

# Turn the mapping into a two-column lookup frame (customer_id, fixed_city).
df_fix = spark.createDataFrame(
    list(customers_fix.items()),
    ['customer_id', 'fixed_city'],
)

display(df_fix)

In [0]:
# Left-join the manual corrections and use them only where city is still null:
# coalesce keeps an existing city and falls back to fixed_city otherwise.
df_silver = (
    df_silver
    .join(df_fix, on='customer_id', how='left')
    .withColumn('city', F.coalesce(F.col('city'), F.col('fixed_city')))
    .drop('fixed_city')
)

# Any rows listed here still have no city after the manual corrections.
df_silver.filter(F.col('city').isNull()).show(truncate=False)


In [0]:
# Store customer_id as a string from here on.
df_silver = df_silver.withColumn('customer_id', F.col('customer_id').cast('string'))

# printSchema() prints the schema itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
df_silver.printSchema()

In [0]:
# City to use in the combined key; rows without a city get the literal 'Unknown'.
city_or_unknown = F.coalesce(F.col("city"), F.lit('Unknown'))

# Add the combined "customer" key (name and city joined with "_") and the
# constant market / platform / channel attributes.
df_silver = (
    df_silver
    .withColumn("customer", F.concat_ws("_", "customer_name", city_or_unknown))
    .withColumn("market", F.lit("India"))
    .withColumn("platform", F.lit("Sports Bar"))
    .withColumn("channel", F.lit("Acquisition"))
)

display(df_silver.limit(5))

In [0]:
# Persist the cleaned frame as a Delta table, replacing any previous contents.
# NOTE(review): `silver_schema` is not defined in this notebook — presumably it comes
# from the %run'd utilities notebook; confirm.
(
    df_silver.write
    .format('delta')
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode('overwrite')
    .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")
)


## Gold Layer processing

In [0]:
# Re-read the silver table by name. spark.table avoids interpolating the
# widget-supplied catalog/data_source values into a SQL statement and matches
# how the gold table is read later in this notebook.
# NOTE(review): `silver_schema` is presumably defined by the %run'd utilities notebook; confirm.
df_silver = spark.table(f"{catalog}.{silver_schema}.{data_source}")

In [0]:
# Columns published to the gold layer; the audit columns added at read time are left out.
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver.select(*gold_columns)

In [0]:
# Persist the gold frame as the `sb_dim_<data_source>` Delta table, replacing any previous contents.
# NOTE(review): `gold_schema` is not defined in this notebook — presumably it comes
# from the %run'd utilities notebook; confirm.
(
    df_gold.write
    .format('delta')
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode('overwrite')
    .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
)

In [0]:
# Build both table names from the notebook parameters instead of hard-coding
# "fmcg.gold.*": the gold write in this notebook targets
# {catalog}.{gold_schema}.sb_dim_{data_source}, so a hard-coded name would read a
# different (possibly stale) table whenever a widget value is changed.
# NOTE(review): the target was originally the literal "fmcg.gold.dim_customers";
# this assumes it lives in the same catalog/schema and follows the dim_<data_source>
# naming — confirm (`gold_schema` presumably comes from the %run'd utilities notebook).
delta_table = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_{data_source}")

# Source rows for the merge: expose customer_id under the name customer_code and
# keep only the attributes selected below.
df_child_customers = spark.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}").select(
    F.col("customer_id").alias("customer_code"),
    "customer",
    "channel",
    "market",
    "platform"
)

display(df_child_customers.limit(3))

In [0]:
# Upsert the source rows into the target Delta table, matching on customer_code:
# matched rows have all columns updated, unmatched source rows are inserted.
(
    delta_table.alias("target")
    .merge(
        source=df_child_customers.alias("source"),
        condition="target.customer_code = source.customer_code",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)