In [0]:
# Expose the file date as a notebook widget (default "2024-11-01") so the run can be pointed at a specific drop.
dbutils.widgets.text("p_file_date", "2024-11-01")
# Read the widget's current value; it is used below to build the source path and to fill the file_date column.
v_file_date = dbutils.widgets.get("p_file_date")


In [0]:
%run ../includes/configurations

In [0]:
# Echo the bronze root path (presumably defined by the ../includes/configurations notebook — confirm).
bronze_folder_path

Out[92]: '/mnt/clvprojectadls/bronze'

In [0]:
# Echo the silver root path (presumably defined by the ../includes/configurations notebook — confirm).
silver_folder_path

Out[93]: '/mnt/clvprojectadls/silver'

# Ingest file date


In [0]:
# Echo the file date read from the p_file_date widget to confirm which drop this run will ingest.
v_file_date

Out[95]: '2024-11-10'

# This notebook reads data from the customer_info file

# Specify the schema

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,DateType

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# Explicit schema applied when reading customer_info.csv: one entry per CSV
# column, given as (name, type, nullable).
customer_info_schema = (
    StructType()
    .add("CustomerId", StringType(), False)
    .add("SignupDate", DateType(), True)
    .add("LoyaltyStatus", StringType(), True)
)

# Read the customer_info file

In [0]:
# Load customer_info.csv for the requested file date from the bronze layer,
# treating the first line as a header and applying the explicit schema.
customer_info_df = (
    spark.read
    .option("header", True)
    .schema(customer_info_schema)
    .csv(f"{bronze_folder_path}/{v_file_date}/customer_info.csv")
)

In [0]:
# Preview the rows as parsed from the CSV with customer_info_schema.
display(customer_info_df)

CustomerId,SignupDate,LoyaltyStatus
CUST021,2023-01-14,Inactive
CUST022,2023-10-15,Inactive
CUST023,2023-04-12,Inactive
CUST024,2023-11-29,
CUST025,2023-12-26,Inactive
CUST026,2023-10-07,Active
CUST027,2023-08-03,Inactive
CUST028,2023-04-23,Inactive
CUST029,2023-08-18,Active
CUST030,2023-10-29,


# Rename the columns and add the file date column


In [0]:
# Switch the source column names to snake_case and tag every row with the
# file date this run was started for.
customer_info_renamed_df = (
    customer_info_df
    .withColumnRenamed("CustomerId", "customer_id")
    .withColumnRenamed("SignupDate", "signup_date")
    .withColumnRenamed("LoyaltyStatus", "loyal_status")
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Preview the renamed columns together with the new file_date column.
display(customer_info_renamed_df)

customer_id,signup_date,loyal_status,file_date
CUST021,2023-01-14,Inactive,2024-11-10
CUST022,2023-10-15,Inactive,2024-11-10
CUST023,2023-04-12,Inactive,2024-11-10
CUST024,2023-11-29,,2024-11-10
CUST025,2023-12-26,Inactive,2024-11-10
CUST026,2023-10-07,Active,2024-11-10
CUST027,2023-08-03,Inactive,2024-11-10
CUST028,2023-04-23,Inactive,2024-11-10
CUST029,2023-08-18,Active,2024-11-10
CUST030,2023-10-29,,2024-11-10


# Add a column called ingestion date

In [0]:
# Stamp every row with the timestamp of this load.
customer_info_final_df = customer_info_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

In [0]:
# Preview the final frame, including the ingestion_date audit column, before it is written.
display(customer_info_final_df)

customer_id,signup_date,loyal_status,file_date,ingestion_date
CUST021,2023-01-14,Inactive,2024-11-10,2024-11-13T21:43:22.194+0000
CUST022,2023-10-15,Inactive,2024-11-10,2024-11-13T21:43:22.194+0000
CUST023,2023-04-12,Inactive,2024-11-10,2024-11-13T21:43:22.194+0000
CUST024,2023-11-29,,2024-11-10,2024-11-13T21:43:22.194+0000
CUST025,2023-12-26,Inactive,2024-11-10,2024-11-13T21:43:22.194+0000
CUST026,2023-10-07,Active,2024-11-10,2024-11-13T21:43:22.194+0000
CUST027,2023-08-03,Inactive,2024-11-10,2024-11-13T21:43:22.194+0000
CUST028,2023-04-23,Inactive,2024-11-10,2024-11-13T21:43:22.194+0000
CUST029,2023-08-18,Active,2024-11-10,2024-11-13T21:43:22.194+0000
CUST030,2023-10-29,,2024-11-10,2024-11-13T21:43:22.194+0000


# Merge the data into the silver table (create it on the first load)

In [0]:
# Check whether clv_silver.customer_info is already registered in the catalog.
# NOTE(review): this goes through the private `_jsparkSession` handle — confirm whether the public catalog API can be used on this runtime.
spark._jsparkSession.catalog().tableExists("clv_silver.customer_info")

Out[105]: True

In [0]:
from delta.tables import DeltaTable

# Keep dynamic partition pruning enabled for the merge below.
spark.conf.set("spark.databricks.optimizer.dynamicPartitionPruning", "true")

# One name for the target, so the existence check, the merge and the initial
# write can never end up pointing at different tables.
target_table = "clv_silver.customer_info"

# NOTE(review): the check goes through the private `_jsparkSession` handle —
# confirm whether the public catalog API can be used on this runtime.
if spark._jsparkSession.catalog().tableExists(target_table):
    # Resolve the Delta table through the catalog rather than a hard-coded
    # mount path: the path only matches the table if the database happens to
    # live under that mount, whereas the name is what was just checked.
    deltaTable = DeltaTable.forName(spark, target_table)

    # Upsert on customer_id: matching customers are overwritten with the
    # incoming row, unseen customers are inserted.
    (
        deltaTable.alias("tgt")
        .merge(customer_info_final_df.alias("src"), "tgt.customer_id = src.customer_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First load: create the Delta table. It is deliberately left
    # unpartitioned — customer_id is unique per row, so partitioning by it
    # would create one directory (and tiny file) per customer.
    (
        customer_info_final_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(target_table)
    )


In [0]:
%sql
-- Inspect the contents of the silver customer_info table, ordered by customer_id.
SELECT *
FROM clv_silver.customer_info
ORDER BY customer_id

customer_id,signup_date,loyal_status,file_date,ingestion_date
CUST001,2024-01-01,active,2024-11-01,2024-11-13T21:42:48.955+0000
CUST002,2024-03-15,inactive,2024-11-01,2024-11-13T21:42:48.955+0000
CUST003,2023-01-13,Inactive,2024-11-05,2024-11-13T21:43:06.703+0000
CUST004,2023-05-21,Active,2024-11-05,2024-11-13T21:43:06.703+0000
CUST005,2023-05-06,,2024-11-05,2024-11-13T21:43:06.703+0000
CUST006,2023-04-25,Inactive,2024-11-05,2024-11-13T21:43:06.703+0000
CUST007,2023-03-13,,2024-11-05,2024-11-13T21:43:06.703+0000
CUST008,2023-02-22,,2024-11-05,2024-11-13T21:43:06.703+0000
CUST009,2023-12-13,Inactive,2024-11-05,2024-11-13T21:43:06.703+0000
CUST010,2023-10-07,,2024-11-05,2024-11-13T21:43:06.703+0000
