In [0]:
%run ../includes/configurations

In [0]:
bronze_folder_path

Out[83]: '/mnt/clvprojectadls/bronze'

In [0]:
silver_folder_path

Out[84]: '/mnt/clvprojectadls/silver'

In [0]:
# Expose the file date as a notebook parameter: a text widget named
# "p_file_date" whose default value is "2024-11-01".
dbutils.widgets.text("p_file_date", "2024-11-01")
# Read the widget's current value; it is used further down to build the bronze
# input path and to populate the file_date column.
v_file_date = dbutils.widgets.get("p_file_date")

# This notebook reads data from the discounts_data file

# Specify the schema

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,DateType

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# Explicit schema for discounts_data.csv so Spark does not have to infer types.
# Each StructField is (column name, data type, nullable flag).
discounts_data_schema = StructType(
    fields=[
        StructField('CustomerId', StringType(), False),
        StructField("TransactionDate", DateType(), True),
        StructField('DiscountAmount', DoubleType(), True),
    ]
)

# Read the discounts_data file

In [0]:
# Load the CSV for the selected file date from the bronze layer, treating the
# first line as a header and applying the explicit schema defined above.
discounts_data_df = (
    spark.read
    .option('header', True)
    .schema(discounts_data_schema)
    .csv(f"{bronze_folder_path}/{v_file_date}/discounts_data.csv")
)

In [0]:
display(discounts_data_df)

CustomerId,TransactionDate,DiscountAmount
CUST021,2023-06-20,2.79
CUST021,2023-03-05,1.54
CUST021,2023-12-06,4.91
CUST021,2023-03-07,4.91
CUST022,2023-02-07,4.21
CUST022,2023-02-08,3.26
CUST022,2023-05-14,1.27
CUST022,2023-07-17,4.02
CUST023,2023-04-12,4.93
CUST023,2023-12-13,4.67


# Rename the columns and add the file date


In [0]:
# Rename the identifier/date columns to snake_case and tag every row with the
# file date this run was parameterised with.
#
# The previous version also called withColumnRenamed("DiscountDate", "discount_date").
# The schema applied on read has no "DiscountDate" column, and Spark silently
# ignores a rename of a missing column, so that call did nothing; it has been
# removed. The resulting columns are unchanged.
#
# NOTE(review): DiscountAmount is deliberately left as-is here. Renaming it to
# discount_amount would change the columns written to clv_silver.discounts_data
# and would need to be coordinated with the existing table and its consumers.
discounts_data_renamed_df = (
    discounts_data_df
    .withColumnRenamed("CustomerId", "customer_id")
    .withColumnRenamed("TransactionDate", "transaction_date")
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
display(discounts_data_renamed_df)

customer_id,transaction_date,DiscountAmount,file_date
CUST021,2023-06-20,2.79,2024-11-10
CUST021,2023-03-05,1.54,2024-11-10
CUST021,2023-12-06,4.91,2024-11-10
CUST021,2023-03-07,4.91,2024-11-10
CUST022,2023-02-07,4.21,2024-11-10
CUST022,2023-02-08,3.26,2024-11-10
CUST022,2023-05-14,1.27,2024-11-10
CUST022,2023-07-17,4.02,2024-11-10
CUST023,2023-04-12,4.93,2024-11-10
CUST023,2023-12-13,4.67,2024-11-10


# Add a column called ingestion date

In [0]:
# Stamp every row with the timestamp at which this run processed it.
discounts_data_final_df = discounts_data_renamed_df.withColumn(
    'ingestion_date', current_timestamp()
)

In [0]:
display(discounts_data_final_df )

customer_id,transaction_date,DiscountAmount,file_date,ingestion_date
CUST021,2023-06-20,2.79,2024-11-10,2024-11-13T21:44:41.834+0000
CUST021,2023-03-05,1.54,2024-11-10,2024-11-13T21:44:41.834+0000
CUST021,2023-12-06,4.91,2024-11-10,2024-11-13T21:44:41.834+0000
CUST021,2023-03-07,4.91,2024-11-10,2024-11-13T21:44:41.834+0000
CUST022,2023-02-07,4.21,2024-11-10,2024-11-13T21:44:41.834+0000
CUST022,2023-02-08,3.26,2024-11-10,2024-11-13T21:44:41.834+0000
CUST022,2023-05-14,1.27,2024-11-10,2024-11-13T21:44:41.834+0000
CUST022,2023-07-17,4.02,2024-11-10,2024-11-13T21:44:41.834+0000
CUST023,2023-04-12,4.93,2024-11-10,2024-11-13T21:44:41.834+0000
CUST023,2023-12-13,4.67,2024-11-10,2024-11-13T21:44:41.834+0000


# Save the table to the silver folder

In [0]:
spark.conf.set("spark.databricks.optimizer.dynamicPartitionPruning", "true")

from delta.tables import DeltaTable

# Single source of truth for the target table: the existence check, the merge
# and the initial write all refer to the same catalog entry.
silver_table_name = "clv_silver.discounts_data"

# A customer has several discount rows, so customer_id alone is not a row key.
# Matching on customer_id only lets multiple source rows hit the same target
# row, which Delta rejects for a MERGE with a matched-update clause (and, with
# a single source row, would overwrite every row of that customer).
# NOTE(review): assumes at most one discount row per customer per
# transaction_date -- confirm against the source system.
merge_condition = (
    "tgt.customer_id = src.customer_id "
    "AND tgt.transaction_date = src.transaction_date"
)

# Use the public catalog API instead of reaching into the private JVM session.
if spark.catalog.tableExists(silver_table_name):
    # Resolve the Delta table through the catalog rather than a hard-coded
    # mount path, so it is always the same table that was checked above.
    deltaTable = DeltaTable.forName(spark, silver_table_name)

    # Upsert: update rows that already exist, insert the ones that do not.
    (
        deltaTable.alias("tgt")
        .merge(discounts_data_final_df.alias("src"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First load: create the Delta table, partitioned by customer_id.
    (
        discounts_data_final_df.write
        .mode('overwrite')
        .partitionBy('customer_id')
        .format('delta')
        .saveAsTable(silver_table_name)
    )

In [0]:
%sql
-- Inspect the contents of the silver table, ordered by customer.
SELECT *
FROM clv_silver.discounts_data
ORDER BY customer_id

customer_id,transaction_date,DiscountAmount,file_date,ingestion_date
CUST001,2023-06-18,3.69,2024-11-01,2024-11-13T21:44:10.679+0000
CUST002,2023-01-05,1.62,2024-11-01,2024-11-13T21:44:10.679+0000
CUST003,2023-09-01,1.66,2024-11-05,2024-11-13T21:44:24.151+0000
CUST003,2023-10-20,3.88,2024-11-05,2024-11-13T21:44:24.151+0000
CUST004,2023-11-07,1.38,2024-11-05,2024-11-13T21:44:24.151+0000
CUST004,2023-02-06,4.76,2024-11-05,2024-11-13T21:44:24.151+0000
CUST004,2023-10-24,4.67,2024-11-05,2024-11-13T21:44:24.151+0000
CUST005,2023-05-26,4.42,2024-11-05,2024-11-13T21:44:24.151+0000
CUST006,2023-08-15,4.05,2024-11-05,2024-11-13T21:44:24.151+0000
CUST006,2023-12-15,4.15,2024-11-05,2024-11-13T21:44:24.151+0000
