In [0]:
%run ../includes/configurations 

In [0]:
# Echo the bronze root path to confirm it is defined
# (presumably set by the ../includes/configurations notebook run above — TODO confirm).
bronze_folder_path

Out[35]: '/mnt/clvprojectadls/bronze'

In [0]:
# Echo the silver root path to confirm it is defined
# (presumably set by the ../includes/configurations notebook run above — TODO confirm).
silver_folder_path

Out[36]: '/mnt/clvprojectadls/silver'

In [0]:
# Notebook parameter: the file date selects which dated bronze sub-folder is read
# below and is also stamped onto every row as file_date.
# "2024-11-01" is only the default used when no value is supplied to the widget.
dbutils.widgets.text("p_file_date", "2024-11-01")
v_file_date = dbutils.widgets.get("p_file_date")

# This notebook reads data from the visit_frequency file

# Specify the schema

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,DateType

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# Explicit schema for visit_frequency.csv so column types are declared
# rather than inferred: a string customer key and an integer visit counter.
visit_frequency_schema = StructType([
    StructField("CustomerId", StringType(), nullable=False),
    StructField("VisitCount", IntegerType(), nullable=True),
])

# Read the visit_frequency file

In [0]:
# Load the visit_frequency CSV for the requested file date from the bronze
# folder, treating the first line as a header and applying the explicit schema.
visit_frequency_path = f"{bronze_folder_path}/{v_file_date}/visit_frequency.csv"
visit_frequency_df = (
    spark.read
    .option("header", True)
    .schema(visit_frequency_schema)
    .csv(visit_frequency_path)
)

In [0]:
# Preview the rows as parsed from the CSV.
display(visit_frequency_df)

CustomerId,VisitCount
CUST021,13
CUST022,11
CUST023,6
CUST024,15
CUST025,18
CUST026,11
CUST027,18
CUST028,12
CUST029,9
CUST030,20


# Rename the columns to snake_case and add the file_date column


In [0]:
# Switch the source column names to snake_case and tag each row with the
# file date this run was parameterised with.
visit_frequency_renamed_df = (
    visit_frequency_df
    .withColumnRenamed("CustomerId", "customer_id")
    .withColumnRenamed("VisitCount", "visit_count")
    .withColumn("file_date", lit(v_file_date))
)
                         


In [0]:
# Preview the renamed columns together with the added file_date column.
display(visit_frequency_renamed_df)

customer_id,visit_count,file_date
CUST021,13,2024-11-10
CUST022,11,2024-11-10
CUST023,6,2024-11-10
CUST024,15,2024-11-10
CUST025,18,2024-11-10
CUST026,11,2024-11-10
CUST027,18,2024-11-10
CUST028,12,2024-11-10
CUST029,9,2024-11-10
CUST030,20,2024-11-10


# Add a column called ingestion date

In [0]:
# Stamp every row with the timestamp at which this load is evaluated.
visit_frequency_final_df = (
    visit_frequency_renamed_df
    .withColumn("ingestion_date", current_timestamp())
)

In [0]:
# Preview the final frame that will be written to the silver table.
display(visit_frequency_final_df)

customer_id,visit_count,file_date,ingestion_date
CUST021,13,2024-11-10,2024-11-13T21:46:20.777+0000
CUST022,11,2024-11-10,2024-11-13T21:46:20.777+0000
CUST023,6,2024-11-10,2024-11-13T21:46:20.777+0000
CUST024,15,2024-11-10,2024-11-13T21:46:20.777+0000
CUST025,18,2024-11-10,2024-11-13T21:46:20.777+0000
CUST026,11,2024-11-10,2024-11-13T21:46:20.777+0000
CUST027,18,2024-11-10,2024-11-13T21:46:20.777+0000
CUST028,12,2024-11-10,2024-11-13T21:46:20.777+0000
CUST029,9,2024-11-10,2024-11-13T21:46:20.777+0000
CUST030,20,2024-11-10,2024-11-13T21:46:20.777+0000


# Merge the data into the clv_silver.visit_frequency Delta table (create it on the first run)

In [0]:
from delta.tables import DeltaTable

# Upsert this run's rows into the silver Delta table, creating the table on the
# first run.
#
# The table is always addressed by its catalog name: the existence check, the
# merge target and the initial write all use the same identifier, so they can
# never point at different locations (previously the merge branch opened a
# hard-coded mount path while the create branch registered the table by name).
target_table = "clv_silver.visit_frequency"

spark.conf.set("spark.databricks.optimizer.dynamicPartitionPruning", "true")

# Public catalog API instead of reaching into the private JVM session handle.
if spark.catalog.tableExists(target_table):
    # Table already exists: update rows whose customer_id is already present,
    # insert the rest.
    (
        DeltaTable.forName(spark, target_table)
        .alias("tgt")
        .merge(
            visit_frequency_final_df.alias("src"),
            "tgt.customer_id = src.customer_id",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First load: create the Delta table from this run's data.
    # NOTE(review): partitioning on customer_id produces one partition per
    # distinct customer; kept as-is to preserve the existing table layout, but
    # worth revisiting if the number of customers grows.
    (
        visit_frequency_final_df.write
        .mode("overwrite")
        .partitionBy("customer_id")
        .format("delta")
        .saveAsTable(target_table)
    )

In [0]:
%sql
-- Show the contents of the silver table, sorted by customer_id.
SELECT *
FROM clv_silver.visit_frequency
ORDER BY customer_id

customer_id,visit_count,file_date,ingestion_date
CUST001,15,2024-11-01,2024-11-13T21:45:19.336+0000
CUST002,22,2024-11-01,2024-11-13T21:45:19.336+0000
CUST003,15,2024-11-05,2024-11-13T21:45:35.707+0000
CUST004,11,2024-11-05,2024-11-13T21:45:35.707+0000
CUST005,14,2024-11-05,2024-11-13T21:45:35.707+0000
CUST006,5,2024-11-05,2024-11-13T21:45:35.707+0000
CUST007,14,2024-11-05,2024-11-13T21:45:35.707+0000
CUST008,7,2024-11-05,2024-11-13T21:45:35.707+0000
CUST009,14,2024-11-05,2024-11-13T21:45:35.707+0000
CUST010,17,2024-11-05,2024-11-13T21:45:35.707+0000
