In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # ETL Process for Customer Data
# MAGIC This notebook runs an ETL process on customer data: it loads several Excel files, cleans and integrates them, aggregates the result, and saves it to a Delta table.

# COMMAND ----------

# MAGIC
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# Notebook-wide logger; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data from DBFS
# MAGIC Load data from Excel files stored in DBFS.

# COMMAND ----------

# MAGIC
# Each cell below is self-contained top-level code. The previous version wrapped
# every cell in a single try/except that started in this cell and ended in the
# last one, which cannot work when cells are executed separately, and its
# handler swallowed every error so a failed run still looked successful.
# Errors now propagate so the failing cell (and the run) is reported as failed.

# Spark readers resolve paths against the DBFS root, so the files are addressed
# with the dbfs:/ scheme rather than the /dbfs/... local mount form.
EXCEL_SOURCE_DIR = "dbfs:/mnt/data"


def _read_excel(file_name):
    """Read one Excel workbook (first row as header) from EXCEL_SOURCE_DIR."""
    return (
        spark.read.format("com.crealytics.spark.excel")
        .option("header", "true")
        .load(f"{EXCEL_SOURCE_DIR}/{file_name}")
    )


# Load data from DBFS (assuming the Excel files are already uploaded to DBFS)
logger.info("Loading data from Excel files in DBFS...")
uk_bank_holidays_df = _read_excel("UK_Bank_Holidays.xlsx")
new_customers_df = _read_excel("New_Customers.xlsx")
roi_new_customers_df = _read_excel("ROI_New_Customers.xlsx")
logger.info("Data loaded successfully.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Cleaning and Standardization
# MAGIC Standardize column names and handle missing values.

# COMMAND ----------

# MAGIC
# Data Cleaning and Standardization
logger.info("Standardizing column names and handling missing values...")
uk_bank_holidays_df = uk_bank_holidays_df.withColumnRenamed("Bank holiday", "UK_Bank_Holiday")
# NOTE(review): fillna only touches columns whose type matches the fill value;
# confirm "New Customers" is read as a numeric column.
new_customers_df = new_customers_df.fillna({'New Customers': 0})
roi_new_customers_df = roi_new_customers_df.fillna({'New Customers': 0})

# Ensure date columns are in the correct format
new_customers_df = new_customers_df.withColumn("Date", F.to_date("Date", "yyyy-MM-dd"))
uk_bank_holidays_df = uk_bank_holidays_df.withColumn("Date", F.to_date("Date", "yyyy-MM-dd"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Integration
# MAGIC Join datasets to integrate data.

# COMMAND ----------

# MAGIC
# Data Integration
logger.info("Joining datasets...")
# Give the holiday join key its own name and drop it after the join; keeping
# two columns both called "Date" would make every later reference to "Date"
# ambiguous.
holiday_lookup_df = uk_bank_holidays_df.withColumnRenamed("Date", "Holiday_Date")
integrated_df = (
    new_customers_df.join(roi_new_customers_df, "Reporting Date", "inner")
    .join(
        holiday_lookup_df,
        new_customers_df["Date"] == holiday_lookup_df["Holiday_Date"],
        "left",
    )
    .drop(holiday_lookup_df["Holiday_Date"])
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Add Custom Fields
# MAGIC Add custom fields to the integrated dataset.

# COMMAND ----------

# MAGIC
# Add Custom Fields
logger.info("Adding custom fields...")
# "N" when Day starts with "S" or a bank holiday matched the row, else "Y".
# This must run before UK_Bank_Holiday is overwritten below.
integrated_df = integrated_df.withColumn(
    "Reporting Day",
    F.when(F.col("Day").startswith("S"), "N")
    .when(F.col("UK_Bank_Holiday").isNull(), "Y")
    .otherwise("N"),
)

# Construct UK Bank Holiday date.
# F.month / F.dayofmonth produce unpadded numbers (e.g. "2023-1-5"), so the
# pattern uses single-letter month/day fields, which accept one or two digits.
# NOTE(review): this overwrites UK_Bank_Holiday on every row, not only on rows
# that matched a holiday, and the column is not used by the aggregation below
# - confirm this is intended.
integrated_df = integrated_df.withColumn(
    "UK_Bank_Holiday",
    F.to_date(
        F.concat_ws("-", F.col("Year"), F.month("Date"), F.dayofmonth("Date")),
        "yyyy-M-d",
    ),
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Aggregation and Business Rules
# MAGIC Aggregate metrics by month and year.

# COMMAND ----------

# MAGIC
# Aggregation and Business Rules
logger.info("Aggregating metrics by month and year...")
# NOTE(review): if both customer inputs carry a "New Customers" column, this
# reference is ambiguous after the inner join - verify the input schemas.
aggregated_df = integrated_df.groupBy(
    F.month("Reporting Date").alias("Reporting Month"),
    F.year("Reporting Date").alias("Year"),
).agg(F.sum("New Customers").alias("Total New Customers"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Output to Delta Format
# MAGIC Write the final processed data to a Delta table.

# COMMAND ----------

# MAGIC
# Output to Delta format for better performance
logger.info("Writing the final processed data to a Delta table...")
aggregated_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.new_customers_ready_to_report")

logger.info("ETL process completed successfully.")
