In [0]:
%sql
-- One-time setup for the ingestion notebook: create the `bank` catalog, the
-- `bank.customer` schema and the `bank.customer.raw` volume.
-- Every statement uses IF NOT EXISTS, so re-running this cell is a no-op once
-- the objects exist.
CREATE CATALOG IF NOT EXISTS bank;
CREATE SCHEMA IF NOT EXISTS bank.customer;
-- The ingestion cell below lists source files from /Volumes/bank/customer/raw/.
CREATE VOLUME IF NOT EXISTS bank.customer.raw;

In [0]:
import os
import smtplib
from email.mime.text import MIMEText


def send_email(subject, body):
    """Send a plain-text notification email through Gmail's SMTP server.

    Delivery is best-effort: any failure (missing credential, network error,
    rejected login, ...) is printed and swallowed, so a notification problem
    never aborts the calling job.

    The SMTP password is read from the ``SMTP_APP_PASSWORD`` environment
    variable rather than being embedded in the notebook. The password that
    used to be hard-coded here must be treated as leaked and revoked.

    Args:
        subject: Text placed in the ``Subject`` header.
        body: Plain-text message body.

    Returns:
        None.
    """
    sender = "antoalphi.1@gmail.com"
    recipient = "antoalphi.1@gmail.com"

    # Never keep credentials in source; without one there is nothing to attempt.
    password = os.environ.get("SMTP_APP_PASSWORD")
    if not password:
        print("Failed to send email: SMTP_APP_PASSWORD environment variable is not set")
        return

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = sender
    msg["To"] = recipient

    try:
        # The context manager issues QUIT and closes the socket even when
        # starttls/login/sendmail raise; the timeout keeps an unresponsive
        # server from hanging the job indefinitely.
        with smtplib.SMTP("smtp.gmail.com", 587, timeout=30) as smtp:
            smtp.starttls()
            smtp.login(sender, password)
            smtp.sendmail(sender, [recipient], msg.as_string())
    except Exception as e:
        print(f"Failed to send email: {e}")

In [0]:
import os
from pyspark.sql.functions import current_timestamp, lit
from datetime import datetime

# Ingest every supported file found in the raw volume into its own Delta table
# (bank.customer.<file name without extension>), tagging each row with
# ingestion metadata and emailing the outcome of every file.

# Define volume path
volume_path = "/Volumes/bank/customer/raw/"

# Generate dynamic batch ID based on current date
current_date = datetime.now().strftime("%Y%m%d")
batch_id = f"batch_{current_date}"


# List files in the volume
try:
    files = dbutils.fs.ls(volume_path)
except Exception as e:
    error_message = f"Error processing file path: {str(e)}"
    # The subject names the volume; the (possibly multi-line) error text goes
    # in the body. Previously the subject was an f-string with no placeholder
    # and literally read "error_message".
    send_email(f"Failed to list files in volume: {volume_path}", error_message)
    raise

# Process each file
for file in files:
    file_path = file.path
    # Table name is the file name up to the first dot, e.g. "churn.csv" -> "churn".
    file_name = os.path.basename(file_path).split(".")[0]
    table_name = f"bank.customer.{file_name}"

    try:
        # Detect file format and read accordingly
        if file_path.endswith(".csv"):
            df = spark.read.option("header", "true").csv(file_path)
        elif file_path.endswith(".json"):
            df = spark.read.option("multiline", "true").json(file_path)
        elif file_path.endswith(".parquet"):
            df = spark.read.parquet(file_path)
        else:
            # Anything else (including sub-directories) is reported and skipped.
            send_email(f"Unsupported file format: {file_path}", "Unsupported file format")
            continue

        # Add metadata columns
        df = (
            df.withColumn("ingestion_timestamp", current_timestamp())
            .withColumn("source_file_name", lit(file_name))
            .withColumn("source_path", lit(file_path))
            .withColumn("ingested_by", lit("databricks_job"))
            .withColumn("batch_id", lit(batch_id))
        )

        # Write to a Delta table, partitioned by "Geography" when the source
        # has that column. The partition column must be the one that was
        # checked for; the previous code tested for "Geography" but
        # partitioned by "region", which is not the column that was tested.
        if "Geography" in df.columns:
            df.write.format("delta").mode("overwrite").partitionBy("Geography").saveAsTable(table_name)
        else:
            df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(table_name)

        # f-prefix added so the table name is actually interpolated.
        send_email("Job Success", f"✅ Successfully ingested and created table: {table_name}")

    except Exception as e:
        error_message = f"Error processing file {file_path}: {str(e)}"
        send_email(" ❌ Databricks Job Failure", error_message)
        # Re-raise so the job run is marked failed; remaining files are not processed.
        raise