In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import logging
import os

In [2]:
# Spark & Logging Setup

# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# with the shuffle partition count set to 4.
spark = (
    SparkSession.builder
    .appName("ETL-Pipeline")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Directory layout: raw CSV input, Bronze-layer output, and log files.
RAW_PATH = "data/input"
BRONZE_PATH = "data/bronze"
LOG_PATH = "logs"

# Make sure the output and log directories exist; exist_ok keeps re-runs safe.
os.makedirs(BRONZE_PATH, exist_ok=True)
os.makedirs(LOG_PATH, exist_ok=True)

# Configure Logging

logfile = os.path.join(LOG_PATH, "etl_transformation_log.log")
logger = logging.getLogger("ETL")
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

# logging.getLogger returns the same "ETL" logger every time this cell runs,
# so only build and attach the file handler when none is attached yet.
# Constructing the FileHandler outside this guard would open the log file
# again on every re-run and leave that extra handle unused and unclosed.
if not logger.handlers:
    fh = logging.FileHandler(logfile)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

In [3]:
# Mark the start of the transformation run in the log.
logger.info("=== Transformation job started ===")

def load_raw(name):
    """Read a raw CSV dataset into a Spark DataFrame.

    Args:
        name: Base name of the file (without the ``.csv`` extension) located
            under ``RAW_PATH``.

    Returns:
        The DataFrame produced by Spark's CSV reader with the ``header``
        option enabled.
    """
    csv_file = os.path.join(RAW_PATH, f"{name}.csv")
    logger.info(f"loading raw file: {csv_file}")
    reader = spark.read.option("header", True)
    return reader.csv(path=csv_file)

def clean_orders(df):
    """Clean orders dataset.

    Casts ``order_date`` to a date, drops rows where the cast result is null,
    then keeps one row per ``order_id``.

    Args:
        df: Raw orders DataFrame with ``order_id`` and ``order_date`` columns.

    Returns:
        DataFrame with a non-null, date-typed ``order_date`` and at most one
        row per ``order_id``.
    """
    logger.info("Cleaning orders data")
    # Validate dates BEFORE de-duplicating: dropDuplicates keeps an arbitrary
    # row per order_id, so de-duplicating first could keep a row whose date
    # is bad and then lose the whole order in dropna, even though a duplicate
    # with a valid date existed.
    return (
        df.withColumn("order_date", col("order_date").cast("date"))
          .dropna(subset=["order_date"])
          .dropDuplicates(["order_id"])
    )

def clean_products(df):
    """Clean products dataset.

    Keeps one row per ``product_id`` and casts the ``price`` column to
    ``double``.
    """
    logger.info("Cleaning products data")
    deduped = df.dropDuplicates(["product_id"])
    price_as_double = col("price").cast("double")
    return deduped.withColumn("price", price_as_double)

def clean_customers(df):
    """Clean customers dataset.

    Returns the input with duplicate ``customer_id`` rows removed.
    """
    logger.info("Cleaning customers data")
    key_columns = ["customer_id"]
    unique_customers = df.dropDuplicates(key_columns)
    return unique_customers

def save_to_bronze(df, name):
    """Write a DataFrame to the Bronze layer as Parquet, replacing prior output.

    Args:
        df: DataFrame to persist.
        name: Dataset name; output goes to ``<BRONZE_PATH>/<name>.parquet``.
    """
    target = os.path.join(BRONZE_PATH, f"{name}.parquet")
    writer = df.write.mode("overwrite")
    writer.parquet(target)
    logger.info(f"Saved {name} to Bronze layer: {target}")

In [4]:
# Transformation Pipeline
# ------------------
try:
    # Extract: read each raw CSV dataset.
    orders = load_raw("orders")
    products = load_raw("products")
    customers = load_raw("customers")

    # Transform: apply the per-dataset cleaning rules.
    orders_clean = clean_orders(orders)
    products_clean = clean_products(products)
    customers_clean = clean_customers(customers)

    # Load: persist the cleaned datasets to the Bronze layer.
    save_to_bronze(orders_clean, "orders_clean")
    save_to_bronze(products_clean, "products_clean")
    save_to_bronze(customers_clean, "customers_clean")

    logger.info("Transformation job completed successfully")

except Exception as e:
    # logger.exception logs at ERROR level with the same message as before
    # and also appends the traceback, so the log alone is enough to diagnose
    # a failed run. The error is re-raised so the notebook run still fails.
    logger.exception(f"Transformation job failed: {str(e)}")
    raise