In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year, month
import logging, os

In [14]:
# Spark session for this pipeline; shuffle partition count is pinned to 4.
spark = (
    SparkSession.builder
    .appName("ETL-Pipeline")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Input layer, output layer, and log directory (relative to the working directory).
BRONZE_PATH = "data/bronze"
SILVER_PATH = "data/silver"
LOG_PATH = "logs"

# Make sure the output and log directories exist; existing directories are left as-is.
os.makedirs(SILVER_PATH, exist_ok=True)
os.makedirs(LOG_PATH, exist_ok=True)

# Logging setup: a dedicated "ETL" logger that writes to a file under LOG_PATH.
logfile = os.path.join(LOG_PATH, "etl_transformation_log.log")
logger = logging.getLogger("ETL")
logger.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

# Attach the file handler only once. The handler is created inside the guard
# so that re-running this cell does not open a new, never-closed file handle
# each time (and does not produce duplicate log lines).
if not logger.handlers:
    # Explicit UTF-8 so non-ASCII characters in messages are written the same
    # way regardless of the platform's default encoding.
    fh = logging.FileHandler(logfile, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

In [15]:
# Mark the start of the Silver-layer run in the ETL log.
logger.info("===Silver layer transformation===")

def load_bronze(name: str):
    """Read the Bronze Parquet dataset ``<BRONZE_PATH>/<name>.parquet``.

    Logs the resolved path, then returns the result of ``spark.read.parquet``
    for that path.
    """
    path = os.path.join(BRONZE_PATH, f"{name}.parquet")
    logger.info(f"Loading Bronze dataset: {path}")
    bronze_reader = spark.read
    return bronze_reader.parquet(path)

def orders_extended(orders, products, customers):
    """Return orders enriched with product and customer columns plus ``order_value``.

    ``products`` is left-joined on ``product_id`` and ``customers`` is then
    left-joined on ``customer_id``, so order rows without a match are kept.
    ``order_value`` is computed as ``quantity * price``.
    """
    logger.info("Joining 3 dataframes")
    with_products = orders.join(products, "product_id", "left")
    with_customers = with_products.join(customers, "customer_id", "left")
    line_value = col("quantity") * col("price")
    return with_customers.withColumn("order_value", line_value)

def save_to_silver(df, name):
    """Write ``df`` to the Silver layer as Parquet, partitioned by region/Year/Month.

    ``Year`` and ``Month`` columns are derived from ``order_date`` and, together
    with ``region``, are used as partition columns, so ``df`` must contain both
    ``order_date`` and ``region``.

    Args:
        df: Spark DataFrame to persist.
        name: Dataset name; used as the sub-directory under ``SILVER_PATH``.
    """
    path = os.path.join(SILVER_PATH, name)
    # Collapse to a single Spark partition before writing.
    df = df.coalesce(1)
    df = df.withColumn("Year", year("order_date")).withColumn("Month", month("order_date"))
    (df.write
       .mode("overwrite")  # replaces any existing output at `path`
       .partitionBy("region", "Year", "Month")
       .parquet(path))
    # Log through the "ETL" logger rather than the root `logging` module so the
    # message reaches the same handlers as every other pipeline message.
    logger.info(f"Saved {name} to Silver layer: {path}")

INFO:ETL:===Silver layer transformation===


In [None]:
# Silver enrichment job: load the cleaned Bronze tables, join them, and
# write the enriched result to the Silver layer.
try:
    orders = load_bronze("orders_clean")
    products = load_bronze("products_clean")
    customers = load_bronze("customers_clean")

    enriched_orders = orders_extended(orders, products, customers)

    save_to_silver(enriched_orders, "enriched_orders")

    logger.info("Silver enrichment job completed successfully")

except Exception as e:
    # logger.exception logs at ERROR level and appends the traceback, so the
    # log file alone is enough to diagnose the failure. The message text is
    # unchanged. The error is re-raised so the notebook run still fails visibly.
    logger.exception("Silver job failed: %s", e)
    raise

INFO:ETL:Loading Bronze dataset: data/bronze/orders_clean.parquet
INFO:ETL:Loading Bronze dataset: data/bronze/products_clean.parquet
INFO:ETL:Loading Bronze dataset: data/bronze/customers_clean.parquet
INFO:ETL:Joining 3 dataframes
INFO:ETL:✅ Silver enrichment job completed successfully
