In [10]:
import os
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum , avg, countDistinct, concat

In [11]:
# Configuration: Gold output location and file-based logging.
GOLD_PATH = "data/gold"

# Both the Gold output directory and the log directory must exist before
# anything writes into them; exist_ok makes re-running this cell harmless.
for _required_dir in (GOLD_PATH, "logs"):
    os.makedirs(_required_dir, exist_ok=True)

# Send INFO-and-above records to logs/etl_gold.log with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename=os.path.join("logs", "etl_gold.log"),
)
logger = logging.getLogger(__name__)

In [12]:
# Spark Session

# Build the session for this job (or reuse one that is already running),
# with adaptive query execution switched on.
spark = (
    SparkSession.builder
    .appName("ETL-Gold-Layer")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

In [13]:
# Load Silver tables
def load_silver(name, base_path="data/silver"):
    """Read one Silver-layer dataset from Parquet.

    Args:
        name: Dataset directory name under ``base_path``
            (for example ``"enriched_orders"``).
        base_path: Root directory of the Silver layer. Defaults to
            ``"data/silver"``, so callers that pass only ``name`` read from
            the standard location.

    Returns:
        The DataFrame returned by ``spark.read.parquet`` for the joined path.
    """
    path = os.path.join(base_path, name)
    # Hand the argument to logging instead of pre-formatting, so the message
    # is only built when INFO is actually enabled.
    logger.info("Loading silver dataset: %s", path)
    return spark.read.parquet(path)

# Customer KPIs
def customer_kpis(orders_df):
    """Aggregate per-customer KPIs and write them to the Gold layer.

    Groups ``orders_df`` by ``customer_id``, ``name`` and ``region`` and
    computes:

    * ``total_orders``    - number of distinct ``order_id`` values
    * ``lifetime_value``  - sum of ``order_value``
    * ``avg_order_value`` - mean of ``order_value``
    * ``active_months``   - number of distinct (``Year``, ``Month``) pairs

    The result is coalesced to one partition and written as Parquet to
    ``<GOLD_PATH>/customer_kpis`` in overwrite mode.

    Args:
        orders_df: DataFrame providing the columns referenced above.

    Returns:
        The aggregated DataFrame (the same one that was written).
    """
    logger.info("Building Customer KPIs")

    # Deliberately no cache()/unpersist() on orders_df here: this function
    # runs a single action (the write), so a cache would be filled and then
    # dropped without ever being reused, and unpersisting would also discard
    # a cache the caller may have set up on the same DataFrame.
    #
    # `sum` and `avg` are the pyspark.sql.functions versions imported at the
    # top of the file, not the Python builtins.
    cust_kpi_df = orders_df.groupBy("customer_id", "name", "region").agg(
        countDistinct("order_id").alias("total_orders"),
        sum("order_value").alias("lifetime_value"),
        avg("order_value").alias("avg_order_value"),
        # Count distinct (Year, Month) pairs directly rather than through a
        # concatenated string key, which has no separator between the parts.
        countDistinct("Year", "Month").alias("active_months"),
    )

    path = os.path.join(GOLD_PATH, "customer_kpis")
    cust_kpi_df.coalesce(1).write.mode("overwrite").parquet(path)
    logger.info("Saved Customer KPIs to Gold layer: %s", path)
    return cust_kpi_df


# Product KPIs
def product_kpis(orders_df):
    """Aggregate per-product KPIs and write them to the Gold layer.

    Groups ``orders_df`` by ``product_id``, ``product_name`` and ``category``
    and computes:

    * ``units_sold``            - sum of ``quantity``
    * ``total_revenue``         - sum of ``order_value``
    * ``avg_transaction_value`` - mean of ``order_value``

    The result is repartitioned by ``category`` and written as Parquet to
    ``<GOLD_PATH>/product_kpis`` in overwrite mode.

    Args:
        orders_df: DataFrame providing the columns referenced above.

    Returns:
        The aggregated DataFrame (before the repartition used for writing).
    """
    logger.info("Building Product KPIs")

    # Deliberately no cache()/unpersist() on orders_df here: this function
    # runs a single action (the write), so a cache would be filled and then
    # dropped without ever being reused, and unpersisting would also discard
    # a cache the caller may have set up on the same DataFrame.
    #
    # `sum` and `avg` are the pyspark.sql.functions versions imported at the
    # top of the file, not the Python builtins.
    product_df = orders_df.groupBy("product_id", "product_name", "category").agg(
        sum("quantity").alias("units_sold"),
        sum("order_value").alias("total_revenue"),
        avg("order_value").alias("avg_transaction_value"),
    )

    path = os.path.join(GOLD_PATH, "product_kpis")
    # repartition("category") redistributes rows by category before the write.
    # NOTE(review): this does not produce category=... sub-directories; if
    # directory partitioning was intended, that is write.partitionBy - confirm.
    product_df.repartition("category").write.mode("overwrite").parquet(path)
    logger.info("Saved Product KPIs to Gold layer: %s", path)
    return product_df

# Regional KPIs
def region_kpis(orders_df):
    """Aggregate per-region monthly KPIs and write them to the Gold layer.

    Groups ``orders_df`` by ``region``, ``Year`` and ``Month`` and computes:

    * ``total_revenue`` - sum of ``order_value``
    * ``total_units``   - sum of ``quantity``

    The result is repartitioned by the three grouping columns and written as
    Parquet to ``<GOLD_PATH>/region_kpis`` in overwrite mode.

    Args:
        orders_df: DataFrame providing the columns referenced above.

    Returns:
        The aggregated DataFrame (before the repartition used for writing).
    """
    logger.info("Building Regional KPIs")

    # Deliberately no cache()/unpersist() on orders_df here: this function
    # runs a single action (the write), so a cache would be filled and then
    # dropped without ever being reused, and unpersisting would also discard
    # a cache the caller may have set up on the same DataFrame.
    #
    # `sum` is the pyspark.sql.functions version imported at the top of the
    # file, not the Python builtin.
    region_df = orders_df.groupBy("region", "Year", "Month").agg(
        sum("order_value").alias("total_revenue"),
        sum("quantity").alias("total_units"),
    )

    path = os.path.join(GOLD_PATH, "region_kpis")
    # Redistribute rows by the grouping key before the write.
    # NOTE(review): this does not produce region=/Year=/Month= directories;
    # if directory partitioning was intended, that is write.partitionBy -
    # confirm.
    region_df.repartition("region", "Year", "Month").write.mode("overwrite").parquet(path)
    logger.info("Saved Regional KPIs to Gold layer: %s", path)
    return region_df


In [14]:
# Run Gold Layer ETL
def run_gold_etl():
    """Load the enriched-orders Silver table and build every Gold KPI table.

    Runs the customer, product and regional KPI builds, in that order, against
    the same input DataFrame, then logs a completion message.

    Raises:
        Exception: Any error raised while loading or building is written to
            the log with its traceback and then re-raised unchanged.
    """
    orders_df = None
    try:
        # Load Silver table
        orders_df = load_silver("enriched_orders")
        # Mark the shared input for caching up front; it is released in the
        # finally block below, even if one of the builds fails.
        orders_df.cache()

        # Generate KPIs
        customer_kpis(orders_df)
        product_kpis(orders_df)
        region_kpis(orders_df)
    except Exception:
        # Record the failure in the ETL log; the caller still sees the error.
        logger.exception("Gold ETL failed")
        raise
    finally:
        # orders_df stays None when the load itself failed.
        if orders_df is not None:
            orders_df.unpersist()

    logger.info("Gold ETL completed successfully")


# Execute the Gold ETL when this code runs as the main program; the guard
# keeps the pipeline from starting if the code is imported as a module.
if __name__ == "__main__":
    run_gold_etl()