In [8]:
"""
Week 4 - Day 4: Modular ETL Pipeline using PySpark
--------------------------------------------------
Goal:
Build a clean, modular ETL pipeline (Extract → Transform → Load)
that reads sales data, processes it, and writes clean output.
"""

# ------------------------------
# 1. Import Required Libraries
# ------------------------------
import logging
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, to_date

# ------------------------------
# 2. Setup Logging
# ------------------------------
# Logging helps us record what happens during the ETL execution.
# It’s essential for debugging and production monitoring.
# Path of the ETL log file. Its parent directory is created up front because
# logging's file handler does not create missing directories and would
# otherwise fail with FileNotFoundError on a fresh checkout.
LOG_FILE = "week4/etl_log.txt"
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# basicConfig() does nothing when the root logger already has handlers
# (e.g. when this cell is re-run), so skip it in that case to avoid opening
# a file handler that would never be attached.
if not logging.getLogger().handlers:
    logging.basicConfig(
        # An explicit UTF-8 file handler (append mode, same as `filename=`)
        # is used because the pipeline's log messages contain emoji, which
        # the platform default encoding cannot always represent.
        handlers=[logging.FileHandler(LOG_FILE, encoding="utf-8")],
        level=logging.INFO,                     # log level
        format="%(asctime)s - %(levelname)s - %(message)s"
    )

# ------------------------------
# 3. Spark Session Function
# ------------------------------
def get_spark_session(app_name="Modular_ETL"):
    """
    Build (or fetch the already-running) SparkSession and return it.

    Args:
        app_name: application name passed to the session builder
    Returns:
        the SparkSession object
    Raises:
        Any exception raised while building the session; it is logged at
        ERROR level and then re-raised unchanged.
    """
    try:
        builder = SparkSession.builder.appName(app_name)
        session = builder.getOrCreate()
        logging.info("✅ Spark session created successfully")
        return session
    except Exception as err:
        logging.error(f"❌ Failed to create Spark session: {err}")
        raise


# ------------------------------
# 4. Extract Function
# ------------------------------
def extract_data(spark, file_path):
    """
    Reads raw data from a CSV file using Spark.

    The file is read with a header row and with schema inference enabled.

    Args:
        spark: SparkSession object
        file_path: path to the input CSV file
    Returns:
        DataFrame with raw data
    """
    logging.info("📥 Extracting data from CSV")
    df = spark.read.csv(file_path, header=True, inferSchema=True)
    # df.count() is a Spark action that scans the whole input, so it is only
    # executed when the INFO message will actually be emitted.
    if logging.getLogger().isEnabledFor(logging.INFO):
        logging.info(f"Extracted {df.count()} rows from {file_path}")
    return df


# ------------------------------
# 5. Transform Function
# ------------------------------
def transform_data(df):
    """
    Cleans and transforms data.
    - Converts order_date to date type
    - Calculates revenue per order (quantity * unit_price)
    - Aggregates revenue by customer

    Args:
        df: DataFrame with order_date, quantity, unit_price and
            customer_id columns
    Returns:
        DataFrame with one row per customer_id and a total_revenue column
    """
    logging.info("🔄 Transforming data")
    df = df.withColumn("order_date", to_date(col("order_date"))) \
           .withColumn("revenue", col("quantity") * col("unit_price"))
    # Only customer_id and the summed revenue survive the aggregation.
    agg_df = df.groupBy("customer_id").agg(_sum("revenue").alias("total_revenue"))
    # agg_df.count() is a Spark action that runs the whole aggregation, which
    # the later write runs again; only pay for it when the INFO message will
    # actually be emitted.
    if logging.getLogger().isEnabledFor(logging.INFO):
        logging.info(f"Transformed dataset with {agg_df.count()} customer records")
    return agg_df


# ------------------------------
# 6. Load Function
# ------------------------------
def load_data(df, output_path):
    """
    Persist the transformed DataFrame as Parquet.

    Anything already present at the destination is overwritten.

    Args:
        df: DataFrame to save
        output_path: output directory for Parquet files
    """
    logging.info("💾 Loading data into Parquet")

    # Configure the writer first, then trigger the actual write.
    writer = df.write.mode("overwrite")
    writer.parquet(output_path)

    logging.info(f"Data successfully saved to {output_path}")


# ------------------------------
# 7. Main ETL Function
# ------------------------------
def main(input_path="data/sales.csv",
         output_path="week4/output/customer_revenue.parquet"):
    """
    Main function to orchestrate the ETL pipeline.

    Runs extract -> transform -> load and always stops the Spark session
    afterwards, whether or not the pipeline succeeded.

    Args:
        input_path: path to the input CSV file
        output_path: output directory for the Parquet files
    Raises:
        Any exception raised by a pipeline step; it is logged with its
        traceback and then re-raised so a failed run is not silently
        reported as a success.
    """
    spark = get_spark_session()
    try:
        df_raw = extract_data(spark, input_path)
        df_transformed = transform_data(df_raw)
        load_data(df_transformed, output_path)
        logging.info("🎯 ETL pipeline completed successfully!")
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback.
        logging.exception(f"❌ ETL pipeline failed: {e}")
        raise
    finally:
        spark.stop()
        logging.info("🛑 Spark session stopped.")


# ------------------------------
# 8. Entry Point
# ------------------------------
# Run the pipeline only when this code is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [14]:
# Print the ETL log. The encoding is given explicitly because the log
# messages contain emoji and the platform default encoding is not always
# UTF-8; undecodable bytes are replaced rather than aborting the cell.
with open('week4/etl_log.txt', 'r', encoding='utf-8', errors='replace') as log_file:
    print(log_file.read())

2025-10-09 08:11:23,073 - INFO - ✅ Spark session created successfully
2025-10-09 08:11:23,074 - INFO - 📥 Extracting data from CSV
2025-10-09 08:11:33,498 - INFO - Extracted 10 rows from data/sales.csv
2025-10-09 08:11:33,499 - INFO - 🔄 Transforming data
2025-10-09 08:11:34,896 - INFO - Transformed dataset with 7 customer records
2025-10-09 08:11:34,897 - INFO - 💾 Loading data into Parquet
2025-10-09 08:11:38,107 - INFO - Data successfully saved to week4/output/customer_revenue.parquet
2025-10-09 08:11:38,108 - INFO - 🎯 ETL pipeline completed successfully!
2025-10-09 08:11:38,361 - INFO - 🛑 Spark session stopped.
2025-10-09 08:14:55,206 - INFO - ✅ Spark session created successfully
2025-10-09 08:14:55,207 - INFO - 📥 Extracting data from CSV
2025-10-09 08:14:56,354 - INFO - Extracted 10 rows from data/sales.csv
2025-10-09 08:14:56,357 - INFO - 🔄 Transforming data
2025-10-09 08:14:57,146 - INFO - Transformed dataset with 7 customer records
2025-10-09 08:14:57,147 - INFO - 💾 Loading data i

In [17]:
from pyspark.sql import SparkSession

# The pipeline stops its Spark session when it finishes, so obtain one
# again before reading the results back.
spark = (
    SparkSession.builder
    .appName("Modular_ETL")
    .getOrCreate()
)

# Read the Parquet output written by the load step and display it.
spark.read.parquet("week4/output/customer_revenue.parquet").show()

+-----------+-------------+
|customer_id|total_revenue|
+-----------+-------------+
|       1005|         99.9|
|       1002|        49.98|
|       1001|       122.44|
|       1006|          9.0|
|       1007|        89.97|
|       1003|        29.97|
|       1004|        39.98|
+-----------+-------------+

