# Lab 2.3 — Reproducibility and Logging Basics

This notebook demonstrates a reproducible ETL process using Pandas with structured logging, environment capture, and data integrity checks.


In [0]:
import logging
from datetime import datetime
import os

# Create logs folder if it doesn't exist
os.makedirs("logs", exist_ok=True)

# Generate timestamped log filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
log_file = f"logs/run_{timestamp}.log"

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        logging.StreamHandler(),            # Console output
        logging.FileHandler(log_file)       # File output
    ]
)

# Example logging
logging.info("ETL run started")
logging.info("Cluster/runtime info: Databricks cluster XYZ")  # Replace with actual info if available
logging.info("Configuration values: CSV paths, output paths, etc.")


## Part B — Logging Setup

This section configures Python logging to record ETL execution details.
Logs are written both to the console and to a timestamped file.


In [0]:
import logging
from datetime import datetime
import os

# Create logs folder if it doesn't exist
os.makedirs("logs", exist_ok=True)

# Generate timestamped log filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
log_file = f"logs/run_{timestamp}.log"

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(log_file)
    ]
)

logging.info("ETL run started")
logging.info("Runtime environment: Databricks notebook")
logging.info("Input path: data/Restaurant Orders/")


## Part C — Reproducibility Setup

This section ensures reproducibility by fixing random seeds, capturing the environment,
and hashing input data files.


In [0]:
import hashlib
import json
import random
import subprocess
import sys

import numpy as np

# Fix random seeds
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here does not change
# hashing in the already-running kernel; it is only inherited by child processes.
os.environ['PYTHONHASHSEED'] = '0'
random.seed(0)
np.random.seed(0)
logging.info("Random seeds fixed")

# Capture environment
# Run pip through the kernel's own interpreter so the snapshot describes the environment
# this notebook actually executes in (a bare `pip` on PATH may belong to another Python).
freeze_result = subprocess.run(
    [sys.executable, "-m", "pip", "freeze"],
    capture_output=True,
    text=True,
)
if freeze_result.returncode == 0:
    with open("requirements.txt", "w") as requirements_file:
        requirements_file.write(freeze_result.stdout)
    logging.info("Environment saved to requirements.txt")
else:
    # Do not claim success when the snapshot could not be taken.
    logging.error(
        "pip freeze failed (exit code %s): %s",
        freeze_result.returncode,
        freeze_result.stderr.strip(),
    )

# Function to hash files
def hash_file(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of a file's contents.

    The file is read in fixed-size chunks so that hashing a large input does
    not require loading the whole file into memory.

    Args:
        file_path: Path of the file to hash; opened in binary mode.
        chunk_size: Number of bytes read per iteration. Must be positive.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # read() returns b"" at end of file, which ends the iteration.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Fingerprint every input CSV: (key stored in the JSON, path of the file to hash).
input_files = (
    ("menu_items.csv", "data/Restaurant Orders/menu_items.csv"),
    ("order_details.csv", "data/Restaurant Orders/order_details.csv"),
)
data_hashes = {label: hash_file(path) for label, path in input_files}

# Persist the digests next to the notebook so they can be compared between runs.
with open("data_hashes.json", "w") as hash_out:
    json.dump(data_hashes, hash_out, indent=4)

logging.info("SHA-256 hashes saved to data_hashes.json")


## Part D — ETL with Pandas

This section loads, cleans, merges, and analyzes restaurant order data.


In [0]:
import pandas as pd

# Load the two raw tables from their CSV files.
menu_csv = "data/Restaurant Orders/menu_items.csv"
orders_csv = "data/Restaurant Orders/order_details.csv"
menu = pd.read_csv(menu_csv)
orders = pd.read_csv(orders_csv)
logging.info("CSV files loaded")

# Normalise category labels: trim surrounding whitespace and lower-case them.
menu = menu.assign(category=menu["category"].str.strip().str.lower())

# Parse the date and time columns into datetime values.
orders = orders.assign(
    order_date=pd.to_datetime(orders["order_date"]),
    order_time=pd.to_datetime(orders["order_time"]),
)
logging.info("Data cleaned")


In [0]:
# Inner-join each order line to its menu entry (orders.item_id == menu.menu_item_id).
combined = orders.merge(menu, left_on='item_id', right_on='menu_item_id')
logging.info("Tables merged")

# An inner join silently discards order lines whose item_id has no counterpart in the
# menu; record how many were lost so the drop is visible in the run log.
unmatched_rows = int((~orders['item_id'].isin(menu['menu_item_id'])).sum())
if unmatched_rows:
    logging.warning(
        "Merge dropped %d order rows with no matching menu item", unmatched_rows
    )

# .copy() makes `tidy` an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning or depend on `combined`.
tidy = combined[['order_id', 'order_date', 'order_time', 'item_name', 'category', 'price']].copy()


In [0]:
# Top 5 items: the most frequently ordered item names.
top_items = tidy['item_name'].value_counts().head(5)
logging.info("Top items: %s", top_items.to_dict())

# Revenue by category: summed price per category, highest first.
revenue_by_category = tidy.groupby('category')['price'].sum().sort_values(ascending=False)
logging.info("Revenue by category: %s", revenue_by_category.to_dict())

# Busiest hour: order-line count per hour of day, highest first.
# .assign() builds a new frame instead of writing a column into a slice of `combined`,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
tidy = tidy.assign(hour=tidy['order_time'].dt.hour)
orders_by_hour = tidy.groupby('hour').size().sort_values(ascending=False)
logging.info("Busiest hour(s): %s", orders_by_hour.to_dict())


In [0]:
### Ethics Reflection
# Sensitive information, such as customer names or payment details, should never be logged because doing so puts people's privacy at risk.
# Similarly, API keys and passwords must not be logged, to avoid security breaches.
# Reproducibility supports accountability and fairness: results can be verified and trusted when models influence decisions that affect people.


In [0]:
# Assertions
# Validate the ETL result BEFORE persisting it, so an empty or malformed table is never
# written to the output location.
assert not tidy.empty, "ETL output is empty"

expected_columns = [
    'order_id', 'order_date', 'order_time',
    'item_name', 'category', 'price', 'hour'
]

# Collect every missing column so a single failure reports all of them. The comprehension
# variable does not persist at module level, so it cannot clash with other names.
missing_columns = [name for name in expected_columns if name not in tidy.columns]
assert not missing_columns, f"Missing column: {', '.join(missing_columns)}"

logging.info("ETL assertions passed")

# Define output path
# NOTE(review): this is an absolute path on the local filesystem. If the target is meant
# to be DBFS on Databricks, local file APIs such as os/pandas presumably need the
# "/dbfs" prefix. Confirm the intended location.
output_dir = "/FileStore/tables/etl_output"
os.makedirs(output_dir, exist_ok=True)

# The file name reuses the run timestamp, so the CSV can be matched to the run's log file.
output_path = f"{output_dir}/metrics_{timestamp}.csv"

# Save results
tidy.to_csv(output_path, index=False)
logging.info(f"Metrics saved to {output_path}")


## Part F — Optional PySpark Comparison

This section reads the same CSV files with Spark and builds a per-item order-count aggregation for comparison with the pandas results.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

# Reuse the active Spark session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Read the same two CSV inputs, taking column names from the header row and letting
# Spark infer the column types.
# NOTE(review): these are relative paths; where Spark resolves them depends on the
# cluster's default filesystem. Confirm they reach the same files pandas reads.
menu_spark = spark.read.csv("data/Restaurant Orders/menu_items.csv", header=True, inferSchema=True)
orders_spark = spark.read.csv("data/Restaurant Orders/order_details.csv", header=True, inferSchema=True)

# Number of order lines per item_id, most ordered first.
top_item_spark = (
    orders_spark
    .groupBy("item_id")
    .agg(count("*").alias("quantity"))
    .orderBy(col("quantity").desc())
)

# Spark transformations are lazy: the aggregation above is only a plan until an action
# runs. Collect the first five rows so the metric is really computed before it is logged.
top_items_spark_rows = [row.asDict() for row in top_item_spark.limit(5).collect()]

logging.info("PySpark metric computed")
logging.info("Top item_ids by order count (Spark): %s", top_items_spark_rows)
