# Data Transformation Exploration (Circana → model-ready)

This notebook helps you validate the **data transformation pipeline** step-by-step before fitting Bayesian models.

## What you’ll get confidence in

- Input files load correctly (Circana CSV format)
- Expected columns exist after transformation
- Prices, sales, and their log-transformed columns look sensible (no unexpected zeros or outliers)
- Retailer separation behaves as expected (`retailer_filter`)
- “Missing feature” handling works (e.g., Costco, which has no promo data)

## Prereqs

- Place your raw files in `data/`:
  - `data/bjs.csv`
  - `data/sams.csv`
  - `data/costco.csv` (optional; set `COSTCO_PATH` in the configuration cell to use it)

- Install dependencies:

```bash
pip install -r requirements.txt
```


In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data_prep import ElasticityDataPrep, PrepConfig

# Widen pandas' rendering limits so wide frames are shown without truncation.
for option_name, option_value in {
    "display.max_columns": 200,
    "display.width": 180,
}.items():
    pd.set_option(option_name, option_value)

# Directory that receives every artifact written by this notebook.
RESULTS_DIR = Path("results")
# Create it (and any missing parents) up front; a pre-existing directory is fine.
os.makedirs(RESULTS_DIR, exist_ok=True)

print("Notebook environment ready")
print("results/ directory:", RESULTS_DIR.resolve())


In [None]:
# ---- Input files (edit if your filenames differ) ----
BJS_PATH = "data/bjs.csv"
SAMS_PATH = "data/sams.csv"
COSTCO_PATH = None  # e.g., "data/costco.csv"

# ---- Data preparation settings ----
# retailer_filter="All"     -> keep retailers separate (needed for hierarchical model)
# retailer_filter="Overall" -> combine into one pooled dataset

# Per-retailer feature-availability flags handed to PrepConfig.
RETAILER_FEATURES = {
    "BJs": {"has_promo": True, "has_competitor": True},
    "Sams": {"has_promo": True, "has_competitor": True},
    # "Costco": {"has_promo": False, "has_competitor": True},
}

cfg = PrepConfig(
    retailers=RETAILER_FEATURES,
    retailer_filter="All",
    include_seasonality=True,
    include_promotions=True,
    include_time_trend=True,
    verbose=True,
)

# Run the pipeline on the configured input files.
prep = ElasticityDataPrep(cfg)
df = prep.transform(bjs_path=BJS_PATH, sams_path=SAMS_PATH, costco_path=COSTCO_PATH)

# Last expression of the cell, so the notebook echoes the frame's shape.
df.shape

In [None]:
# ---- Quick inspection ----
# First ten rows of the prepared frame.
preview = df.head(10)
display(preview)

# Alphabetical column listing, preceded by the column count.
cols = df.columns.tolist()
cols.sort()
print(f"Columns ({len(cols)}):", cols, sep="\n")


In [None]:
# ---- Retailer breakdown (if applicable) ----
if "Retailer" not in df.columns:
    print("No Retailer column (likely retailer_filter='Overall' or single-retailer filtering).")
else:
    # Row count per retailer.
    retailer_counts = df["Retailer"].value_counts()
    display(retailer_counts)

    # Row count and covered date range per retailer, largest retailer first.
    retailer_coverage = df.groupby("Retailer").agg(
        n_rows=("Date", "size"),
        min_date=("Date", "min"),
        max_date=("Date", "max"),
    )
    display(retailer_coverage.sort_values("n_rows", ascending=False))


In [None]:
# ---- Numeric sanity checks ----
# Columns of interest; only those actually present in df are kept (order preserved).
key_cols = [
    name
    for name in (
        "Unit_Sales_SI",
        "Unit_Sales_PL",
        "Price_SI",
        "Price_PL",
        "Log_Unit_Sales_SI",
        "Log_Price_SI",
        "Log_Price_PL",
        "Promo_Intensity_SI",
        "Week_Number",
        "has_promo",
        "has_competitor",
    )
    if name in df.columns
]

# Summary statistics, one row per column.
key_stats = df[key_cols].describe()
display(key_stats.T)

# Quick checks for suspicious values: print the minimum of each column that exists.
min_labels = {
    "Price_SI": "Min Price_SI:",
    "Unit_Sales_SI": "Min Unit_Sales_SI:",
}
for column_name, label in min_labels.items():
    if column_name in df.columns:
        print(label, df[column_name].min())

# Missingness report: share of NaN values per column, worst first.
na_share = df[key_cols].isna().mean()
missing_rate = na_share.sort_values(ascending=False)
display(missing_rate.to_frame("missing_rate"))


In [None]:
# ---- Availability flags sanity (Costco-style missing features) ----
# Use whichever availability flags are actually present, so a frame carrying
# only one of the two flags is still checked instead of raising KeyError.
flag_cols = [c for c in ("has_promo", "has_competitor") if c in df.columns]

if "Retailer" not in df.columns:
    # Per-retailer averages need the Retailer column; say so explicitly rather
    # than blaming missing flags.
    print("No Retailer column; skipping per-retailer availability check.")
elif not flag_cols:
    print("No availability flags found (expected if you did not pass cfg.retailers).")
else:
    # Mean of each flag per retailer.
    display(df.groupby("Retailer")[flag_cols].mean())

    if "Promo_Intensity_SI" in df.columns:
        # Per-retailer spread of promo intensity, shown next to the flags so
        # the two can be compared by eye.
        display(df.groupby("Retailer")["Promo_Intensity_SI"].agg(["mean", "min", "max"]))


In [None]:
# ---- Basic plots (interim confidence) ----
# Sort chronologically so each series is drawn left-to-right in time.
df_plot = df.sort_values("Date")

# When a Retailer column is present the frame mixes rows from several
# retailers; drawing them as one line would zigzag between retailers,
# so each retailer gets its own series.
has_retailer = "Retailer" in df_plot.columns
if has_retailer:
    plot_groups = list(df_plot.groupby("Retailer"))
else:
    plot_groups = [(None, df_plot)]

# Unit sales over time, one line per retailer (or a single line when pooled).
plt.figure(figsize=(14, 4))
for retailer_name, retailer_rows in plot_groups:
    plt.plot(
        retailer_rows["Date"],
        retailer_rows["Unit_Sales_SI"],
        linewidth=1,
        label=retailer_name,
    )
plt.title("Sparkling Ice Unit Sales over time")
plt.xlabel("Date")
plt.ylabel("Unit_Sales_SI")
plt.grid(alpha=0.3)
if has_retailer:
    plt.legend(title="Retailer")
plt.show()

# Log sales against log own price, coloured per retailer so the
# retailer-specific relationships stay distinguishable.
plt.figure(figsize=(6, 5))
for retailer_name, retailer_rows in plot_groups:
    plt.scatter(
        retailer_rows["Log_Price_SI"],
        retailer_rows["Log_Unit_Sales_SI"],
        s=12,
        alpha=0.4,
        label=retailer_name,
    )
plt.title("Log Sales vs Log Own Price")
plt.xlabel("Log_Price_SI")
plt.ylabel("Log_Unit_Sales_SI")
plt.grid(alpha=0.3)
if has_retailer:
    plt.legend(title="Retailer")
plt.show()


In [None]:
# ---- Export prepared data for auditability ----
# Write the prepared frame to the results directory, without the index column.
out_path = RESULTS_DIR.joinpath("prepared_data_from_notebook.csv")
df.to_csv(out_path, index=False)
print("Wrote:", out_path)
