# Cell 1 — Title & Metadata (Markdown)

# 0.1 — EDA & Feature Engineering (config-driven)

**Goal:** Run a config-driven exploratory data analysis (EDA) and a light feature-engineering pass so the dataset is ready for modeling.

**Config:** `config/config.yaml`

In [None]:
import json
import os
import random
import re
from datetime import datetime, timezone
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

# Load config.
# When the working directory is named "notebooks" the project root is its parent;
# otherwise the working directory itself is used as the project root.
_cwd = Path.cwd()
PROJECT_DIR = _cwd.resolve().parent if _cwd.name == "notebooks" else _cwd.resolve()
CFG_PATH = PROJECT_DIR / "config" / "config.yaml"
# Raise explicitly instead of `assert`: assertions are stripped under `python -O`.
if not CFG_PATH.is_file():
    raise FileNotFoundError(f"Config not found: {CFG_PATH}")
with CFG_PATH.open("r", encoding="utf-8") as f:
    # `safe_load` returns None for an empty document; fall back to an empty mapping
    # so the `.get(...)` lookups below keep working.
    CFG = yaml.safe_load(f) or {}
if not isinstance(CFG, dict):
    raise TypeError(f"Config root must be a mapping, got {type(CFG).__name__}: {CFG_PATH}")

# Seed (defaults to 42 when `project.seed` is absent or null; 0 is a valid seed).
_seed_cfg = (CFG.get("project") or {}).get("seed")
SEED = 42 if _seed_cfg is None else int(_seed_cfg)
random.seed(SEED)
np.random.seed(SEED)
pd.options.display.max_columns = 120
pd.options.display.width = 160

CFG


In [None]:
# Resolve project paths from the `paths` config section.
# `or` fallbacks cover both a missing key and a key that is present but null in the YAML.
paths = CFG.get("paths") or {}
DATA_DIR = PROJECT_DIR / "data"
RAW_CSV = PROJECT_DIR / (paths.get("raw_csv") or "data/raw/addiction_population_data.csv")
PROCESSED_DIR = PROJECT_DIR / (paths.get("processed_dir") or "data/processed")
REPORTS_DIR = PROJECT_DIR / (paths.get("reports_dir") or "reports")
ARTIFACTS_DIR = PROJECT_DIR / (paths.get("artifacts_dir") or "artifacts")
# Optional schema file: None when `paths.schema` is not configured.
SCHEMA_PATH = PROJECT_DIR / paths["schema"] if paths.get("schema") else None

# Output directories are created up front so later cells can write without checks.
FIG_DIR = REPORTS_DIR / "figures" / "0_1_eda"
for out_dir in (PROCESSED_DIR, REPORTS_DIR, ARTIFACTS_DIR, FIG_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

RAW_CSV

In [None]:
# Read the raw CSV using the `csv_options` config section.
csv_opts = CFG.get("csv_options") or {}
sep = csv_opts.get("sep", ",")
header = csv_opts.get("header", 0)
encoding = csv_opts.get("encoding", "utf-8")
na_values = csv_opts.get("na_values", ["", "NA", "NaN", "null"])
cfg_parse_dates = csv_opts.get("parse_dates") or None
cfg_dtype = csv_opts.get("dtype") or {}

# Fail early with a clear message before any parsing is attempted.
if not RAW_CSV.exists():
    raise FileNotFoundError(f"Raw CSV not found: {RAW_CSV}")

# Header-only read to learn the column labels. It must use the same `header`
# setting as the real read below, otherwise the dtype keys are matched against
# a different row than the one that actually provides the column names.
preview_cols = pd.read_csv(RAW_CSV, nrows=0, sep=sep, header=header, encoding=encoding).columns

# Pandas dtype mapping normalization: lower-case numpy-style names are mapped to
# the pandas nullable extension dtypes; anything not listed is passed through.
dtype_map = {
    "int8": "Int8", "int16": "Int16", "int32": "Int32", "int64": "Int64",
    "float32": "Float32", "float64": "Float64",
    "boolean": "boolean", "category": "category", "string": "string"
}
# Align dtype keys with existing columns to avoid errors for columns absent from the file.
dtype_filtered = {
    col: dtype_map.get(str(dt).lower(), dt)
    for col, dt in cfg_dtype.items()
    if col in preview_cols
}

df = pd.read_csv(
    RAW_CSV,
    sep=sep,
    header=header,
    encoding=encoding,
    na_values=na_values,
    dtype=dtype_filtered if dtype_filtered else None,
    parse_dates=cfg_parse_dates
)

rows, cols = df.shape
print(f"Loaded: {RAW_CSV} -> {rows} x {cols}")
df.head(3)


In [None]:
def parse_dates_best_effort(df: pd.DataFrame) -> pd.DataFrame:
    """Try to convert date-looking text columns of *df* to datetimes, in place.

    Nothing is done when the config already lists `csv_options.parse_dates`.
    Otherwise every column whose name matches ``date|dt|time|year|month``
    (case-insensitive) is passed to ``pd.to_datetime``; a column that cannot be
    parsed is left exactly as it was.

    Args:
        df: Frame to update. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    if (CFG.get("csv_options") or {}).get("parse_dates"):
        return df
    date_like = [c for c in df.columns if re.search(r"(date|dt|time|year|month)", str(c), re.I)]
    for c in date_like:
        col = df[c]
        # Numeric columns (e.g. an integer `year`) must not be converted: to_datetime
        # reads plain numbers as offsets from the epoch, turning 2020 into a 1970 timestamp.
        # Columns that are already datetimes need no work either.
        if pd.api.types.is_numeric_dtype(col) or pd.api.types.is_datetime64_any_dtype(col):
            continue
        try:
            # Default errors="raise" replaces the deprecated errors="ignore" /
            # infer_datetime_format=True pair; the except branch keeps the
            # "leave unparseable columns untouched" behaviour.
            df[c] = pd.to_datetime(col)
        except (ValueError, TypeError, OverflowError):
            continue
    return df

def summarize(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-column profile of *df*.

    Args:
        df: Frame to profile. Any index is accepted, and the frame may be empty.

    Returns:
        A frame with the columns ``column``, ``dtype`` (dtype name as text),
        ``n_unique`` (distinct values, missing counted as a value),
        ``missing_pct`` (percentage of missing values, 2 decimals) and
        ``example`` (first row's value as text, or "" when there are no rows),
        sorted by ``dtype`` then ``column``.
    """
    if len(df) > 0:
        # Positional access: the first row is wanted regardless of its index label
        # (a label lookup of 0 fails once the index no longer starts at 0).
        example = df.iloc[0].astype(str)
    else:
        example = pd.Series("", index=df.columns, dtype=object)

    profile = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "n_unique": df.nunique(dropna=False),
        "missing_pct": (df.isna().mean() * 100).round(2),
        "example": example,
    })
    return (
        profile.rename_axis("column")
        .reset_index()
        .sort_values(["dtype", "column"])
        .reset_index(drop=True)
    )

def numeric_and_categorical(df: pd.DataFrame, max_unique_for_cat: int = 20):
    """Split the columns of *df* into "true numeric" and "categorical-like" lists.

    A numeric column with at most ``max_unique_for_cat`` distinct non-missing
    values is treated as categorical. The categorical list holds the non-numeric
    columns (as ordered by ``Index.difference``) followed by those low-cardinality
    numeric columns, without duplicates.

    Returns:
        ``(numeric_columns, categorical_columns)`` as two lists of column labels.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()

    # Partition numeric columns by cardinality in a single pass.
    low_card, high_card = [], []
    for name in numeric:
        if df[name].nunique(dropna=True) <= max_unique_for_cat:
            low_card.append(name)
        else:
            high_card.append(name)

    non_numeric = list(df.columns.difference(numeric))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    categorical = list(dict.fromkeys(non_numeric + low_card))
    return high_card, categorical

def safe_series(x: pd.Series) -> pd.Series:
    """Return a copy of *x* in which +inf and -inf are replaced by NaN."""
    infinities = [np.inf, -np.inf]
    return x.replace(to_replace=infinities, value=np.nan)

def winsorize_series(s: pd.Series, lower=0.005, upper=0.995) -> pd.Series:
    """Clip *s* to its ``lower`` and ``upper`` quantiles.

    A series with fewer than 10 non-missing values is returned as-is (the same
    object), without any clipping.
    """
    enough_data = s.notna().sum() >= 10
    if not enough_data:
        return s
    low_bound = s.quantile(lower)
    high_bound = s.quantile(upper)
    return s.clip(lower=low_bound, upper=high_bound)

def detect_population_col(df: pd.DataFrame):
    """Pick the population column of *df*.

    A column named exactly ``population`` wins. Otherwise the first column whose
    name matches ``pop|population`` (case-insensitive) is returned, or ``None``
    when no column matches.
    """
    if "population" in df.columns:
        return "population"
    pattern = re.compile(r"pop|population", re.I)
    return next((name for name in df.columns if pattern.search(str(name))), None)

def detect_count_cols(df: pd.DataFrame):
    """List the numeric columns of *df* whose name looks like a count.

    A name qualifies when it matches ``count|cases?|deaths?|events?``
    (case-insensitive). Column order is preserved.
    """
    pattern = re.compile(r"(count|cases?|deaths?|events?)", re.I)
    matches = []
    for name in df.columns:
        if pattern.search(str(name)) is None:
            continue
        if pd.api.types.is_numeric_dtype(df[name]):
            matches.append(name)
    return matches

def detect_gender_cols(df: pd.DataFrame):
    """Find column names of *df* that refer to male and to female values.

    A name matches when it contains the token ``male`` / ``female`` delimited by
    anything that is not a letter or digit (so ``_``, ``-``, spaces and the ends
    of the name all count as delimiters), or a ``_m`` / ``_f`` suffix-style token
    followed by such a delimiter. Matching is case-insensitive.

    Returns:
        ``(male_columns, female_columns)``, each in column order.
    """
    # `[^\W_]` is "a word character other than underscore", i.e. a letter or digit.
    # Plain `\b` is not usable here: it treats `_` as part of the word, so names
    # such as `male_count` or `pct_female_2020` would never match.
    male_re = re.compile(r"(?<![^\W_])male(?![^\W_])|_m(?![^\W_])", re.I)
    female_re = re.compile(r"(?<![^\W_])female(?![^\W_])|_f(?![^\W_])", re.I)
    male = [c for c in df.columns if male_re.search(str(c))]
    female = [c for c in df.columns if female_re.search(str(c))]
    return male, female


In [None]:
# Best-effort date parsing first, so the profile below reports the parsed dtypes.
df = parse_dates_best_effort(df)
schema = summarize(df)
print(f"Columns: {df.shape[1]}")
# Show the first 40 profiled columns in the notebook output.
schema.head(40)


In [None]:
# Split columns into numeric vs. categorical-like (including low-cardinality numerics).
num_cols, cat_cols = numeric_and_categorical(df)
print(f"Numeric: {len(num_cols)} | Categorical/low-card: {len(cat_cols)}")
# Preview the first eight names of each group.
num_cols[:8], cat_cols[:8]

In [None]:
# Make sure the figure directory exists before any plot is saved
# (idempotent: `exist_ok=True` makes this a no-op when it is already there).
FIG_DIR.mkdir(parents=True, exist_ok=True)

def save_histograms(df, cols, limit=12):
    """Save a 30-bin histogram PNG for each of the first ``limit`` columns in *cols*.

    Infinite and missing values are dropped before plotting. Files are written to
    ``FIG_DIR`` as ``hist_<column>.png``.
    """
    for col in cols[:limit]:
        fig, ax = plt.subplots()
        try:
            # Draw on an explicit axes instead of relying on pyplot's "current figure".
            safe_series(df[col]).dropna().hist(bins=30, ax=ax)
            ax.set_title(f"Histogram: {col}")
            ax.set_xlabel(str(col))
            ax.set_ylabel("Frequency")
            # Column names can contain characters that are not valid in file names
            # (path separators in particular would point into a missing directory).
            fname = re.sub(r'[\\/:*?"<>|]+', "_", str(col))
            fig.savefig(FIG_DIR / f"hist_{fname}.png", bbox_inches="tight")
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

def save_boxplots(df, cols, limit=10):
    """Save a vertical boxplot PNG for each of the first ``limit`` columns in *cols*.

    Infinite and missing values are dropped before plotting. Files are written to
    ``FIG_DIR`` as ``box_<column>.png``.
    """
    for col in cols[:limit]:
        # Plain float array: avoids index-label and extension-dtype surprises
        # when the data is handed to Matplotlib.
        values = safe_series(df[col]).dropna().to_numpy(dtype=float)
        fig, ax = plt.subplots()
        try:
            # Vertical is Matplotlib's default orientation. The tick label is set
            # explicitly rather than through the `labels=` / `vert=` keywords,
            # which newer Matplotlib releases deprecate.
            ax.boxplot(values)
            ax.set_xticks([1])  # a single box is drawn at position 1
            ax.set_xticklabels([str(col)])
            ax.set_title(f"Boxplot: {col}")
            # Column names can contain characters that are not valid in file names.
            fname = re.sub(r'[\\/:*?"<>|]+', "_", str(col))
            fig.savefig(FIG_DIR / f"box_{fname}.png", bbox_inches="tight")
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

def save_bar_small_cats(df, cats, limit=10, max_unique=20):
    """Save a value-count bar chart PNG for low-cardinality columns.

    Only columns from *cats* with at most ``max_unique`` distinct non-missing
    values are plotted, and at most ``limit`` of them. Missing values are shown
    as their own "NA" bar. Files are written to ``FIG_DIR`` as ``bar_<column>.png``.
    """
    small = [c for c in cats if df[c].nunique(dropna=True) <= max_unique][:limit]
    for col in small:
        values = df[col]
        # Label missing values BEFORE converting to text: once converted, a missing
        # value is just the string "nan" and can no longer be told apart or filled.
        # Going through object dtype also lets "NA" be used on categorical columns.
        labels = values.astype(object).where(values.notna(), "NA").astype(str)
        counts = labels.value_counts()
        if counts.empty:
            # Nothing to draw for a column without rows.
            continue
        fig, ax = plt.subplots()
        try:
            counts.plot(kind="bar", ax=ax)
            ax.set_title(f"Counts: {col}")
            ax.set_xlabel(str(col))
            ax.set_ylabel("Count")
            # Column names can contain characters that are not valid in file names.
            fname = re.sub(r'[\\/:*?"<>|]+', "_", str(col))
            fig.savefig(FIG_DIR / f"bar_{fname}.png", bbox_inches="tight")
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

def save_corr_heatmap(df, num_cols):
    """Save a Pearson correlation heatmap of *num_cols* to ``FIG_DIR/corr_heatmap.png``.

    Returns:
        The path of the written PNG, or ``None`` when fewer than two columns are
        available to correlate (nothing is written in that case).
    """
    if len(num_cols) < 2:
        return None
    corr = df[num_cols].corr(numeric_only=True)
    # Label the axes from the matrix that is actually drawn: `numeric_only=True`
    # drops non-numeric columns, so `num_cols` may be longer than the matrix.
    labels = [str(c) for c in corr.columns]
    if len(labels) < 2:
        return None
    ticks = list(range(len(labels)))

    fig, ax = plt.subplots()
    try:
        # Pearson coefficients lie in [-1, 1]; fixing the colour range keeps the
        # colours comparable between runs and datasets.
        image = ax.imshow(corr.to_numpy(dtype=float), aspect="auto", vmin=-1, vmax=1)
        ax.set_xticks(ticks)
        ax.set_xticklabels(labels, rotation=90)
        ax.set_yticks(ticks)
        ax.set_yticklabels(labels)
        ax.set_title("Correlation (Pearson)")
        fig.colorbar(image, ax=ax)
        fig.tight_layout()
        out = FIG_DIR / "corr_heatmap.png"
        fig.savefig(out, bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    return out

# Render every figure type for the current column split.
save_histograms(df, num_cols)
save_boxplots(df, num_cols)
save_bar_small_cats(df, cat_cols)
corr_path = save_corr_heatmap(df, num_cols)
# Show (up to) the first twelve PNG names now present in the figure directory.
sorted(fig_file.name for fig_file in FIG_DIR.glob("*.png"))[:12]


In [None]:
# Light feature engineering on a copy of the loaded frame; `created_cols`
# records the name of every column added here, in creation order.
df_feat = df.copy()
created_cols = []

# Per-capita rates: each numeric count-like column divided by population, per 100k.
pop_col = detect_population_col(df_feat)
if pop_col is not None and pd.api.types.is_numeric_dtype(df_feat[pop_col]):
    for c in detect_count_cols(df_feat):
        if c == pop_col:
            # The population column can itself match the count pattern
            # (e.g. `population_count`); a rate of it against itself is meaningless.
            continue
        rate_col = f"{c}_per_100k"
        with np.errstate(divide="ignore", invalid="ignore"):
            df_feat[rate_col] = (df_feat[c] / df_feat[pop_col]) * 1e5
        # Division by a zero population yields +/-inf; store those as missing.
        df_feat[rate_col] = safe_series(df_feat[rate_col])
        created_cols.append(rate_col)

# Gender proportions from the first male / first female column.
male_cols, female_cols = detect_gender_cols(df_feat)
# Name matching alone can pick up text columns; arithmetic needs numeric ones.
male_cols = [c for c in male_cols if pd.api.types.is_numeric_dtype(df_feat[c])]
female_cols = [c for c in female_cols if pd.api.types.is_numeric_dtype(df_feat[c])]
if male_cols and female_cols:
    mcol, fcol = male_cols[0], female_cols[0]
    male_vals = safe_series(df_feat[mcol])
    female_vals = safe_series(df_feat[fcol])
    total_col = f"{mcol}_plus_{fcol}_total"
    df_feat[total_col] = male_vals + female_vals
    with np.errstate(divide="ignore", invalid="ignore"):
        # A zero total can produce +/-inf; store those as missing as well.
        df_feat[f"prop_{mcol}"] = safe_series(male_vals / df_feat[total_col])
        df_feat[f"prop_{fcol}"] = safe_series(female_vals / df_feat[total_col])
    created_cols += [total_col, f"prop_{mcol}", f"prop_{fcol}"]

# Date parts. `is_datetime64_any_dtype` is used instead of `np.issubdtype`,
# which raises TypeError on pandas extension dtypes (category, Int64, string, ...).
date_cols = [c for c in df_feat.columns if pd.api.types.is_datetime64_any_dtype(df_feat[c])]
for c in date_cols:
    df_feat[f"{c}_year"] = df_feat[c].dt.year
    df_feat[f"{c}_month"] = df_feat[c].dt.month
    df_feat[f"{c}_quarter"] = df_feat[c].dt.quarter
    created_cols += [f"{c}_year", f"{c}_month", f"{c}_quarter"]

# Winsorized copies of every numeric column present at this point (original and
# derived). The column list is fixed before the loop, so the `_wz` columns added
# inside it are not themselves winsorized again.
for c in df_feat.select_dtypes(include=[np.number]).columns.tolist():
    wz = f"{c}_wz"
    df_feat[wz] = winsorize_series(df_feat[c])
    created_cols.append(wz)

print(f"Created features: {len(created_cols)}")
df_feat.head(3)


In [None]:
# Persist the engineered frame, a Markdown EDA report and a JSON run-metadata file.
data_cfg = CFG.get("data") or {}
write_parquet = bool(data_cfg.get("write_parquet", True))
parquet_compression = data_cfg.get("parquet_compression", "snappy")

out_csv = PROCESSED_DIR / f"{RAW_CSV.stem}.features.csv"
df_feat.to_csv(out_csv, index=False)

out_parquet = None
if write_parquet:
    out_parquet = PROCESSED_DIR / f"{RAW_CSV.stem}.features.parquet"
    df_feat.to_parquet(out_parquet, index=False, compression=parquet_compression)

# Use the notebook's own numeric/categorical split so the reported counts match
# the "incl. low-card" label and add up to the number of columns.
report_num_cols, report_cat_cols = numeric_and_categorical(df)

report_path = REPORTS_DIR / "eda_report.md"
lines = [
    f"# EDA Report — generated {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    f"- Source: `{RAW_CSV}`",
    f"- Shape: {df.shape[0]} x {df.shape[1]}",
    f"- Numeric columns: {len(report_num_cols)}",
    f"- Categorical (incl. low-card): {len(report_cat_cols)}",
]
if pop_col:
    lines.append(f"- Population column: `{pop_col}`")
if created_cols:
    preview = ", ".join(created_cols[:15]) + (" ..." if len(created_cols) > 15 else "")
    lines.append(f"- Created features: {preview}")
lines.append("")
lines.append("## Figures")
for fig_path in sorted(FIG_DIR.glob("*.png")):
    try:
        shown = fig_path.relative_to(PROJECT_DIR)
    except ValueError:
        # The reports directory may be configured outside the project root;
        # fall back to the full path instead of failing the whole cell.
        shown = fig_path
    lines.append(f"- {shown}")
report_path.write_text("\n".join(lines), encoding="utf-8")

# Run metadata (for traceability / MLflow-friendly)
run_meta = {
    # Timezone-aware "now" (utcnow() is deprecated); the trailing "Z" format is kept.
    "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "seed": SEED,
    "inputs": {"raw_csv": str(RAW_CSV)},
    "outputs": {
        "csv": str(out_csv),
        "parquet": str(out_parquet) if out_parquet else None,
        "report": str(report_path),
        "fig_dir": str(FIG_DIR),
    },
    "config": {
        "project": CFG.get("project"),
        "csv_options": CFG.get("csv_options"),
        "paths": CFG.get("paths"),
        "data": CFG.get("data"),
    },
}
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
meta_path = ARTIFACTS_DIR / "eda_run_meta.json"
# `default=str` is deliberate: the config is echoed verbatim and YAML can yield
# values JSON cannot encode (e.g. dates); for a traceability record their text
# form is sufficient.
meta_path.write_text(json.dumps(run_meta, indent=2, default=str), encoding="utf-8")

(out_csv, out_parquet, report_path, meta_path)


# Cell 11 — Next steps (Markdown)

- Promote Cells 5–9 to `src/features/build_features.py` (config-driven).
- Add `make eda` target + pytest data checks (schema, ranges, missing).
- If `paths.schema` exists, validate dtypes and allowed categories against it.
