<details>
<summary style="
    cursor:pointer;background:#f7f7fb;border:1px solid #e5e7eb;
    padding:10px 12px;border-radius:10px;font-weight:700;"> 

Combine all these **EDA relationship explorers** (categorical, numeric, interactions, churn, ANOVA, Tukey) into a **single reusable `eda_explorer.py` module** that you can import into any notebook?

</summary> 

Perfect 🔥 — this will finish your **EDA relationship suite** with a focused comparison of **numeric features vs churn**.

Awesome — here’s a single, reusable module you can drop into `src/eda_explorer.py`.
It wraps everything we built: numeric summary (with skew/kurtosis), categorical summary (with entropy), categorical↔churn explorer (with Cramér’s V & chi-square), categorical×categorical interactions, continuous×categorical ANOVA + Tukey HSD, and numeric vs churn (with Welch t-test & Cohen’s d) — plus tidy plotting helpers.

> Save as: **`src/eda_explorer.py`**

```python
# -*- coding: utf-8 -*-
"""
EDA Explorer — Telco-ready utilities for fast, professional EDA.

Functions cover:
- Numeric descriptive summary (with skew & kurtosis)
- Categorical descriptive summary (counts, proportions, entropy)
- Categorical vs Churn explorer (Cramér’s V, chi², churn lift)
- Categorical × Categorical interactions vs Churn
- Continuous × Categorical ANOVA and Tukey HSD
- Numeric vs Churn (Welch t-test, Cohen's d)
- Matplotlib/Seaborn plotting helpers

Author: you
"""

from __future__ import annotations
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import chi2_contingency, ttest_ind, f_oneway
try:
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    _HAS_SM = True
except Exception:
    _HAS_SM = False

# ---------------------------
# Core Helpers
# ---------------------------

def _numeric_cols(df: pd.DataFrame, exclude: List[str] = None) -> List[str]:
    exclude = set((exclude or []))
    return [c for c in df.select_dtypes(include="number").columns if c not in exclude]

def _cat_cols(df: pd.DataFrame, exclude: List[str] = None) -> List[str]:
    exclude = set((exclude or []))
    return [c for c in df.select_dtypes(include=["object", "category"]).columns if c not in exclude]

def _cramers_v(ct: pd.DataFrame) -> Tuple[float, float]:
    """Return (Cram√©r's V, p-value) for a contingency table."""
    chi2, p, dof, exp = chi2_contingency(ct)
    n = ct.values.sum()
    k = min(ct.shape) - 1
    if n == 0 or k <= 0:
        return np.nan, p
    v = float(np.sqrt((chi2 / n) / k))
    return v, float(p)

def _cohens_d(x: pd.Series, y: pd.Series) -> float:
    nx, ny = len(x), len(y)
    if nx < 2 or ny < 2:
        return np.nan
    sx, sy = x.std(ddof=1), y.std(ddof=1)
    pooled = np.sqrt(((nx - 1) * sx**2 + (ny - 1) * sy**2) / (nx + ny - 2))
    if pooled == 0:
        return 0.0
    return float((x.mean() - y.mean()) / pooled)

# ---------------------------
# 1) Numeric descriptive summary
# ---------------------------

def numeric_summary(
    df: pd.DataFrame,
    exclude_cols: List[str] = None,
    round_: int = 2
) -> pd.DataFrame:
    """
    Summarise numeric columns: ``describe().T`` plus ``skew`` and ``kurtosis``.

    Columns named in ``exclude_cols`` are left out. The result is rounded to
    ``round_`` decimals; an empty DataFrame is returned when no numeric column
    remains.
    """
    selected = _numeric_cols(df, exclude=exclude_cols)
    if len(selected) == 0:
        return pd.DataFrame()

    numeric_frame = df[selected]
    summary = numeric_frame.describe().T
    # Shape statistics are appended after the describe() columns.
    summary["skew"] = numeric_frame.skew(numeric_only=True)
    summary["kurtosis"] = numeric_frame.kurtosis(numeric_only=True)
    summary = summary.round(round_)
    return summary

def plot_numeric_summary_style(desc: pd.DataFrame):
    """
    Return a pandas Styler for a numeric summary table (optional cosmetics).

    ``desc`` is expected to carry the ``mean``, ``50%`` and ``std`` columns
    produced by ``describe().T``. An empty input is returned unchanged.
    """
    if desc.empty:
        return desc
    return (desc.style
        .background_gradient(cmap="Blues", subset=["mean", "50%"])
        .background_gradient(cmap="Purples", subset=["std"])
        .format(precision=2)
        .set_caption("📊 Descriptive Statistics for Numeric Features")
    )

# ---------------------------
# 2) Categorical descriptive summary
# ---------------------------

def categorical_summary(
    df: pd.DataFrame,
    exclude_cols: List[str] = None
) -> pd.DataFrame:
    """
    Return one row per categorical column with unique count, top category,
    top category share (%), and Shannon entropy in bits.

    Missing values count as their own category for the top/entropy figures
    (``value_counts(dropna=False)``) but are not counted in ``unique``.
    Rows are sorted by entropy, highest first. When there is nothing to
    summarise, an empty frame with the usual columns is returned.
    """
    columns = ["feature", "unique", "top_category", "top_%", "entropy(bits)"]
    cats = _cat_cols(df, exclude=exclude_cols)
    rows = []
    n = len(df)
    for col in cats:
        vc = df[col].value_counts(dropna=False)
        # Category dtype lists unobserved categories with a zero count;
        # 0 * log2(0) would turn the entropy into NaN, so drop them.
        vc = vc[vc > 0]
        if vc.empty:
            continue
        top = vc.index[0]
        top_pct = float(vc.iloc[0]) / n if n > 0 else np.nan
        n_unique = int(df[col].nunique(dropna=True))
        p = (vc / vc.sum()).astype(float)
        entropy = float(-(p * np.log2(p)).sum())
        rows.append({
            "feature": col,
            "unique": n_unique,
            "top_category": str(top),
            "top_%": round(top_pct * 100, 1),
            "entropy(bits)": round(entropy, 2),
        })
    if not rows:
        # sort_values on a column-less frame would raise KeyError.
        return pd.DataFrame(columns=columns)
    out = (pd.DataFrame(rows, columns=columns)
           .sort_values("entropy(bits)", ascending=False)
           .reset_index(drop=True))
    return out

def plot_categorical_summary_style(cat_df: pd.DataFrame):
    """
    Return a pandas Styler for a categorical summary table (optional cosmetics).

    ``cat_df`` is expected to carry the ``top_%`` and ``entropy(bits)`` columns.
    An empty input is returned unchanged.
    """
    if cat_df.empty:
        return cat_df
    return (cat_df.style
        .bar(subset=["top_%"], color="#93c5fd", vmin=0, vmax=100)
        .background_gradient(subset=["entropy(bits)"], cmap="Purples")
        .set_caption("📋 Categorical Descriptive Statistics")
    )

# ---------------------------
# 3) Categorical vs Churn
# ---------------------------

def categorical_vs_churn(
    df: pd.DataFrame,
    churn_str: str = "Churn",      # "Yes"/"No"
    churn_bin: str = "Churn_flag", # 0/1
    topk_cats: int = 12,
    exclude: List[str] = None
) -> Tuple[Dict[str, pd.DataFrame], pd.DataFrame]:
    """
    For each categorical feature, compute churn rate per category,
    churn lift vs global, and Cramér's V + chi2 p-value.

    The target columns (``churn_str``, ``churn_bin``), ``customerID`` and any
    names in ``exclude`` are skipped. Names are matched case-insensitively, so
    ``"customerid"`` also excludes ``"customerID"``.

    Returns:
      per_feature_tables: dict[feature] -> table
      summary_rank: ranking of features by max_abs_delta, cramers_v
    """
    summary_columns = [
        "feature", "n_categories_total", "n_categories_evaluated",
        "global_churn_rate", "max_abs_delta", "cramers_v", "chi2_p",
    ]
    # Always drop the target/id columns, even when the caller supplies
    # additional exclusions; compare lower-cased so real column casing matches.
    skip = {str(name).lower() for name in (exclude or [])}
    skip |= {str(churn_str).lower(), str(churn_bin).lower(), "customerid"}
    cats = [c for c in _cat_cols(df) if str(c).lower() not in skip]

    per_feature = {}
    rows = []
    global_rate = float(df[churn_bin].mean())

    for col in cats:
        vc = df[col].value_counts(dropna=False)
        cats_eval = vc.index[:topk_cats]
        sub = df[df[col].isin(cats_eval)]

        ct = pd.crosstab(sub[col], sub[churn_str])
        # Association is only defined for a table with 2 outcomes and >= 2 categories.
        v, p = _cramers_v(ct) if ct.shape[1] == 2 and ct.shape[0] >= 2 else (np.nan, np.nan)

        grp = sub.groupby(col)[churn_bin].agg(["mean", "count"]).rename(columns={"mean": "churn_rate"})
        grp["category"] = grp.index.astype(str)
        grp["feature"] = col
        grp["global_rate"] = global_rate
        grp["delta_vs_global"] = grp["churn_rate"] - global_rate
        grp["abs_delta"] = grp["delta_vs_global"].abs()
        grp["cramers_v"] = v
        grp["chi2_p"] = p
        per_feature[col] = grp.sort_values("abs_delta", ascending=False).reset_index(drop=True)

        rows.append({
            "feature": col,
            "n_categories_total": int(vc.shape[0]),
            "n_categories_evaluated": int(len(cats_eval)),
            "global_churn_rate": global_rate,
            "max_abs_delta": float(grp["abs_delta"].max()),
            "cramers_v": v,
            "chi2_p": p,
        })

    if not rows:
        # sort_values on a column-less frame would raise KeyError.
        return per_feature, pd.DataFrame(columns=summary_columns)

    summary = (pd.DataFrame(rows, columns=summary_columns)
               .sort_values(["max_abs_delta", "cramers_v"], ascending=[False, False])
               .reset_index(drop=True))
    return per_feature, summary

def plot_churn_bar(tbl: pd.DataFrame, title_feature_name: Optional[str] = None, top_n: int = 12):
    """
    Plot churn rates by category for a single feature table (from categorical_vs_churn).

    Shows the ``top_n`` categories with the largest absolute deviation from the
    global churn rate, with the global rate drawn as a dashed reference line.
    Does nothing when the table has no rows to plot.
    """
    from matplotlib.ticker import PercentFormatter

    t = tbl.nlargest(top_n, "abs_delta").copy()
    if t.empty:
        return

    label = title_feature_name or t["feature"].iloc[0]
    sns.set(style="whitegrid")
    plt.figure(figsize=(8, 4.5))
    ax = sns.barplot(data=t, x="category", y="churn_rate", color="#ef4444", edgecolor="black")
    ax.axhline(t["global_rate"].iloc[0], color="#3b82f6", linestyle="--", label="Global churn")
    ax.set_title(f"{label} — Churn Rate by Category\n"
                 f"(Cramér’s V={t['cramers_v'].iloc[0]:.3f}, p={t['chi2_p'].iloc[0]:.3g})")
    ax.set_xlabel(label)
    ax.set_ylabel("Churn rate")
    ax.set_ylim(0, max(0.001, float(t["churn_rate"].max()) * 1.15))
    ax.legend()
    # A formatter keeps labels in sync with the tick locations; overwriting the
    # tick labels directly can drift once the layout is recomputed.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
    plt.xticks(rotation=20, ha="right")
    plt.tight_layout()
    plt.show()

# ---------------------------
# 4) Categorical × Categorical interactions
# ---------------------------

def interactions_categorical(
    df: pd.DataFrame,
    churn_str: str = "Churn",
    churn_bin: str = "Churn_flag",
    topk_cats: int = 8,
    max_pairs: int = 30,
    exclude: List[str] = None
) -> Tuple[Dict[Tuple[str, str], pd.DataFrame], pd.DataFrame]:
    """
    Evaluate feature pairs (cat × cat) vs churn; return per-pair tables and ranking.

    The target columns (``churn_str``, ``churn_bin``), ``customerID`` and any
    names in ``exclude`` are skipped, matched case-insensitively. Only the first
    ``max_pairs`` column pairs and the ``topk_cats`` most frequent levels of each
    column are evaluated.

    Returns:
      tables: dict[(colA, colB)] -> table indexed by the combined level label
      summary: one row per pair, ranked by max_abs_delta then cramers_v
    """
    import itertools

    summary_columns = ["pair", "n_combos", "max_abs_delta", "cramers_v", "chi2_p"]
    # Lower-case both sides so "Churn"/"customerID" are really excluded.
    skip = {str(name).lower() for name in (exclude or [])}
    skip |= {str(churn_str).lower(), str(churn_bin).lower(), "customerid"}
    cats = [c for c in _cat_cols(df) if str(c).lower() not in skip]
    # islice avoids materialising every combination before truncating.
    pairs = list(itertools.islice(itertools.combinations(cats, 2), max_pairs))

    tables = {}
    rows = []
    global_rate = float(df[churn_bin].mean())

    for colA, colB in pairs:
        topA = df[colA].value_counts().index[:topk_cats]
        topB = df[colB].value_counts().index[:topk_cats]
        sub = df[df[colA].isin(topA) & df[colB].isin(topB)].copy()
        if sub.empty:
            continue
        # NOTE(review): separator literal reproduced unchanged; callers may
        # parse pair/combo labels by splitting on it.
        sub["combo"] = sub[colA].astype(str) + " √ó " + sub[colB].astype(str)
        ct = pd.crosstab(sub["combo"], sub[churn_str])
        v, p = _cramers_v(ct) if ct.shape[1] == 2 and ct.shape[0] >= 2 else (np.nan, np.nan)

        grp = sub.groupby("combo")[churn_bin].agg(["mean", "count"]).rename(columns={"mean": "churn_rate"})
        grp["pair"] = f"{colA} √ó {colB}"
        grp["global_rate"] = global_rate
        grp["delta"] = grp["churn_rate"] - global_rate
        grp["abs_delta"] = grp["delta"].abs()
        grp["cramers_v"] = v
        grp["chi2_p"] = p
        tables[(colA, colB)] = grp

        rows.append({
            "pair": f"{colA} √ó {colB}",
            "n_combos": int(len(grp)),
            "max_abs_delta": float(grp["abs_delta"].max()),
            "cramers_v": v,
            "chi2_p": p,
        })

    if not rows:
        # sort_values on a column-less frame would raise KeyError.
        return tables, pd.DataFrame(columns=summary_columns)

    summary = (pd.DataFrame(rows, columns=summary_columns)
               .sort_values(["max_abs_delta", "cramers_v"], ascending=[False, False])
               .reset_index(drop=True))
    return tables, summary

def plot_interaction(tbl: pd.DataFrame, top_n: int = 10):
    """
    Bar plot for a single interaction table (from interactions_categorical).

    Shows the ``top_n`` level combinations with the largest absolute deviation
    from the global churn rate. Does nothing when there are no rows to plot.
    """
    t = tbl.nlargest(top_n, "abs_delta").copy()
    if t.empty:
        return
    if "combo" not in t.columns:
        # interactions_categorical returns tables indexed by the combo label;
        # expose it as a regular column so it can be used as the x variable.
        t = t.rename_axis("combo").reset_index()

    sns.set(style="whitegrid")
    plt.figure(figsize=(8, 4.5))
    sns.barplot(data=t, x="combo", y="churn_rate", color="#ef4444", edgecolor="black")
    plt.axhline(t["global_rate"].iloc[0], color="#3b82f6", linestyle="--", label="Global churn")
    plt.xticks(rotation=20, ha="right")
    plt.ylabel("Churn rate")
    plt.xlabel(t["pair"].iloc[0])
    plt.title(f"{t['pair'].iloc[0]}\nCramér’s V={t['cramers_v'].iloc[0]:.3f}, p={t['chi2_p'].iloc[0]:.3g}")
    plt.legend()
    plt.tight_layout()
    plt.show()

# ---------------------------
# 5) Continuous × Categorical (ANOVA) + Tukey
# ---------------------------

def anova_continuous_vs_categorical(
    df: pd.DataFrame,
    numeric_cols: List[str],
    cat_cols: List[str],
    max_cats: int = 6
) -> pd.DataFrame:
    """
    For each numeric × categorical pair, run a one-way ANOVA over the
    ``max_cats`` most frequent categories and report ``mean_diff``
    (max - min group mean over those same categories).

    Categories with no non-missing numeric values are left out of the test.
    ``n_groups`` is the total number of distinct categories in the column,
    not the number tested. Pairs whose p-value is undefined are dropped.
    Returns an empty frame with the usual columns when no pair qualifies.
    """
    columns = ["numeric", "categorical", "mean_diff", "anova_F", "p_value", "n_groups"]
    rows = []
    for num_col in numeric_cols:
        for cat_col in cat_cols:
            n_groups = int(df[cat_col].nunique())
            if n_groups < 2:
                continue
            top = df[cat_col].value_counts().index[:max_cats]
            samples = (df.loc[df[cat_col] == c, num_col].dropna() for c in top)
            groups = [g for g in samples if len(g) > 0]
            if len(groups) < 2:
                continue
            try:
                stat, p = f_oneway(*groups)
            except (ValueError, TypeError):
                stat, p = np.nan, np.nan
            # Spread is measured on exactly the groups that were tested.
            means = [float(g.mean()) for g in groups]
            rows.append({
                "numeric": num_col,
                "categorical": cat_col,
                "mean_diff": max(means) - min(means),
                "anova_F": float(stat),
                "p_value": float(p),
                "n_groups": n_groups,
            })
    if not rows:
        # dropna(subset=...) on a column-less frame would raise KeyError.
        return pd.DataFrame(columns=columns)
    return (pd.DataFrame(rows, columns=columns)
            .dropna(subset=["p_value"])
            .sort_values(["mean_diff", "anova_F"], ascending=[False, False])
            .reset_index(drop=True))

def tukey_posthoc(
    df: pd.DataFrame,
    numeric: str,
    categorical: str,
    max_cats: int = 6
) -> Optional[pd.DataFrame]:
    """
    Tukey HSD for a given numeric × categorical pair (requires statsmodels).

    Only the ``max_cats`` most frequent categories are compared, after dropping
    rows with a missing value in either column. Returns one row per pair of
    groups, or ``None`` when fewer than two groups remain.

    Raises:
        ImportError: if statsmodels could not be imported.
    """
    if not _HAS_SM:
        raise ImportError("statsmodels is required for Tukey HSD. Install: pip install statsmodels")
    top = df[categorical].value_counts().index[:max_cats]
    sub = df[df[categorical].isin(top)][[numeric, categorical]].dropna()
    if sub[categorical].nunique() < 2:
        return None
    tukey = pairwise_tukeyhsd(endog=sub[numeric], groups=sub[categorical], alpha=0.05)
    # summary() is the public accessor for the results table (header row first).
    table = tukey.summary().data
    res_df = pd.DataFrame(table[1:], columns=table[0])
    res_df["numeric"] = numeric
    res_df["categorical"] = categorical
    # Normalize types
    res_df["meandiff"] = res_df["meandiff"].astype(float)
    res_df["p-adj"] = res_df["p-adj"].astype(float)
    res_df["reject"] = res_df["reject"].astype(bool)
    return res_df

def plot_num_vs_cat(df: pd.DataFrame, num_col: str, cat_col: str, top_k: int = 6):
    """
    Draw a boxplot of ``num_col`` for the ``top_k`` most frequent levels of ``cat_col``.

    Boxes are ordered by category frequency, most frequent first.
    """
    frequent_levels = df[cat_col].value_counts().index[:top_k]
    in_top = df[cat_col].isin(frequent_levels)
    plot_data = df[in_top]

    sns.set(style="whitegrid")
    plt.figure(figsize=(7.5, 4.5))
    sns.boxplot(
        data=plot_data,
        x=cat_col,
        y=num_col,
        palette="pastel",
        order=frequent_levels,
    )
    plt.title(f"{num_col} by {cat_col}")
    plt.xticks(rotation=20, ha="right")
    plt.tight_layout()
    plt.show()

# ---------------------------
# 6) Numeric vs Churn
# ---------------------------

def numeric_vs_churn(
    df: pd.DataFrame,
    churn_bin: str = "Churn_flag",
    numeric_cols: List[str] = None
) -> pd.DataFrame:
    """
    Compare numeric features between churned vs non-churned:
    Welch t-test + Cohen's d + mean differences.

    ``diff`` and ``cohen_d`` are both oriented as churned (1) minus retained (0).
    Features with fewer than two non-missing values in either group are skipped.
    Rows are sorted by p-value; an empty frame with the usual columns is
    returned when no feature qualifies.
    """
    columns = ["feature", "mean_churn0", "mean_churn1", "diff", "p_value", "cohen_d"]
    if numeric_cols is None:
        numeric_cols = _numeric_cols(df, exclude=[churn_bin])

    # Group masks do not depend on the feature, so build them once.
    retained = df[churn_bin] == 0
    churned = df[churn_bin] == 1

    rows = []
    for col in numeric_cols:
        x0 = df.loc[retained, col].dropna()
        x1 = df.loc[churned, col].dropna()
        if len(x0) < 2 or len(x1) < 2:
            continue
        _, pval = ttest_ind(x0, x1, equal_var=False)
        d = _cohens_d(x1, x0)
        rows.append({
            "feature": col,
            "mean_churn0": float(x0.mean()),
            "mean_churn1": float(x1.mean()),
            "diff": float(x1.mean() - x0.mean()),
            "p_value": float(pval),
            "cohen_d": float(d),
        })
    if not rows:
        # sort_values on a column-less frame would raise KeyError.
        return pd.DataFrame(columns=columns)
    return (pd.DataFrame(rows, columns=columns)
            .sort_values("p_value")
            .reset_index(drop=True))

def plot_numeric_vs_churn(df: pd.DataFrame, col: str, churn_bin: str = "Churn_flag"):
    """
    Draw a violin plot of ``col`` split by the binary churn flag ``churn_bin``.

    The two x positions are relabelled "No Churn" and "Churn".
    """
    group_colors = ["#93c5fd", "#f87171"]
    group_labels = ["No Churn", "Churn"]

    sns.set(style="whitegrid")
    plt.figure(figsize=(7.5, 4.5))
    sns.violinplot(data=df, x=churn_bin, y=col, palette=group_colors, cut=0)
    plt.title(f"{col} Distribution by Churn")
    plt.xticks([0, 1], group_labels)
    plt.xlabel("")
    plt.tight_layout()
    plt.show()
```

---

## Quick usage example (in your notebook)

```python
from src.eda_explorer import *
# Underscore-prefixed helpers are not exported by `import *`; import explicitly.
from src.eda_explorer import _cat_cols

# Numeric summary
desc = numeric_summary(df, exclude_cols=["customerID"])
display(plot_numeric_summary_style(desc))

# Categorical summary
cat_df = categorical_summary(df, exclude_cols=["customerID", "Churn"])
display(plot_categorical_summary_style(cat_df))

# Categorical vs Churn
per_feat, cat_rank = categorical_vs_churn(df, churn_str="Churn", churn_bin="Churn_flag", topk_cats=12)
display(cat_rank.style.format({'global_churn_rate':'{:.2%}','max_abs_delta':'{:.2%}','cramers_v':'{:.3f}','chi2_p':'{:.3g}'}))
# Plot the top feature
top_feature = cat_rank.loc[0, "feature"]
plot_churn_bar(per_feat[top_feature], title_feature_name=top_feature)

# Interactions (cat × cat)
tables, pairs_rank = interactions_categorical(df, churn_str="Churn", churn_bin="Churn_flag", topk_cats=8, max_pairs=30)
display(pairs_rank.style.format({'max_abs_delta':'{:.2%}','cramers_v':'{:.3f}','chi2_p':'{:.3g}'}))
# Plot the top interaction: find its table by the pair label each table carries,
# rather than parsing the label (column names may contain the separator).
first_pair_name = pairs_rank.loc[0, "pair"]
pair_key = next(key for key, tbl in tables.items() if tbl["pair"].iloc[0] == first_pair_name)
plot_interaction(tables[pair_key])

# ANOVA: numeric × categorical
anova_df = anova_continuous_vs_categorical(df, numeric_cols=['tenure','MonthlyCharges','TotalCharges'],
                                           cat_cols=_cat_cols(df, exclude=['customerID','Churn']), max_cats=6)
display(anova_df.style.format({'mean_diff':'{:.2f}','anova_F':'{:.1f}','p_value':'{:.3g}'}))

# Tukey HSD (optional, requires statsmodels)
# from src.eda_explorer import tukey_posthoc
# tuk = tukey_posthoc(df, numeric='MonthlyCharges', categorical='InternetService', max_cats=6)
# display(tuk.head())

# Numeric vs Churn (Welch t-test + Cohen's d)
num_vs = numeric_vs_churn(df, churn_bin="Churn_flag", numeric_cols=['tenure','MonthlyCharges','TotalCharges'])
display(num_vs.style.format({'mean_churn0':'{:.2f}','mean_churn1':'{:.2f}','diff':'{:+.2f}','p_value':'{:.3g}','cohen_d':'{:.2f}'}))
for col in ['tenure','MonthlyCharges','TotalCharges']:
    plot_numeric_vs_churn(df, col, churn_bin="Churn_flag")
```

---

### Notes

* Optional dependency for Tukey HSD: `pip install statsmodels`
* All functions are **dataset-agnostic**; the Telco defaults (e.g., `Churn`, `Churn_flag`) are parameters.
* Keep your earlier **guard cell** to ensure `Churn_flag` exists and dtypes are correct before calling these.

In [None]:
def atomic_append_csv(df_like, path: "Path"):
    """Append dataframe-like rows to CSV atomically (temp file + replace).

    ``df_like`` is either an object with ``to_csv`` (e.g. a DataFrame) or a
    single record (e.g. a dict), which is wrapped into a one-row DataFrame.
    The existing file content plus the new rows are written to a temporary
    file in the same directory, which then replaces ``path`` in one step.

    A header row is written only when ``path`` is missing or empty; appended
    rows are written without a header, in the column order of ``df_like``
    (columns are not realigned against the existing file).
    """
    import os
    import tempfile
    from pathlib import Path

    path = Path(path)
    rows = df_like if hasattr(df_like, "to_csv") else pd.DataFrame([df_like])
    path.parent.mkdir(parents=True, exist_ok=True)

    try:
        existing = path.read_bytes()
    except FileNotFoundError:
        existing = b""

    payload = rows.to_csv(index=False, header=not existing).encode("utf-8")
    # Guard against an existing file whose last line lacks a terminator,
    # which would otherwise glue the first new row onto the last old one.
    if existing and not existing.endswith((b"\n", b"\r")):
        existing += b"\n"

    fd, tmp_name = tempfile.mkstemp(dir=path.parent, suffix=".csv")
    try:
        with os.fdopen(fd, "wb") as tmp:
            tmp.write(existing)
            tmp.write(payload)
            tmp.flush()
            os.fsync(tmp.fileno())
        os.replace(tmp_name, path)
    except BaseException:
        # Never leave a stray temp file behind if the write or swap fails.
        try:
            os.remove(tmp_name)
        except FileNotFoundError:
            pass
        raise


In [None]:
# 2.12.1-2 🎯 Target & Demographic Diagnostics (Churn + SeniorCitizen)
# 2.12 block covering both the Churn Imbalance and the SeniorCitizen Audit diagnostics;
# audit rows are appended to the unified report in the same atomic/schema-aligned style
# used by the earlier sections.
print("\n2.12.1) 🎯 Target & Demographic Diagnostics")

import os
import pandas as pd
from IPython.display import display

# Guard: this cell relies on objects created by earlier notebook cells.
assert "df" in globals(), "df not defined"
assert "SECTION2_REPORT_PATH" in globals(), "SECTION2_REPORT_PATH not defined"
REPORT_PATH = SECTION2_REPORT_PATH  # unified CSV you’ve been appending to

def _atomic_append(path: pd.Path | str, chunk: pd.DataFrame):
    """Append chunk to unified CSV with schema alignment (atomic replace)."""
    path = pd.Path(path) if not isinstance(path, pd.Path) else path
    tmp  = path.with_suffix(path.suffix + ".tmp")
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        if path.exists():
            existing = pd.read_csv(path)
            all_cols = pd.Index(existing.columns).union(chunk.columns)
            out = pd.concat(
                [existing.reindex(columns=all_cols), chunk.reindex(columns=all_cols)],
                ignore_index=True
            )
        else:
            out = chunk
        # tidy a few numeric columns if present
        for col in ("percent", "imbalance_ratio", "pct_inconsistent", "top_freq", "pct_not_allowed"):
            if col in out.columns:
                out[col] = pd.to_numeric(out[col], errors="coerce").round(4)
        out.to_csv(tmp, index=False)
        os.replace(tmp, path)
        print(f"üßæ Appended diagnostics ‚Üí {path}")
    except Exception as e:
        try:
            if tmp.exists(): tmp.unlink(missing_ok=True)
        except Exception:
            pass
        print(f"‚ö†Ô∏è Could not append diagnostics: {e}")

In [None]:
# Multi-phase categorical QA, application, and verification.



Short answer: combining them in one cell **works**, but the **most effective, professional** approach is to keep them as **two steps** with **shared utilities** — i.e., factor common logic into a small module and call it from the notebook.

Here’s how I’d do it, ranked:

# 1) Best-practice (production-ready)

* **Separate concerns**

  * **2.7.1 Pre-apply** = detect/counterexample audit *before* you mutate.
  * **2.7.2 Post-apply** = verify conformance *after* your cleaning.
* **DRY utilities** (one source of truth): normalization, allowlist check, atomic writer, schema alignment, float rounding, timestamping.
* **Config-driven**: keep `ALLOWED` in YAML (and load it), protect/target columns in config.
* **Testable**: unit tests for normalization and allowlist logic; smoke test for the writer.
* **Notebook = orchestration only**: call functions, display previews, and link artifacts.

Minimal skeleton:

```
project/
├─ src/telco_quality/
│  ├─ __init__.py
│  ├─ categorical.py          # normalize_text, pre_validate, post_validate
│  └─ report_io.py            # append_atomic, shape_schema
├─ config/
│  └─ allowed_values.yaml
└─ notebooks/
   └─ 01_EDA.ipynb
```

```python
# src/telco_quality/categorical.py
import pandas as pd

def normalize_text(s: pd.Series) -> pd.Series:
    """Return ``s`` as pandas strings with whitespace normalised.

    Non-breaking spaces become regular spaces, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped.
    """
    as_text = s.astype("string")
    without_nbsp = as_text.str.replace("\u00A0", " ", regex=False)
    collapsed = without_nbsp.str.replace(r"\s+", " ", regex=True)
    return collapsed.str.strip()

def pre_validate(df, allowed: dict | None, protect: set, target: set) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Pre-apply categorical audit (skeleton only; body not implemented here).

    Intended to return ``(summary_df, issues_df)``.
    NOTE(review): presumably ``allowed`` maps column name -> allowed values and
    ``protect``/``target`` are sets of column names; confirm when implementing.
    """
    # returns (summary_df, issues_df)
    ...

def post_validate(df, allowed: dict | None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Post-apply categorical verification (skeleton only; body not implemented here).

    Intended to return ``(summary_df, issues_df)``.
    NOTE(review): presumably ``allowed`` maps column name -> allowed values;
    confirm when implementing.
    """
    # returns (summary_df, issues_df)
    ...
```

```python
# src/telco_quality/report_io.py
import os, pandas as pd
def append_atomic(chunk: pd.DataFrame, path) -> None:
    """Append ``chunk`` to the CSV at ``path`` via a temp file and atomic replace.

    ``path`` may be a ``str`` or a ``pathlib.Path``. When the file already
    exists, old and new rows are aligned on the union of their columns before
    being written back. The sibling ``.tmp`` file is removed if writing fails,
    and the original error is re-raised.
    """
    from pathlib import Path

    path = Path(path)
    tmp = path.with_suffix(path.suffix + ".tmp")
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        existing = pd.read_csv(path)
        all_cols = pd.Index(existing.columns).union(chunk.columns)
        out = pd.concat([existing.reindex(columns=all_cols),
                         chunk.reindex(columns=all_cols)], ignore_index=True)
    else:
        out = chunk
    try:
        out.to_csv(tmp, index=False)
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a half-written temp file next to the report.
        tmp.unlink(missing_ok=True)
        raise

Notebook call (clean and readable):

```python
# Notebook orchestration sketch: import the shared utilities, run both
# validation stages, and append each summary to the unified report.
from pathlib import Path
from telco_quality.categorical import pre_validate, post_validate
from telco_quality.report_io import append_atomic

SECTION2_REPORT_PATH = Path("Level_3/reports/section2_data_quality.csv")

# NOTE(review): `df` and `ALLOWED` are assumed to be defined by earlier cells.
pre_sum, pre_issues   = pre_validate(df, ALLOWED, {"customerID"}, {"Churn"})
post_sum, post_issues = post_validate(df, ALLOWED)

append_atomic(pre_sum,  SECTION2_REPORT_PATH)
append_atomic(post_sum, SECTION2_REPORT_PATH)
# `...` below is a placeholder for the sort key(s) to preview by.
display(pre_sum.sort_values(...).head(25))
display(post_sum.sort_values(...).head(20))

**Why this is “best”:**

* Clean boundaries, fewer bugs, easy to extend.
* Reuse *one* normalization function + *one* writer across many sections.
* Trivially unit-testable and interview-ready.

# 2) Good compromise (your current notebook, no `def`)

If you **must** avoid functions, your combined cell is fine. To make it more “pro” without defs:

* **Single config block** at the top (paths, `protect_cols`, `target_cols`, `has_allow`, timestamp).
* **One normalization snippet** copied verbatim in both loops (pre & post) to stay deterministic.
* **One atomic writer** for both chunks (you did this).
* **Stable column order**: shape to a unified schema *once* and reuse that list.
* **Section tags**: use `"2.7.1_pre_validate"` and `"2.7.2_post_validate"` consistently.

# 3) Should you combine the two steps?

* **Purpose:** yes, they serve **different purposes** (pre-apply diagnostics vs post-apply verification). Keep them as **two labeled stages** even if they live in the same cell.
* **Combining** is OK for convenience, but **don’t merge the logic paths** (you want a clear boundary between pre and post to catch regressions).

# 4) Extra polish to look “senior”

* **Idempotency:** pre-apply must never mutate `df`. Post-apply should only read.
* **Deterministic artifacts:** include `section` + `run_ts` in every row; write separate `__pre_issues.csv` / `__post_issues.csv` for sampling.
* **Logging:** lightweight prints like `2.7.1)…`, counts, and artifact paths.
* **Schema stability:** always union columns on append (you already do).
* **Future-proof:** if this grows, graduate to **Great Expectations** or **Pandera** for declarative checks, but keep your writer for the unified CSV.

---

## Final recommendation

* For your portfolio and maintainability: **use Option 1** (small module with functions) and call it from the notebook.
* If you want to stay purely notebook-inline: your combined cell is **good**, just keep **2.7.1** and **2.7.2** distinct inside it and reuse **one writer + one normalization** block.

If you want, I can turn your current combined cell into the small `src/telco_quality/categorical.py` module (with exactly the same behavior and column schema) so you can import it immediately.


In [None]:
"""
Telco Customer Churn - Exploratory Data Analysis
Simple yet comprehensive EDA for understanding customer churn patterns
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style for better visualizations.
# 'seaborn-v0_8' only exists on matplotlib >= 3.6; older releases ship the same
# style sheet under the name 'seaborn', so fall back instead of failing on import.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

def load_data(data_path=None):
    """Load the Telco Customer Churn dataset.

    Args:
        data_path: Optional path (``str`` or ``Path``) to the CSV file.
            Defaults to ``data/raw/Telco-Customer-Churn.csv`` relative to the
            current working directory.

    Returns:
        The CSV contents as a DataFrame.
    """
    if data_path is None:
        data_path = "data/raw/Telco-Customer-Churn.csv"
    df = pd.read_csv(Path(data_path))
    return df

def basic_info(df):
    """Print an overview of ``df``: shape, column names, dtypes and the first rows."""
    rule = "=" * 50
    for header_line in (rule, "DATASET OVERVIEW", rule):
        print(header_line)

    column_names = df.columns.tolist()
    print(f"Shape: {df.shape}")
    print(f"Columns: {column_names}")

    print("\nData Types:")
    print(df.dtypes)

    print("\nFirst 5 rows:")
    print(df.head())
    
def check_missing_values(df):
    """Print a data-quality report: missing values, blank strings, duplicate rows.

    Blank strings are values in object columns that are empty or consist only
    of whitespace (spaces, tabs, ...); such values are not counted as missing
    by ``isnull`` and would otherwise go unnoticed.
    """
    print("\n" + "=" * 50)
    print("DATA QUALITY CHECK")
    print("=" * 50)

    # Missing values
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': missing_pct
    }).sort_values('Missing Count', ascending=False)

    print("Missing Values:")
    print(missing_df[missing_df['Missing Count'] > 0])

    # Check for empty or whitespace-only strings
    print("\nChecking for empty strings...")
    for col in df.select_dtypes(include=['object']).columns:
        # isinstance guard: object columns may hold non-string values.
        is_blank = df[col].map(lambda value: isinstance(value, str) and value.strip() == '')
        empty_count = int(is_blank.sum())
        if empty_count > 0:
            print(f"{col}: {empty_count} empty values")

    # Duplicates
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate rows: {duplicates}")

def analyze_target_variable(df):
    """Print the ``Churn`` class counts and percentages, then plot them.

    The figure shows a count plot next to a pie chart of the same distribution.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("TARGET VARIABLE ANALYSIS")
    print(rule)

    target = df['Churn']
    churn_counts = target.value_counts()
    churn_pct = target.value_counts(normalize=True) * 100

    print("Churn Distribution:")
    labels = churn_counts.index
    for position, val in enumerate(labels):
        count = churn_counts.values[position]
        pct = churn_pct.values[position]
        print(f"{val}: {count} ({pct:.1f}%)")

    # Visualization: counts on the left, shares on the right
    fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.countplot(data=df, x='Churn', ax=count_ax)
    count_ax.set_title('Churn Distribution')
    count_ax.set_ylabel('Count')

    pie_ax.pie(churn_counts.values, labels=labels, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('Churn Percentage')

    plt.tight_layout()
    plt.show()

def analyze_numerical_features(df):
    """Summarise and plot ``tenure``, ``MonthlyCharges`` and ``TotalCharges`` by churn.

    ``TotalCharges`` is coerced to numeric (unparseable values become NaN) on a
    derived frame, so the caller's DataFrame is left untouched.
    """
    print("\n" + "=" * 50)
    print("NUMERICAL FEATURES ANALYSIS")
    print("=" * 50)

    # Convert TotalCharges to numeric (it might be stored as string).
    # assign() returns a new frame instead of mutating the argument.
    df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

    numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

    print("Statistical Summary:")
    print(df[numerical_cols].describe())

    # Visualizations: one column per feature, histogram on top, box plot below
    fig, axes = plt.subplots(2, len(numerical_cols), figsize=(15, 10))

    for i, col in enumerate(numerical_cols):
        hist_ax, box_ax = axes[0, i], axes[1, i]

        # Distribution
        sns.histplot(data=df, x=col, hue='Churn', kde=True, ax=hist_ax)
        hist_ax.set_title(f'{col} Distribution by Churn')

        # Box plot
        sns.boxplot(data=df, x='Churn', y=col, ax=box_ax)
        box_ax.set_title(f'{col} by Churn Status')

    plt.tight_layout()
    plt.show()

def analyze_categorical_features(df):
    """Plot and print churn rates for a fixed set of key categorical features.

    For each feature a bar chart shows the row-normalised Churn split (%) per
    category; the percentage of ``'Yes'`` per category is then printed.
    """
    print("\n" + "=" * 50)
    print("CATEGORICAL FEATURES ANALYSIS")
    print("=" * 50)

    # Key categorical features for visualization
    key_features = ['Contract', 'PaymentMethod', 'InternetService', 'gender', 'SeniorCitizen']

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    # zip stops at the shorter sequence, so features never overrun the grid.
    for ax, col in zip(axes, key_features):
        # Create crosstab
        ct = pd.crosstab(df[col], df['Churn'], normalize='index') * 100
        ct.plot(kind='bar', ax=ax, rot=45)
        ax.set_title(f'Churn Rate by {col}')
        ax.set_ylabel('Churn Rate (%)')
        ax.legend(title='Churn')

    # Remove every unused subplot, not only the last one
    for unused_ax in axes[len(key_features):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

    # Print churn rates for key categories
    print("\nChurn Rates by Category:")
    for col in key_features:
        print(f"\n{col}:")
        churn_rate = df.groupby(col)['Churn'].apply(lambda x: (x == 'Yes').mean() * 100)
        for category, rate in churn_rate.items():
            print(f"  {category}: {rate:.1f}%")

def correlation_analysis(df):
    """Encode features numerically, plot their correlation matrix, and print
    each feature's absolute correlation with ``Churn`` (strongest first).

    Works on a copy of ``df``. Yes/No columns become 0/1, the other listed
    categorical columns become integer category codes, and ``TotalCharges``
    is coerced to numeric.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("CORRELATION ANALYSIS")
    print(rule)

    encoded = df.copy()

    # Yes/No columns -> 1/0
    yes_no_columns = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
    for name in yes_no_columns:
        is_yes = encoded[name] == 'Yes'
        encoded[name] = is_yes.astype(int)

    # SeniorCitizen needs no encoding (already 0/1).
    # Remaining categorical columns -> integer category codes
    multi_level_columns = ['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                           'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                           'StreamingMovies', 'Contract', 'PaymentMethod']
    for name in multi_level_columns:
        if name not in encoded.columns:
            continue
        encoded[name] = pd.Categorical(encoded[name]).codes

    # TotalCharges may be stored as text
    encoded['TotalCharges'] = pd.to_numeric(encoded['TotalCharges'], errors='coerce')

    # Correlate every numeric column except the identifier
    numeric_names = []
    for name in encoded.select_dtypes(include=[np.number]).columns:
        if name != 'customerID':
            numeric_names.append(name)
    corr_matrix = encoded[numeric_names].corr()

    # Heatmap of the full matrix
    plt.figure(figsize=(12, 10))
    heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                           square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Rank features by absolute correlation with the target
    ranked = corr_matrix['Churn'].abs().sort_values(ascending=False)
    print("\nFeatures most correlated with Churn:")
    for feature, strength in ranked.items():
        if feature == 'Churn':
            continue
        print(f"{feature}: {strength:.3f}")

def generate_insights(df):
    """Print headline churn insights derived from ``df``.

    Reports the overall churn rate, the contract type / payment method /
    internet service with the highest churn rate, and the average tenure of
    churned vs retained customers. The leading category of each feature is
    taken from the data rather than assumed. ``df`` is only read.
    """
    print("\n" + "=" * 60)
    print("KEY INSIGHTS")
    print("=" * 60)

    churned = df['Churn'] == 'Yes'

    def _highest_churn(column):
        # Churn rate (%) per category, and the category where it peaks.
        rates = churned.groupby(df[column]).mean() * 100
        if rates.empty:
            return None
        top = rates.idxmax()
        return top, rates[top]

    insights = []

    # Churn rate
    insights.append(f"Overall churn rate: {churned.mean() * 100:.1f}%")

    # Contract insights
    top_contract = _highest_churn('Contract')
    if top_contract is not None:
        insights.append(f"Highest churn contract type: {top_contract[0]} ({top_contract[1]:.1f}%)")

    # Tenure insights
    avg_tenure_churned = df.loc[churned, 'tenure'].mean()
    avg_tenure_retained = df.loc[df['Churn'] == 'No', 'tenure'].mean()
    insights.append(f"Average tenure - Churned: {avg_tenure_churned:.1f} months, Retained: {avg_tenure_retained:.1f} months")

    # Payment method insights
    top_payment = _highest_churn('PaymentMethod')
    if top_payment is not None:
        insights.append(f"Highest churn payment method: {top_payment[0]} ({top_payment[1]:.1f}%)")

    # Internet service insights
    top_internet = _highest_churn('InternetService')
    if top_internet is not None:
        insights.append(f"Highest churn internet service: {top_internet[0]} ({top_internet[1]:.1f}%)")

    for i, insight in enumerate(insights, 1):
        print(f"{i}. {insight}")

def main():
    """Run the complete EDA: load the dataset, then execute every analysis step in order."""
    print("Starting Telco Customer Churn EDA...")

    dataset = load_data()

    # Each step receives the same DataFrame; order matters for the printed report.
    analysis_steps = (
        basic_info,
        check_missing_values,
        analyze_target_variable,
        analyze_numerical_features,
        analyze_categorical_features,
        correlation_analysis,
        generate_insights,
    )
    for step in analysis_steps:
        step(dataset)

    closing_rule = "=" * 60
    print("\n" + closing_rule)
    print("EDA COMPLETED!")
    print(closing_rule)

if __name__ == "__main__":
    main()
