
# Telco Churn — Level 2 Monolithic Notebook (Structured EDA)

**Scope:** Level 2 only — systematic, business-focused exploration of a cleaned dataset.  
No modeling here (that begins at Level 4).

**What this notebook does (end-to-end Level 2):**
1. Load the IBM Telco Churn dataset
2. Validate the data (shape, types, missing, duplicates)
3. Apply business-aware fixes (e.g., `TotalCharges`)
4. Structured EDA on **all** columns
5. Simple feature engineering for useful segments
6. Export figures and a cleaned/enhanced CSV


## 0) Setup & Paths

In [None]:

# --- User-configurable paths ---
from pathlib import Path
import os, sys, textwrap, inspect
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# >>> Adjust this to your machine if needed
# PROJECT is the root folder; the input CSV and the figures folder are
# resolved relative to it.
PROJECT = Path("/Users/b/DATA/PROJECTS/Telco/L2")
DATA    = PROJECT / "data" / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
FIGS    = PROJECT / "figures"
# Create the figures folder (and any missing parents) up front so later
# savefig / to_csv calls that write into FIGS do not fail.
FIGS.mkdir(parents=True, exist_ok=True)

# Render every figure at 120 DPI.
plt.rcParams["figure.dpi"] = 120

# Echo the resolved locations so a wrong PROJECT path is visible immediately.
print("Project:", PROJECT)
print("Data:", DATA)
print("Figures:", FIGS)


In [None]:
## 1) Utilities (defined inline for monolithic workflow)
def ensure_dir(path):
    """Make sure `path` exists as a directory, creating missing parents too.

    Calling it on a directory that already exists is a no-op.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)

def memory_report(df: pd.DataFrame) -> str:
    """Return the DataFrame's deep memory footprint formatted as 'N.NN MB'.

    The total includes the index and the real size of object values
    (``deep=True``).
    """
    total_bytes = df.memory_usage(deep=True).sum()
    megabytes = total_bytes / (1024 * 1024)
    return "{:.2f} MB".format(megabytes)


## 2) Loading & Basic Cleanup

In [None]:
#
# Columns the loader casts to `category` when memory optimisation is enabled.
BINARY_YN = [
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "MultipleLines",
]

# Columns whose "No internet service" level is folded into plain "No".
SERVICE_WITH_NO_INTERNET = [
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

def strip_object_whitespace(df: pd.DataFrame) -> pd.DataFrame:
    """Strip leading/trailing whitespace from every object-dtype column.

    Non-null values are converted to ``str`` and stripped. Missing values are
    left untouched, so they are still reported by ``isna()`` instead of being
    turned into literal ``"nan"`` / ``"None"`` strings.

    The frame is modified in place and also returned for chaining.
    """
    for c in df.select_dtypes(include="object").columns:
        col = df[c]
        stripped = col.astype(str).str.strip()
        # Keep the original entry wherever it was missing; use the stripped
        # text everywhere else.
        df[c] = col.where(col.isna(), stripped)
    return df

def simplify_service_levels(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse 'No internet service' / 'No phone service' into plain 'No'.

    Each affected column first gets a one-time backup named ``<column>_raw``
    (an existing backup is never overwritten). The frame is modified in place
    and returned.
    """
    # (column, placeholder level) pairs, processed in this order.
    collapse_rules = [(col, "No internet service") for col in SERVICE_WITH_NO_INTERNET]
    collapse_rules.append(("MultipleLines", "No phone service"))

    for col, placeholder in collapse_rules:
        if col not in df.columns:
            continue
        backup = f"{col}_raw"
        if backup not in df.columns:
            df[backup] = df[col].copy()
        df[col] = df[col].replace({placeholder: "No"})
    return df

def load_telco_data(filepath: str, optimize_memory: bool = True) -> pd.DataFrame:
    """Read the Telco CSV and apply light cleanup and typing.

    Steps, in order: read the CSV, strip whitespace from object columns,
    coerce ``TotalCharges`` to numeric (unparseable entries become NaN), and
    simplify the service levels. When ``optimize_memory`` is true, the known
    categorical columns are additionally cast to ``category``,
    ``SeniorCitizen`` to an int8-backed category, and ``tenure`` /
    ``MonthlyCharges`` are coerced to numeric if they are not already.
    Finally a 0/1 ``ChurnFlag`` column is derived from ``Churn`` unless one is
    already present.
    """
    frame = strip_object_whitespace(pd.read_csv(filepath))
    if "TotalCharges" in frame.columns:
        frame["TotalCharges"] = pd.to_numeric(frame["TotalCharges"], errors="coerce")
    frame = simplify_service_levels(frame)

    if optimize_memory:
        categorical = BINARY_YN + ["gender", "Contract", "PaymentMethod", "InternetService", "Churn"]
        for col in categorical:
            if col in frame.columns:
                frame[col] = frame[col].astype("category")

        if "SeniorCitizen" in frame.columns:
            small_int = frame["SeniorCitizen"].astype("int8")
            frame["SeniorCitizen"] = small_int.astype("category")

        for col in ("tenure", "MonthlyCharges"):
            if col in frame.columns and not np.issubdtype(frame[col].dtype, np.number):
                frame[col] = pd.to_numeric(frame[col], errors="coerce")

    has_churn = "Churn" in frame.columns
    has_flag = "ChurnFlag" in frame.columns
    if has_churn and not has_flag:
        churned = frame["Churn"].astype(str) == "Yes"
        frame["ChurnFlag"] = churned.astype("int8")
    return frame


## 3) Validation & Dtype Probing

In [None]:

def validate_dataset(
    df: pd.DataFrame,
    expected_columns=None,
    unique_id_col: str | None = "customerID",
    show_examples_for_missing: int = 3,
) -> dict:
    """Print a human-readable validation report and return a dict summary.

    Args:
        df: Frame to inspect.
        expected_columns: Optional iterable of column names. When given, the
            report lists expected-but-absent and present-but-unexpected columns.
        unique_id_col: Column that should hold unique identifiers. Skipped
            when ``None`` or when the column is absent from ``df``.
        show_examples_for_missing: Maximum number of duplicated identifier
            values kept as examples when the uniqueness check fails.

    Returns:
        Dict with keys ``shape``, ``memory``, ``duplicates`` and ``missing``
        (Series of per-column null counts, only columns with nulls, largest
        first). ``missing_columns`` / ``unexpected_columns`` are added when
        ``expected_columns`` is given; ``id_uniqueness_ok`` (and
        ``duplicate_ids`` on failure) when the id column is checked.
    """
    report = {}
    report["shape"] = df.shape
    report["memory"] = memory_report(df)
    report["duplicates"] = int(df.duplicated().sum())
    nulls = df.isna().sum()
    report["missing"] = nulls[nulls > 0].sort_values(ascending=False)

    if expected_columns is not None:
        expected = set(expected_columns)
        actual = set(df.columns)
        report["missing_columns"] = sorted(expected - actual)
        report["unexpected_columns"] = sorted(actual - expected)

    if unique_id_col and unique_id_col in df.columns:
        ids = df[unique_id_col]
        report["id_uniqueness_ok"] = bool(ids.is_unique)
        if not report["id_uniqueness_ok"]:
            report["duplicate_ids"] = (
                ids[ids.duplicated()].head(show_examples_for_missing).tolist()
            )

    print("=== Data Validation Report ===")
    print(f"Shape: {report['shape']}")
    print(f"Memory: {report['memory']}")
    print(f"Duplicates: {report['duplicates']}")
    if not report["missing"].empty:
        print("\nMissing values:")
        print(report["missing"])
    if report.get("missing_columns"):
        print("\nMissing columns:", report["missing_columns"])
    if report.get("unexpected_columns"):
        print("Unexpected columns:", report["unexpected_columns"])
    if "id_uniqueness_ok" in report:
        # Label the line with the column actually checked rather than a
        # hard-coded "customerID"; the default output is unchanged.
        print(f"{unique_id_col} unique?:", report["id_uniqueness_ok"])
        if not report["id_uniqueness_ok"]:
            print("Example duplicate IDs:", report.get("duplicate_ids", []))

    return report

def investigate_object_columns(df: pd.DataFrame) -> None:
    """Report, per object column, whether it parses cleanly as numeric.

    Columns that fail the conversion are listed with their distinct-value
    count and up to ten sample values. Nothing is modified; output goes to
    stdout only.
    """
    print("=== Object Column Investigations ===")
    for name in df.select_dtypes(include="object").columns:
        series = df[name]
        try:
            pd.to_numeric(series)
        except Exception:
            distinct = series.nunique(dropna=True)
            examples = series.dropna().unique()[:10]
            preview = ", ".join(str(value) for value in examples)
            print(f"✗ {name}: stays as object; unique={distinct}; sample: {preview}")
        else:
            print(f"✓ {name}: convertible to numeric")


## 4) Business Logic Fixes

In [None]:

def apply_business_logic(df: pd.DataFrame) -> pd.DataFrame:
    """Backfill missing `TotalCharges` with tenure * MonthlyCharges.

    Works on a copy; the input frame is left untouched. A `charge_difference`
    audit column (recorded TotalCharges minus the implied total) is added
    before the backfill, so rows that were missing keep NaN there. If any of
    the three required columns is absent, the copy is returned unchanged.
    """
    out = df.copy()
    required = ("tenure", "MonthlyCharges", "TotalCharges")
    if not all(col in out.columns for col in required):
        return out

    implied_total = out["tenure"].astype(float) * out["MonthlyCharges"].astype(float)
    out["charge_difference"] = out["TotalCharges"] - implied_total

    missing_total = out["TotalCharges"].isna()
    out.loc[missing_total, "TotalCharges"] = implied_total[missing_total]
    return out


## 5) Feature Engineering (Level 2)

In [None]:

def create_customer_segments(df: pd.DataFrame) -> pd.DataFrame:
    """Add segment columns: CustomerValue, LifecycleStage and ServiceCount.

    Works on a copy; the input frame is not modified.

    * ``CustomerValue`` - Low/Medium/High tertiles of ``TotalCharges``
      (falls back to three equal-width bins when the quantile edges are not
      unique). Only added when ``TotalCharges`` has at least one valid value.
    * ``LifecycleStage`` - tenure bins (-0.1, 12], (12, 24], (24, 48],
      (48, 72], (72, inf) labelled New/Growing/Mature/Loyal/Veteran.
    * ``ServiceCount`` - number of subscribed services per row. Yes/No
      columns count when equal to "Yes". ``InternetService`` stores the
      connection type rather than "Yes", so it counts whenever the value is
      present and not "No".
    """
    df = df.copy()
    if "TotalCharges" in df.columns:
        df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
        if df["TotalCharges"].notna().any():
            try:
                df["CustomerValue"] = pd.qcut(df["TotalCharges"], q=3, labels=["Low","Medium","High"])
            except ValueError:
                # qcut rejects duplicate quantile edges; use equal-width bins.
                df["CustomerValue"] = pd.cut(df["TotalCharges"], bins=3, labels=["Low","Medium","High"])
    if "tenure" in df.columns:
        df["LifecycleStage"] = pd.cut(
            df["tenure"].astype(float),
            bins=[-0.1, 12, 24, 48, 72, np.inf],
            labels=["New","Growing","Mature","Loyal","Veteran"],
            include_lowest=True, right=True
        )
    service_cols = [
        "PhoneService","InternetService","OnlineSecurity","OnlineBackup",
        "DeviceProtection","TechSupport","StreamingTV","StreamingMovies","MultipleLines"
    ]
    service_count = pd.Series(0, index=df.index, dtype="int64")
    for c in service_cols:
        if c not in df.columns:
            continue
        values = df[c].astype(str)
        if c == "InternetService":
            # Levels are connection types (e.g. DSL / Fiber optic) or "No",
            # never "Yes"; a plain == "Yes" test would always contribute 0.
            subscribed = df[c].notna() & (values != "No")
        else:
            subscribed = values == "Yes"
        service_count = service_count + subscribed.astype(int)
    df["ServiceCount"] = service_count
    return df


## 6) EDA Plot Helpers

In [None]:

def analyze_categorical(df: pd.DataFrame, column: str, target: str = "Churn", figsize=(12,4), top_n: int | None = 12):
    """Plot a categorical column: distribution, churn proportion, quick stats.

    Args:
        df: Source frame.
        column: Column to analyse; it is cast to ``category`` for plotting.
        target: Column used for the stacked-proportion panel. If it is absent
            from ``df`` the middle panel shows a notice instead.
        figsize: Size passed to ``plt.subplots``.
        top_n: When set and the column has more distinct values than this,
            only the ``top_n`` most frequent levels are kept and the rest are
            pooled under ``"__OTHER__"``. Missing values stay missing so the
            "Missing" statistic remains accurate.

    Returns:
        The matplotlib Figure with three panels (counts, proportions, stats).
    """
    s = df[column].astype("category")
    if top_n and s.nunique() > top_n:
        top_levels = s.value_counts().nlargest(top_n).index
        # A categorical only accepts values from its category list, so the
        # pooling label has to be registered before it can be assigned.
        if "__OTHER__" not in s.cat.categories:
            s = s.cat.add_categories("__OTHER__")
        keep = s.isin(top_levels) | s.isna()
        s = s.where(keep, other="__OTHER__")
        # Drop the pooled levels so they do not appear as zero-height bars.
        s = s.cat.remove_unused_categories()

    fig, axes = plt.subplots(1, 3, figsize=figsize)

    s.value_counts().sort_values(ascending=False).plot(kind="bar", ax=axes[0])
    axes[0].set_title(f"{column} Distribution")
    axes[0].set_ylabel("Count")

    if target in df.columns:
        ct = pd.crosstab(s, df[target], normalize="index")
        ct.plot(kind="bar", stacked=True, ax=axes[1])
        axes[1].set_title(f"{column} vs {target} (Proportion)")
        axes[1].set_ylabel("Proportion")
        axes[1].legend(title=target, bbox_to_anchor=(1.02, 1), loc="upper left")
    else:
        axes[1].axis("off")
        axes[1].text(0.1, 0.5, f"Target '{target}' not found.", fontsize=11)

    axes[2].axis("off")
    modes = s.mode()
    mode_label = modes.iat[0] if not modes.empty else "—"
    stats_text = (
        f"Unique: {s.nunique()}\n"
        f"Mode: {mode_label}\n"
        f"Missing: {int(s.isna().sum())}"
    )
    axes[2].text(0.05, 0.6, stats_text, fontsize=12, family="monospace")
    fig.tight_layout()
    return fig

def analyze_numerical(df: pd.DataFrame, column: str, target: str = "Churn", bins: int = 30):
    """Plot a numeric column on a 2x2 grid and return the Figure.

    Panels: histogram (top-left), boxplot per target level (top-right),
    KDE per target level (bottom-left), ``describe()`` table (bottom-right).
    The column is coerced to numeric first; unparseable entries become NaN
    and are dropped from the plots. When ``target`` is not a column of
    ``df``, the two target-dependent panels are switched off.
    """
    values = pd.to_numeric(df[column], errors="coerce")
    has_target = target in df.columns

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    hist_ax, box_ax = axes[0, 0], axes[0, 1]
    kde_ax, stats_ax = axes[1, 0], axes[1, 1]

    # Overall distribution.
    values.dropna().hist(bins=bins, ax=hist_ax, edgecolor="black")
    hist_ax.set_title(f"{column} Distribution")

    # Boxplot split by target level.
    if has_target:
        levels = df[target].dropna().unique()
        samples = [values[df[target] == level].dropna().values for level in levels]
        if any(len(sample) > 0 for sample in samples):
            box_ax.boxplot(samples, labels=[str(level) for level in levels])
            box_ax.set_title(f"{column} by {target}")
        else:
            box_ax.axis("off")
            box_ax.text(0.1, 0.5, "No data for boxplot", fontsize=11)
    else:
        box_ax.axis("off")
        box_ax.text(0.1, 0.5, f"Target '{target}' not found.", fontsize=11)

    # Density per target level; a level is skipped when it has fewer than
    # three points or zero variance.
    kde_ax.set_title(f"{column} Density by {target}")
    if has_target:
        drew_curve = False
        for level in df[target].dropna().unique():
            subset = values[df[target] == level].dropna().to_numpy()
            if subset.size >= 3 and np.nanvar(subset) > 0:
                try:
                    pd.Series(subset).plot(kind="kde", ax=kde_ax, label=f"{target}={level}")
                    drew_curve = True
                except Exception:
                    # Best effort: a failing KDE for one level is ignored.
                    pass
        if drew_curve:
            kde_ax.legend(loc="best")
        else:
            kde_ax.text(0.05, 0.5, "KDE skipped (insufficient size/variance)", transform=kde_ax.transAxes, fontsize=9)
    else:
        kde_ax.axis("off")

    # Summary statistics rendered as monospace text.
    stats_ax.axis("off")
    summary = values.describe()
    stats_ax.text(0.05, 0.6, summary.to_string(), fontsize=10, family="monospace")

    fig.tight_layout()
    return fig


## 7) Complete EDA Runner (saves figures to the project's `figures/` directory)

In [None]:

def perform_complete_eda(
    df: pd.DataFrame,
    figures_dir: str | Path = FIGS,
    save_figures: bool = True,
    skip_cols=("customerID",),
    target: str = "Churn",
) -> pd.DataFrame:
    """Run the Level-2 EDA pass and return the enhanced frame.

    Order of work: validate the incoming frame, plot every categorical and
    numerical column (except those in ``skip_cols``), apply the business-logic
    fixes, add the engineered segments, then validate the result. Figures are
    written to ``figures_dir`` as ``categorical_<col>.png`` /
    ``numerical_<col>.png`` when ``save_figures`` is true, and every figure is
    closed after use.
    """
    ensure_dir(figures_dir)

    print("1) Validation (raw)")
    validate_dataset(df)

    def _columns_of(kinds):
        # Column names of the requested dtypes, minus any skipped ones.
        names = df.select_dtypes(include=kinds).columns.tolist()
        if skip_cols:
            names = [name for name in names if name not in skip_cols]
        return names

    stages = (
        ("\n2) Categorical analysis", "categorical", _columns_of(["object", "category"]), analyze_categorical),
        ("\n3) Numerical analysis", "numerical", _columns_of([np.number]), analyze_numerical),
    )
    for heading, prefix, columns, plotter in stages:
        print(heading)
        for name in columns:
            fig = plotter(df, name, target=target)
            if save_figures:
                fig.savefig(Path(figures_dir) / f"{prefix}_{name}.png", bbox_inches="tight")
            plt.close(fig)

    print("\n4) Business logic corrections")
    corrected = apply_business_logic(df)

    print("\n5) Feature engineering")
    enhanced = create_customer_segments(corrected)

    print("\n6) Validation (enhanced)")
    validate_dataset(enhanced)

    return enhanced


## 8) Execute Level-2 Pipeline

In [None]:

# Load & run
# Load the CSV with typing/cleanup applied, then print a validation report and
# an object-column probe before any plotting.
df = load_telco_data(str(DATA), optimize_memory=True)
_  = validate_dataset(df)
investigate_object_columns(df)

# Full EDA pass. perform_complete_eda validates its input again, so the raw
# validation report is printed a second time here.
df_enhanced = perform_complete_eda(
    df,
    figures_dir=FIGS,
    save_figures=True,
    target="Churn"
)

# Save enhanced dataset + quick numeric hist grid
# NOTE: the CSV is written into the figures directory (FIGS), next to the PNGs.
out_csv = FIGS / "telco_enhanced.csv"
df_enhanced.to_csv(out_csv, index=False)
print("Saved enhanced dataset to:", out_csv)

# One histogram per numeric column on a 4x4 grid (room for up to 16 columns).
# The returned axes array is not used afterwards.
axes = df_enhanced.select_dtypes("number").hist(figsize=(12, 12), layout=(4, 4))
plt.tight_layout()
plt.savefig(FIGS / "all_numeric_histograms.png", bbox_inches="tight")
plt.close()
print("Saved numeric hist grid to:", FIGS / "all_numeric_histograms.png")

# Quick business-focused summaries (examples)
# ChurnFlag is 0/1, so its mean is the churn rate; NaN is printed if the
# column is absent.
churn_rate = df_enhanced["ChurnFlag"].mean() if "ChurnFlag" in df_enhanced else float("nan")
print(f"Overall churn rate: {churn_rate:.3f}")
# Churn rate per contract type, highest first.
if {"ChurnFlag","Contract"}.issubset(df_enhanced.columns):
    by_contract = df_enhanced.groupby("Contract")["ChurnFlag"].mean().sort_values(ascending=False)
    print("\nChurn rate by Contract:"); print(by_contract)

# Churn rate per payment method, highest first.
if {"ChurnFlag","PaymentMethod"}.issubset(df_enhanced.columns):
    by_pay = df_enhanced.groupby("PaymentMethod")["ChurnFlag"].mean().sort_values(ascending=False)
    print("\nChurn rate by PaymentMethod:"); print(by_pay)
