In [None]:
def numeric_like_columns(df, protect=("customerID",), thresh=0.95):
    """Return the sorted names of object columns whose values mostly parse as numbers.

    Each candidate column is cast to string, stripped, cleaned of ``,``, ``$``
    and ``%``, and has accounting-style ``(123)`` rewritten to ``-123`` before
    parsing with ``pd.to_numeric(errors="coerce")``.

    Missing values are excluded before the ratio is computed, so a numeric
    column that merely contains nulls is still detected. Columns with no
    non-null values are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; only ``object`` dtype columns are considered.
    protect : sequence of str
        Column names that are never reported (identifiers, for example).
    thresh : float
        Minimum share of non-null values that must parse as numeric.

    Returns
    -------
    list of str
        Matching column names in sorted order.
    """
    cand = [c for c in df.select_dtypes(include="object").columns if c not in protect]
    out = []
    for c in cand:
        s = df[c].astype("string").str.strip()
        s = s.str.replace(r"[,$%]", "", regex=True).str.replace(r"\(([^)]+)\)", r"-\1", regex=True)
        # Nulls say nothing about whether the column is numeric; leaving them
        # in would drag the ratio down and hide sparse numeric columns.
        s = s.dropna()
        if s.empty:
            continue
        ratio = pd.to_numeric(s, errors="coerce").notna().mean()
        if ratio >= thresh:
            out.append(c)
    return sorted(out)


<details>
<summary> 2.0.2 schema consistency check </summary>


You don’t *have* to change it—the current check is correct.
If what you want is **richer diagnostics** (which values caused >2 uniques, sample counts, and optional normalization without mutating `df`), swap that block for this drop-in:

```python
# --------- Optional integrity checks (binary columns should be 2-unique) ----------
# richer diagnostics + safe normalization (temp only)
def _normalize_binary_series(s: pd.Series) -> pd.Series:
    """Collapse common yes/no spellings in a text column to ``"yes"`` / ``"no"``.

    Text values are stripped and lower-cased before lookup. Values that are not
    a recognised spelling keep their original (un-normalised) value. Non-text
    series are returned unchanged. The input series is not modified.
    """
    is_textual = s.dtype == "O" or pd.api.types.is_string_dtype(s)
    if not is_textual:
        # Numeric and boolean columns are passed through untouched.
        return s

    canonical = dict.fromkeys(("yes", "y", "1", "true"), "yes")
    canonical.update(dict.fromkeys(("no", "n", "0", "false"), "no"))

    tokens = s.astype("string").str.strip().str.lower()
    # Unrecognised tokens come back as missing from the lookup; fall back to
    # the raw input so the offending value stays visible in diagnostics.
    return tokens.map(canonical).fillna(s)

# Columns declared binary in the schema whose normalised, non-null values do
# not have exactly two distinct levels.
binary_not_two = []  # (column, n_unique) pairs
binary_details = []  # one dict per failing column, used by the report printer

for c in schema.get("binary", {}).get("columns", []):
    # Columns declared in the schema but absent from df are skipped here.
    if c not in df.columns:
        continue
    # Normalisation works on a temporary series; df itself is not modified.
    s = _normalize_binary_series(df[c])
    vals = s.dropna().value_counts()
    nunq = int(vals.size)

    if nunq != 2:
        binary_not_two.append((c, nunq))
        # capture a compact snapshot for the report (first five entries of the
        # value counts)
        binary_details.append({
            "column": c,
            "nunique": nunq,
            "top_values": vals.head(5).to_dict()
        })
```

Then, in your print/report section, you can show more helpful info:

```python
# Print the failing binary columns with the values that caused the failure,
# or a single success line when every declared binary column is 2-unique.
if binary_not_two:
    print(f"\n‚ùå Binary columns not 2-unique ({len(binary_not_two)}): {binary_not_two}")
    # One line per failing column showing the captured value counts.
    for d in binary_details:
        print(f"   ‚Ä¢ {d['column']}: top values ‚Üí {d['top_values']}")
else:
    print("‚úÖ All schema binary columns are 2-unique")
```

### What this improves

* **Explains why** a column isn’t binary (you see the offending values).
* **Handles common yes/no variants** temporarily (no mutation of `df`).
* Keeps your existing `binary_not_two` list so the rest of your pipeline doesn’t change.

</details>


<details>
<summary> 2.0.2 schema consistency check function </summary>

```python
from pathlib import Path
import pandas as pd
import yaml

def run_dataset_guard(
    df: pd.DataFrame | None = None,
    *,
    schema_path: Path = Path("config/feature_schema.yaml"),
    inline_schema: dict | None = None,
    auto_load_patterns: list[str] = ("**/telco_*.parquet", "**/telco_*.csv"),
    raise_on_critical: bool = True,
    section2_report_path: str | Path | None = None,
    verbose: bool = True,
) -> dict:
    """
    Notebook- & script-friendly dataset guard:
      - Auto-loads a dataset if df is None
      - Loads schema (YAML with robust fallback)
      - Derives/validates target
      - Checks schema consistency
      - Optionally appends to a unified CSV report
      - Returns a structured summary dict

    Returns
    -------
    dict
        {
          status, target, schema_version, rows, cols,
          missing_cols, unexpected_cols, binary_not_two,
          bad_numeric (if any), near_miss (if any)
        }
    """
    # -------------------- Settings & defaults --------------------
    if inline_schema is None:
        inline_schema = {
            "target": "Churn_flag",
            "binary": {
                "columns": ["gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling"]
            },
            "continuous": {
                "columns": ["tenure", "MonthlyCharges", "TotalCharges"]
            },
            "categorical": {
                "columns": [
                    "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
                    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
                    "Contract", "PaymentMethod"
                ]
            }
        }

    # -------------------- Load schema with fallback --------------------
    if schema_path.exists():
        try:
            schema = yaml.safe_load(schema_path.read_text(encoding="utf-8"))
            if verbose: print(f"üìò Loaded schema from {schema_path}")
        except Exception as e:
            if verbose: print(f"‚ö†Ô∏è Failed to parse schema ({e}); using inline fallback.")
            schema = inline_schema
    else:
        if verbose: print("üìò Using inline fallback schema (YAML not found)")
        schema = inline_schema

    schema_version = schema.get("version", "n/a")
    if verbose: print(f"Schema version: {schema_version}")

    target_name = schema.get("target", "Churn_flag")

    # -------------------- Load dataframe if needed --------------------
    files = []
    if df is None:
        root = Path.cwd()
        files = [p for pat in auto_load_patterns for p in root.glob(pat)]
        files = sorted(files, key=lambda p: p.stat().st_mtime, reverse=True)
        if verbose: print(f"üîé Candidate data files found: {len(files)}")

        if not files:
            raise FileNotFoundError("‚ùå No dataset found (looked for telco_*.parquet/csv).")

        latest = files[0]
        if verbose: print(f"üì¶ Auto-loading dataset: {latest}")

        ext = latest.suffix.lower()
        if ext == ".parquet":
            df = pd.read_parquet(latest)
        elif ext == ".csv":
            df = pd.read_csv(latest)
        else:
            raise ValueError(f"Unsupported format: {ext}")
    else:
        if verbose: print("‚úÖ Using dataset already in memory (df)")

    # Safe working copy & normalized columns
    df = df.copy()
    df.columns = df.columns.str.strip()

    # Duplicate column names (warn + dedupe)
    dupes = df.columns[df.columns.duplicated()].tolist()
    if dupes:
        if verbose: print(f"‚ö†Ô∏è Duplicate column names detected: {dupes}")
        df = df.loc[:, ~df.columns.duplicated()].copy()
        if verbose: print(f"‚ÑπÔ∏è Dropped {len(dupes)} duplicate column(s)")

    # -------------------- Target derive/validate --------------------
    if target_name not in df.columns:
        if "Churn" in df.columns:
            if verbose: print(f"‚öôÔ∏è Creating {target_name} from 'Churn' ‚Ä¶")
            churn_norm = (
                df["Churn"]
                .map({True: "yes", False: "no"})       # handle booleans first
                .astype("string").str.strip().str.lower()
                .map({
                    "yes": "yes", "y": "yes", "1": "yes", "true": "yes",
                    "no": "no", "n": "no", "0": "no", "false": "no"
                })
            )
            # Map to 0/1 (nullable for diagnostics)
            df[target_name] = churn_norm.map({"no": 0, "yes": 1}).astype("Int8")

            bad_mask = df[target_name].isna()
            if bad_mask.any():
                unmapped = (
                    df.loc[bad_mask, "Churn"]
                      .astype("string").str.strip().str.lower()
                      .value_counts().to_dict()
                )
                msg = f"‚ùå Could not map some 'Churn' values to 0/1. Unmapped: {unmapped}"
                if raise_on_critical:
                    raise ValueError(msg)
                else:
                    print(msg)

            # If fully mapped, enforce compact dtype
            if df[target_name].notna().all():
                df[target_name] = df[target_name].astype("int8")
                if verbose: print(f"‚úÖ Created {target_name} and verified dtype int8")
        else:
            msg = f"‚ùå Target '{target_name}' missing and no 'Churn' to derive from."
            if raise_on_critical:
                raise ValueError(msg)
            else:
                print(msg)

    # Binary integrity
    if target_name in df.columns:
        u = sorted(df[target_name].dropna().unique().tolist())
        if u != [0, 1]:
            msg = f"‚ùå Target '{target_name}' must be binary 0/1. Found unique={u}"
            if raise_on_critical: 
                raise ValueError(msg)
            else:
                print(msg)
        else:
            if verbose: print(f"‚úÖ Target '{target_name}' verified binary (0/1)")

        # Fail fast if entirely NA
        if df[target_name].isna().all():
            msg = f"‚ùå Target '{target_name}' is entirely NA after mapping."
            if raise_on_critical: 
                raise ValueError(msg)
            else:
                print(msg)

    # -------------------- Schema consistency --------------------
    groups = [g for g in ("binary", "continuous", "categorical") if g in schema]
    expected = pd.Index([target_name])
    for g in groups:
        expected = expected.union(pd.Index(schema[g].get("columns", [])))

    cols = pd.Index(df.columns)
    missing_cols = expected.difference(cols).tolist()
    unexpected_cols = cols.difference(expected).tolist()

    # Name-similar warning for unexpected cols
    expected_lower = set(map(str.lower, expected.tolist()))
    near_miss = [c for c in unexpected_cols if c.lower().strip() in expected_lower]
    if near_miss and verbose:
        print(f"‚ÑπÔ∏è Unexpected but name-similar columns (check casing/whitespace): {near_miss}")

    # Optional integrity: declared binary columns should be 2-unique
    binary_not_two = []
    for c in schema.get("binary", {}).get("columns", []):
        if c in df.columns:
            nunq = int(df[c].dropna().nunique())
            if nunq != 2:
                binary_not_two.append((c, nunq))

    # Optional: coerce continuous to numeric and report new NaNs
    coerced = []
    bad_numeric = {}
    for c in schema.get("continuous", {}).get("columns", []):
        if c in df.columns:
            before_nulls = df[c].isna().sum()
            df[c] = pd.to_numeric(df[c], errors="coerce")
            after_nulls = df[c].isna().sum()
            if after_nulls > before_nulls:
                bad_numeric[c] = int(after_nulls - before_nulls)
            coerced.append(c)
    if bad_numeric and verbose:
        print(f"‚ö†Ô∏è Continuous columns coerced to numeric with new NaNs: {bad_numeric}")

    # -------------------- Print summary --------------------
    if verbose:
        print("\nüìã Schema Consistency Check")
        print(f"Expected columns: {len(expected)} | Found: {len(cols)}")
        print(f"‚ùå Missing columns: {missing_cols}" if missing_cols else "‚úÖ No missing columns")
        if unexpected_cols:
            N = 20
            head = unexpected_cols[:N]
            tail = unexpected_cols[-N:] if len(unexpected_cols) > N else []
            print(f"\n‚ö†Ô∏è Unexpected columns ({len(unexpected_cols)}):")
            for c in head: print(f"  - {c}")
            if tail and tail != head:
                print("  ...")
                for c in tail: print(f"  - {c}")
        else:
            print("‚úÖ No unexpected columns")
        print(f"‚ùå Binary columns not 2-unique: {binary_not_two}" if binary_not_two else "‚úÖ All binary columns are 2-unique")

    # Status & raising
    critical = bool(missing_cols or binary_not_two)
    status = "FAIL" if critical else ("WARN" if unexpected_cols else "OK")
    if verbose: print(f"\nStatus: {status}")

    if critical and raise_on_critical:
        raise ValueError("Schema validation failed (critical issues above).")

    # -------------------- Atomic append to report (optional) --------------------
    if section2_report_path is not None:
        report_path = Path(section2_report_path)
        report_path.parent.mkdir(parents=True, exist_ok=True)

        if target_name in df.columns:
            counts = df[target_name].value_counts(dropna=False)
            ratio = float(counts.get(1, 0)) / float(counts.sum()) if counts.sum() else 0.0
            balance_str = f"{counts.to_dict()} | churn_rate={ratio:.4f}"
        else:
            balance_str = "target_missing"

        schema_chunk = pd.DataFrame([
            {"section":"0.1_schema_check","rule":"schema_version","value": schema_version},
            {"section":"0.1_schema_check","rule":"missing_cols","value":";".join(missing_cols) or "none"},
            {"section":"0.1_schema_check","rule":"unexpected_cols","value":";".join(unexpected_cols) or "none"},
            {"section":"0.1_schema_check","rule":"binary_not_two","value":";".join(f"{c}:{n}" for c,n in binary_not_two) or "none"},
            {"section":"0.1_schema_check","rule":"class_balance","value": balance_str},
            {"section":"0.1_schema_check","rule":"status","value": status}
        ])

        write_header = not report_path.exists()
        schema_chunk.to_csv(report_path, mode="a", header=write_header, index=False, encoding="utf-8")
        if verbose: print(f"üìù Appended schema check to: {report_path}")

    # -------------------- Summary dict (return) --------------------
    n_rows, n_cols = int(df.shape[0]), int(df.shape[1])
    summary = {
        "status": status,
        "target": target_name,
        "schema_version": schema_version,
        "rows": n_rows,
        "cols": n_cols,
        "missing_cols": missing_cols,
        "unexpected_cols": unexpected_cols,
        "binary_not_two": binary_not_two,
    }
    if bad_numeric:
        summary["bad_numeric"] = bad_numeric
    if near_miss:
        summary["near_miss"] = near_miss
    if files:
        summary["auto_loaded_from"] = str(files[0])

    return summary
```


<details>
<summary> tidy, minimal refactor of your Section-2 work into **3 small, reusable functions** </summary>

<!-- Awesome — here’s a tidy, minimal refactor of your Section-2 work into **3 small, reusable functions** (+ one tiny helper). They’re pure (no globals), notebook-friendly, and write nothing unless you call the append helper. -->

---

# `src/quality/section2.py`

```python
# src/quality/section2.py
from __future__ import annotations
from pathlib import Path
from typing import Iterable, List, Optional, Sequence
import os
import pandas as pd
import numpy as np

# ----------------------------
# 0) Small helper: atomic append
# ----------------------------
def atomic_append_csv(path: Path, chunk: pd.DataFrame) -> Path:
    """
    Append `chunk` to the CSV at `path`, aligning columns, and return the path.

    The combined table is written to a sibling ``<name>.tmp`` file and then
    moved over `path` with ``os.replace``, so readers never see a partially
    written report. Parent directories are created as needed.

    Column order is stable across appends: the existing file's columns come
    first, followed by any columns that are new in `chunk`. An existing but
    empty file is treated as if it did not exist. The temporary file is
    removed if writing fails.
    """
    path = Path(path)
    tmp = path.with_suffix(path.suffix + ".tmp")
    path.parent.mkdir(parents=True, exist_ok=True)

    existing = None
    if path.exists():
        try:
            existing = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # Zero-byte file (e.g. created by `touch`): nothing to merge with.
            existing = None

    if existing is None:
        out = chunk
    else:
        # sort=False keeps the header order of the existing file instead of
        # re-sorting the columns alphabetically on every append.
        all_cols = pd.Index(existing.columns).union(chunk.columns, sort=False)
        out = pd.concat(
            [existing.reindex(columns=all_cols), chunk.reindex(columns=all_cols)],
            ignore_index=True
        )

    try:
        out.to_csv(tmp, index=False)
        os.replace(tmp, path)
    finally:
        # After a successful replace the temp file is already gone; this only
        # cleans up after a failed write.
        tmp.unlink(missing_ok=True)
    return path


# ------------------------------------------
# 1) Numeric-like object detector (no I/O)
# ------------------------------------------
def numeric_like_columns(
    df: pd.DataFrame,
    protect: Sequence[str] = ("customerID",),
    thresh: float = 0.95,
) -> List[str]:
    """
    List the object-dtype columns that are numeric in disguise.

    A column qualifies when at least `thresh` of its non-null values parse as
    numbers after trimming, removing ``,``/``$``/``%`` and turning
    accounting-style ``(123)`` into ``-123``. Columns named in `protect` and
    columns with no non-null values are never returned. The result is sorted.
    """
    hits = set()

    for name in df.select_dtypes(include="object").columns:
        if name in protect:
            continue

        text = df[name].astype("string").str.strip()
        text = text.str.replace(r"[,$%]", "", regex=True)
        text = text.str.replace(r"\(([^)]+)\)", r"-\1", regex=True)  # (123) -> -123

        present = text.dropna()
        if present.empty:
            continue

        parsed = pd.to_numeric(present, errors="coerce")
        if parsed.notna().mean() >= thresh:
            hits.add(name)

    return sorted(hits)


# -------------------------------------------------
# 2) Missing / Null / Blank scan (report, no I/O)
# -------------------------------------------------
def missing_blank_report(
    df: pd.DataFrame,
    id_cols: Iterable[str] = ("customerID",),
    extra_blank_scan_cols: Optional[Iterable[str]] = None,
    section_tag: str = "2.1_missing_null_blank",
    run_ts: Optional[str] = None,
) -> pd.DataFrame:
    """
    Build a per-column report of missing and blank values.

    Columns of the result:
      - nulls: count of missing values
      - empty_strings: values equal to ""
      - whitespace_only: values that contain only whitespace (at least one
        character); empty strings are NOT counted here, so the three counts
        are mutually exclusive
      - total_issues: sum of the three counts
      - pct_missing / pct_blank: nulls resp. empty + whitespace-only as a
        percentage of the row count, rounded to 2 decimals
      - section, rule, run_ts: report metadata

    Blank scanning covers object/category columns plus any
    `extra_blank_scan_cols` present in `df`, minus `id_cols`. Names in
    `extra_blank_scan_cols` that are not columns of `df` are ignored.
    `run_ts` defaults to the current time. Rows are sorted by `total_issues`,
    descending.
    """
    N = len(df)
    run_ts = run_ts or pd.Timestamp.now().isoformat(timespec="seconds")

    nulls = df.isna().sum()

    empty = pd.Series(0, index=df.columns, dtype="int64")
    spaces = pd.Series(0, index=df.columns, dtype="int64")

    # string-like columns to scan for blanks/whitespace
    base_scan = set(df.select_dtypes(include=["object", "category"]).columns)
    if extra_blank_scan_cols is not None:
        # Ignore names that are not in df instead of failing on lookup.
        base_scan |= {c for c in extra_blank_scan_cols if c in df.columns}
    scan_cols = sorted(base_scan.difference(set(id_cols)))

    for c in scan_cols:
        s = df[c].astype("string")
        is_empty = s.eq("").fillna(False)
        is_blank = s.str.strip().eq("").fillna(False)
        empty[c] = int(is_empty.sum())
        # Stripping also turns "" into ""; exclude those so an empty string
        # is not counted a second time as whitespace-only.
        spaces[c] = int((is_blank & ~is_empty).sum())

    rep = (
        pd.DataFrame({"nulls": nulls, "empty_strings": empty, "whitespace_only": spaces})
          .assign(
              total_issues=lambda x: x[["nulls","empty_strings","whitespace_only"]].sum(axis=1),
              pct_missing=lambda x: (x["nulls"] / max(1, N) * 100).round(2),
              pct_blank=lambda x: ((x["empty_strings"] + x["whitespace_only"]) / max(1, N) * 100).round(2),
              section=section_tag,
              rule="missing_null_blank",
              run_ts=run_ts,
          )
          .reset_index(names="column")
          .sort_values("total_issues", ascending=False)
    )
    return rep


# -------------------------------------------------------
# 3) Constant / Nearly-Constant scan (report, no I/O)
# -------------------------------------------------------
def low_variance_report(
    df: pd.DataFrame,
    id_cols: Iterable[str] = ("customerID",),
    nearly_const_thresh: float = 0.98,
    section_tag: str = "2.2_constant_lowvariance",
    run_ts: Optional[str] = None,
) -> pd.DataFrame:
    """
    Report columns that carry little or no information.

    Every column not listed in `id_cols` is classified from its value counts
    (missing values count as a level):
      - "all_null": every value is missing
      - "constant": exactly one distinct level
      - "nearly_constant": the most frequent level covers at least
        `nearly_const_thresh` of the rows
    Other columns are omitted. `id_cols` may be any iterable, including a
    one-shot generator.

    The returned frame always has the same columns (including NA placeholders
    for the missing/blank report fields) so it can be appended to the same CSV;
    it is empty when nothing is flagged. `top_freq` is rounded to 4 decimals.
    `run_ts` defaults to the current time.
    """
    run_ts = run_ts or pd.Timestamp.now().isoformat(timespec="seconds")
    n_rows = len(df)
    report_cols = [
        "column","dtype","rule","section","run_ts","n_rows","threshold",
        "unique_count","top_value","top_count","top_freq",
        "nulls","empty_strings","whitespace_only","total_issues","pct_missing","pct_blank",
    ]

    # Materialise once: rebuilding the set per column would exhaust a
    # generator after the first column and stop excluding the remaining ids.
    excluded = set(id_cols)
    rows = []

    for c in df.columns:
        if c in excluded:
            continue
        col = df[c]
        vc = col.value_counts(dropna=False)
        if vc.empty:
            # Only happens for a frame without rows.
            continue
        n_unique = int(vc.size)
        top_val = vc.index[0]
        top_count = int(vc.iloc[0])
        top_freq = top_count / n_rows  # n_rows >= 1 because vc is non-empty

        if col.isna().all():
            rule = "all_null"
        elif n_unique == 1:
            rule = "constant"
        elif top_freq >= nearly_const_thresh:
            rule = "nearly_constant"
        else:
            continue

        rows.append({
            "column": c,
            "dtype": str(col.dtype),
            "rule": rule,
            "unique_count": n_unique,
            "top_value": top_val,
            "top_count": top_count,
            "top_freq": top_freq,
        })

    base = pd.DataFrame(rows)
    if base.empty:
        # return an aligned-but-empty frame (helps with appends)
        return pd.DataFrame(columns=report_cols)

    out = base.assign(
        section=section_tag,
        run_ts=run_ts,
        n_rows=n_rows,
        threshold=nearly_const_thresh,
        # placeholders to align with missing/blank schema
        nulls=pd.NA, empty_strings=pd.NA, whitespace_only=pd.NA,
        total_issues=pd.NA, pct_missing=pd.NA, pct_blank=pd.NA,
    )[report_cols]
    out["top_freq"] = pd.to_numeric(out["top_freq"], errors="coerce").round(4)
    return out
```

---

# how to use in your notebook

```python
# In a cell near the top (after Section 1 paths):
from pathlib import Path
from datetime import datetime
import pandas as pd
from quality.section2 import (
    numeric_like_columns, missing_blank_report, low_variance_report, atomic_append_csv
)

# One unified Section-2 CSV per run; the timestamp in the file name is taken
# when this cell executes. REPORTS is expected to be defined by an earlier
# cell (presumably a pathlib.Path to the reports directory - confirm).
SECTION2_REPORT_PATH = REPORTS / f"section2_data_quality_{datetime.now():%Y%m%d_%H%M%S}.csv"

# --- 2.0.2 Numeric-like probe (for blanks scan & later coercion)
possible_numeric = numeric_like_columns(df, protect=("customerID",), thresh=0.95)
print("üîé Numeric-like object columns:", possible_numeric)

# --- 2.1 Missing / Null / Blank
rep_21 = missing_blank_report(
    df,
    id_cols=("customerID",),
    extra_blank_scan_cols=possible_numeric,   # ensure e.g. TotalCharges gets scanned
    section_tag="2.1_missing_null_blank",
)
# The missing/blank report is always appended, even when no issues are found.
atomic_append_csv(SECTION2_REPORT_PATH, rep_21)
# `display` is the notebook display helper; show only columns with issues.
display(rep_21.query("total_issues > 0").head(20))

# --- 2.2 Constant / Nearly-Constant
rep_22 = low_variance_report(
    df,
    id_cols=("customerID",),
    nearly_const_thresh=0.98,
    section_tag="2.2_constant_lowvariance",
)
# The low-variance report is appended only when something was flagged.
if not rep_22.empty:
    atomic_append_csv(SECTION2_REPORT_PATH, rep_22)
    display(rep_22.sort_values(["rule","top_freq"], ascending=[True, False]).head(20))
else:
    print("‚úÖ No constant / nearly-constant columns")

print(f"üßæ Unified report: {SECTION2_REPORT_PATH}")
```

---

## why this helps (and keeps hiring managers happy)

* **tiny, purpose-built functions** w/ docstrings ✅
* **pure + testable** (no hidden globals, return DataFrames) ✅
* **consistent unified schema** across steps ✅
* **atomic append helper** you can reuse everywhere ✅

If you want, I can add a fourth optional function later for **schema guard** (taking a Python `schema` dict or YAML) that returns a compact “guard report” chunk you can also append with the same helper.


<details>
<summary> 2.0.1A 🧩🔒 Dataset Guard drop-in upgrade that adds a `problems = []` collector and captures **row samples** for each failing rule. </summary>

# 2.0.1A 🧩🔒 Dataset Guard (Auto-Load + Target + Schema Consistency Check / Guard report)

Awesome idea. Here’s a **drop-in upgrade** that adds a `problems = []` collector and captures **row samples** for each failing rule. I kept it minimal and notebook-friendly.

Paste the block below into your validator (it’s self-contained). It assumes you’re using the latest version we just cleaned; you can replace the corresponding sections, or paste and tweak where noted.

```python
# =========================
# NEW: problems[] collector
# =========================
from typing import List, Dict, Any

# Module-level list that _add_problem appends to; one dict per finding.
problems: List[Dict[str, Any]] = []

def _sample_rows(df, mask=None, cols=None, n=5) -> List[Dict[str, Any]]:
    """
    Return at most n rows selected by mask as a list of dicts.

    Best-effort helper for diagnostics: an absent mask, an empty selection,
    or any error during selection yields an empty list instead of raising.
    """
    if mask is None:
        return []
    try:
        if cols is None:
            picked = df.loc[mask]
        else:
            picked = df.loc[mask, cols]
        return [] if picked.empty else picked.head(n).to_dict(orient="records")
    except Exception:
        # Sampling must never break the validation run itself.
        return []

def _add_problem(rule: str,
                 severity: str,
                 message: str,
                 affected_cols: List[str] = None,
                 sample_rows: List[Dict[str, Any]] = None,
                 extras: Dict[str, Any] = None):
    """Append one finding to `problems`; omitted optional fields become empty containers."""
    entry = {
        "rule": rule,
        "severity": severity,                  # "CRITICAL" | "WARN" | "INFO"
        "message": message,
        "affected_cols": affected_cols or [],
        "sample_rows": sample_rows or [],
        "extras": extras or {},
    }
    problems.append(entry)

# =========================
# Hook 1: Duplicate columns
# =========================
# Record duplicated column names as a WARN problem, then keep only the first
# occurrence of each name. Note that this rebinds `df` to a de-duplicated copy.
dupes = df.columns[df.columns.duplicated()].tolist()
if dupes:
    # Log the audit entry before dropping, so the names are not lost.
    _add_problem(
        rule="duplicate_columns",
        severity="WARN",
        message=f"Duplicate column names detected: {dupes}",
        affected_cols=dupes
    )
    df = df.loc[:, ~df.columns.duplicated()].copy()
    print(f"‚ÑπÔ∏è Dropped {len(dupes)} duplicate column(s)")

# ==================================================
# Hook 2: Churn -> TARGET mapping (unmapped samples)
# ==================================================
# Template fragment: it is meant to sit *after* the code that derives
# df[TARGET_NAME] from 'Churn'.
# NOTE(review): as written, the branch below is entered only when TARGET_NAME
# is not yet a column, so df[TARGET_NAME] raises KeyError unless the mapping
# logic is inserted where the placeholder comments are - confirm when merging.
unmapped = None           # value counts of the 'Churn' values that failed to map
unmapped_mask = None      # boolean mask of the rows whose target is NA

if TARGET_NAME not in df.columns and "Churn" in df.columns:
    # after your churn mapping logic...
    # ... you already set df[TARGET_NAME] = Int8 with possible NAs
    unmapped_mask = df[TARGET_NAME].isna()
    if unmapped_mask.any():
        # Collect distribution of the *original* Churn values that failed
        # (stripped and lower-cased; missing values are not counted)
        unmapped = (
            df.loc[unmapped_mask, "Churn"]
              .astype("string").str.strip().str.lower()
              .value_counts().to_dict()
        )
        # Include the ID column in the samples when it exists
        sample_cols = ["customerID", "Churn"] if "customerID" in df.columns else ["Churn"]
        samples = _sample_rows(df, unmapped_mask, cols=sample_cols, n=5)
        _add_problem(
            rule="target_mapping_unmapped",
            severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
            message=f"Could not map some 'Churn' values to 0/1.",
            affected_cols=[TARGET_NAME],
            sample_rows=samples,
            extras={"unmapped_value_counts": unmapped}
        )

# =======================================================
# Hook 3: Target not strictly binary (0/1) — show sample
# =======================================================
# Logs a problem when the non-null target values are not exactly {0, 1}.
if TARGET_NAME in df.columns:
    u = sorted(df[TARGET_NAME].dropna().unique().tolist())
    if u != [0, 1]:
        # sample rows where values are not in {0,1}; when the target has only
        # one of the two classes this mask is empty and no samples are attached
        bad_mask = df[TARGET_NAME].notna() & ~df[TARGET_NAME].isin([0, 1])
        samples = _sample_rows(
            df, bad_mask,
            cols=["customerID", TARGET_NAME] if "customerID" in df.columns else [TARGET_NAME],
            n=5
        )
        _add_problem(
            rule="target_not_binary",
            severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
            message=f"Target '{TARGET_NAME}' must be binary 0/1. Found unique={u}",
            affected_cols=[TARGET_NAME],
            sample_rows=samples,
            extras={"unique_values": u}
        )

# ==============================================
# Hook 4: Target is entirely NA — no row samples
# ==============================================
# Every row would qualify as a sample, so none are attached. An all-NA target
# also fails the binary check in Hook 3, so both problems are logged.
if TARGET_NAME in df.columns and df[TARGET_NAME].isna().all():
    _add_problem(
        rule="target_all_na",
        severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
        message=f"Target '{TARGET_NAME}' is entirely NA after mapping.",
        affected_cols=[TARGET_NAME],
    )

# ===========================================
# Hook 5: Missing columns — suggest near-miss
# ===========================================
# `missing_cols` is expected to come from the schema consistency step.
if missing_cols:
    # Index the existing columns by lower-cased, trimmed name so a missing
    # column can be matched to a case/whitespace variant that is present.
    # If several existing columns share a key, the last one wins.
    lower_cols = {c.lower().strip(): c for c in df.columns}
    suggestions = {}  # missing name -> existing look-alike column
    for m in missing_cols:
        key = m.lower().strip()
        if key in lower_cols:
            suggestions[m] = lower_cols[key]
    _add_problem(
        rule="missing_columns",
        severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
        message=f"Missing expected columns: {missing_cols}",
        affected_cols=missing_cols,
        extras={"near_miss_suggestions": suggestions}
    )

# ===========================================
# Hook 6: Unexpected columns — sample a few
# ===========================================
# `unexpected_cols` is expected to come from the schema consistency step.
if unexpected_cols:
    # Show the first three rows of (at most) the first six unexpected columns.
    keep = unexpected_cols[:6]
    # slice(None) selects every row via .loc; comparing the index with itself
    # would silently drop rows whose index label is NaN.
    samples = _sample_rows(df, mask=slice(None), cols=keep, n=3)
    _add_problem(
        rule="unexpected_columns",
        severity="WARN",
        message=f"Found {len(unexpected_cols)} unexpected columns.",
        affected_cols=unexpected_cols,
        sample_rows=samples
    )

# ============================================================
# Hook 7: Binary columns not exactly 2-unique — show examples
# ============================================================
# `binary_not_two` holds (column, n_unique) pairs from the schema check.
# One problem is logged per failing column.
for col, nunq in (binary_not_two or []):
    # Value counts over the stripped, lower-cased values (missing values
    # included), so case/whitespace variants are merged in this view.
    counts = (
        df[col].astype("string").str.strip().str.lower()
          .value_counts(dropna=False).to_dict()
    ) if col in df.columns else {}
    # Sample up to five rows with a non-null raw value (plus the ID column
    # when present). If the column is no longer in df, the mask is None and
    # no samples are taken.
    samples = _sample_rows(
        df, mask=df[col].notna() if col in df.columns else None,
        cols=["customerID", col] if "customerID" in df.columns and col in df.columns else ([col] if col in df.columns else None),
        n=5
    )
    _add_problem(
        rule="binary_not_two_unique",
        severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
        message=f"Binary column '{col}' has {nunq} unique values (expected 2).",
        affected_cols=[col],
        sample_rows=samples,
        extras={"value_counts": counts}
    )

# =========================================================================
# Hook 8: Continuous coercion created NaNs — capture which rows became NaN
# =========================================================================
# Replaces the plain coercion loop: the original values are kept next to the
# coerced ones so the rows that turned into NaN can be sampled.
# Note that df[c] is overwritten with the coerced numeric series.
bad_numeric = {}        # col -> number of values that became NaN
coercion_samples = {}   # col -> sample rows that became NaN after coercion

for c in schema.get("continuous", {}).get("columns", []):
    if c in df.columns:
        s_before = df[c].copy()
        before_nulls = s_before.isna().sum()

        # attempt coercion; unparsable values become NaN
        s_after = pd.to_numeric(s_before, errors="coerce")
        df[c] = s_after

        after_nulls = s_after.isna().sum()
        if after_nulls > before_nulls:
            # Rows that were non-null before but became NaN after coercion
            new_nan_mask = s_before.notna() & s_after.isna()
            bad_numeric[c] = int(after_nulls - before_nulls)
            # Sample the *original* values via the temporary `_before` column,
            # since df[c] already holds the coerced (NaN) values.
            sample_cols = ["customerID", "_before"] if "customerID" in df.columns else ["_before"]
            coercion_samples[c] = _sample_rows(
                df.assign(_before=s_before), new_nan_mask, cols=sample_cols, n=5
            )

if bad_numeric:
    print(f"‚ö†Ô∏è Continuous columns coerced to numeric with new NaNs: {bad_numeric}")
    # Log one consolidated problem with per-column samples
    _add_problem(
        rule="continuous_coercion_new_nans",
        severity="WARN",
        message="Numeric coercion introduced NaNs in continuous columns.",
        affected_cols=list(bad_numeric.keys()),
        sample_rows=[],  # keep consolidated samples in extras
        extras={"new_nan_counts": bad_numeric, "samples_per_column": coercion_samples}
    )

# ===========================
# (Optional) Near-miss helper
# ===========================
# `near_miss` may not exist if the schema check that defines it was not run,
# hence the existence test.
# NOTE(review): at notebook top level locals() is the module namespace; inside
# a function this test would only see that function's locals - confirm usage.
if 'near_miss' in locals() and near_miss:
    _add_problem(
        rule="name_similarity_warnings",
        severity="INFO",
        message="Columns that are unexpected but look like case/whitespace variants.",
        affected_cols=near_miss
    )

# ================================================
# Final: pretty-print problems & (optionally) save
# ================================================
# Console summary: one numbered entry per collected problem.
print(f"\nüßæ Problems collected: {len(problems)}")
for i, p in enumerate(problems, 1):
    print(f"\n[{i}] {p['severity']} ‚Äî {p['rule']}")
    print(f"    {p['message']}")
    if p.get("affected_cols"):
        # Show at most eight column names, with a marker when truncated.
        print(f"    Affected cols: {p['affected_cols'][:8]}{' ‚Ä¶' if len(p['affected_cols'])>8 else ''}")
    # print a tiny sample if present
    if p.get("sample_rows"):
        print(f"    Samples (up to 5 rows):")
        for r in p["sample_rows"][:5]:
            print(f"      - {r}")
    if p.get("extras"):
        # avoid dumping huge dicts — show keys only
        keys = list(p["extras"].keys())
        print(f"    Extras keys: {keys}")

# Persist the full problems payload next to the Section-2 CSV, when that path
# has been defined by an earlier cell.
def _json_safe(obj):
    """Return obj with dict keys made JSON-compatible and tuples turned into lists.

    json.dumps rejects dict keys that are not str/int/float/bool/None (for
    example a missing-value marker used as a key in a value-counts dict), and
    its `default` hook is never consulted for keys, so they are converted here.
    """
    if isinstance(obj, dict):
        return {
            (k if k is None or isinstance(k, (str, int, float, bool)) else str(k)): _json_safe(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [_json_safe(v) for v in obj]
    return obj

if "SECTION2_REPORT_PATH" in globals():
    try:
        import json
        report_path = Path(SECTION2_REPORT_PATH)
        # Path.stem already excludes the final suffix; build the sidecar name
        # from it directly.
        issues_json = report_path.with_name(f"{report_path.stem}_issues.json")
        # default=str is deliberate: sampled rows can hold pandas/NumPy scalars
        # and timestamps, and a readable string beats losing the whole dump.
        issues_json.write_text(
            json.dumps(_json_safe(problems), ensure_ascii=False, indent=2, default=str),
            encoding="utf-8",
        )
        print(f"üíæ Wrote detailed issues JSON to: {issues_json}")
    except Exception as e:
        # Best effort: failing to write diagnostics must not abort the run.
        print(f"‚ö†Ô∏è Failed to write issues JSON: {e}")
```

### What this captures (with samples)

* `duplicate_columns` — list of dupes before dropping.
* `target_mapping_unmapped` — original `Churn` values that failed, with sample rows.
* `target_not_binary` — rows where target isn’t 0/1.
* `target_all_na` — whole target NA (no samples).
* `missing_columns` — includes near-miss suggestions (case/space).
* `unexpected_columns` — quick 3-row sample of unexpected columns.
* `binary_not_two_unique` — per-column value counts + row samples.
* `continuous_coercion_new_nans` — counts and **row samples per column** showing original values that became NaN after coercion.

If you want me to **merge this into your full 2.0.1A cell** and return a single consolidated block, I can do that too.


<details>
<summary>
------- 2.0.1: schema guard: Full V5 + 'problems=[]' collector-----------
</summary>

Got it—here’s your **single, consolidated 2.0.1A** validator cell with a `problems = []` collector and **row samples** for failing rules. It logs to your CSV (if `SECTION2_REPORT_PATH` is set) **before** raising, and also writes a sidecar JSON (`*_issues.json`) with full details.

```python
# 2.0.1A üß©üîí Dataset Guard (Auto-Load + Target + Schema Consistency Check /Guard/ report)
# - Validates df against config/feature_schema.yaml
# - problems[] collector with row samples for failing rules
# - Appends concise CSV row to SECTION2_REPORT_PATH and writes *_issues.json (if path provided)
# - Notebook-friendly: fails after logging when RAISE_ON_CRITICAL=True

from pathlib import Path
from typing import List, Dict, Any
import pandas as pd
import yaml

# --------- Settings ----------
SCHEMA_PATH = Path("config/feature_schema.yaml")
AUTO_LOAD_PATTERN = ["**/telco_*.parquet", "**/telco_*.csv"]
RAISE_ON_CRITICAL = True   # flip to False if you want to continue on FAIL

# ------- Inline Schema Fallback -----------
INLINE_SCHEMA = {
    "target": "Churn_flag",
    "binary": {
        "columns": ["gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling"]
    },
    "continuous": {
        "columns": ["tenure", "MonthlyCharges", "TotalCharges"]
    },
    "categorical": {
        "columns": [
            "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
            "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
            "Contract", "PaymentMethod"
        ]
    }
}

# =========================
# problems[] collector
# =========================
problems: List[Dict[str, Any]] = []

def _sample_rows(df, mask=None, cols=None, n=5) -> List[Dict[str, Any]]:
    """Return up to n row samples as list-of-dicts. If mask is None/empty, returns []."""
    try:
        if mask is None:
            return []
        sub = df.loc[mask, cols] if cols is not None else df.loc[mask]
        if sub.empty:
            return []
        return sub.head(n).to_dict(orient="records")
    except Exception:
        return []

def _add_problem(rule: str,
                 severity: str,
                 message: str,
                 affected_cols: List[str] = None,
                 sample_rows: List[Dict[str, Any]] = None,
                 extras: Dict[str, Any] = None):
    problems.append({
        "rule": rule,
        "severity": severity,                  # "CRITICAL" | "WARN" | "INFO"
        "message": message,
        "affected_cols": affected_cols or [],
        "sample_rows": sample_rows or [],
        "extras": extras or {},
    })

# --------- Load schema with fallback ----------
# The inline schema is used when the YAML file is missing, cannot be parsed,
# or does not contain a mapping. The last case matters because safe_load
# returns None for an empty file (and a list/scalar for other content), which
# would otherwise break the schema.get(...) calls below.
if SCHEMA_PATH.exists():
    try:
        schema = yaml.safe_load(SCHEMA_PATH.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to parse schema ({e}); using inline fallback.")
        schema = INLINE_SCHEMA
    else:
        if isinstance(schema, dict):
            print(f"üìò Loaded schema from {SCHEMA_PATH}")
        else:
            print(
                f"‚ö†Ô∏è Failed to parse schema (expected a mapping, got "
                f"{type(schema).__name__}); using inline fallback."
            )
            schema = INLINE_SCHEMA
else:
    print("üìò Using inline fallback schema (YAML not found)")
    schema = INLINE_SCHEMA

schema_version = schema.get("version", "n/a")
print(f"Schema version: {schema_version}")
TARGET_NAME = schema.get("target", "Churn_flag")  # set early

# --------- Locate / load dataframe if df not present ----------
files = []  # safe to print even if df existed
if "df" not in locals():
    root = Path.cwd()
    files = [p for pat in AUTO_LOAD_PATTERN for p in root.glob(pat)]
    files = sorted(files, key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError("‚ùå No dataset found (looked for telco_*.parquet/csv).")
    latest = files[0]
    print(f"üì¶ Auto-loading dataset: {latest}")
    ext = latest.suffix.lower()
    if ext == ".parquet":
        df = pd.read_parquet(latest)
    elif ext == ".csv":
        df = pd.read_csv(latest)
    else:
        raise ValueError(f"Unsupported format: {ext}")
else:
    print("‚úÖ Using dataset already in memory (df)")

print(f"üîé Candidate data files found: {len(files)}")

# Make a safe working copy
df = df.copy()

# Normalize columns (trim only; you can also lower/underscores if desired)
df.columns = df.columns.str.strip()

# =========================
# Hook 1: Duplicate columns
# =========================
dupes = df.columns[df.columns.duplicated()].tolist()
if dupes:
    _add_problem(
        rule="duplicate_columns",
        severity="WARN",
        message=f"Duplicate column names detected: {dupes}",
        affected_cols=dupes
    )
    df = df.loc[:, ~df.columns.duplicated()].copy()
    print(f"‚ÑπÔ∏è Dropped {len(dupes)} duplicate column(s)")

# ==================================================
# Target creation: Churn -> TARGET_NAME (robust map)
# ==================================================
# Create target if missing, from 'Churn' if available
unmapped_mask = None  # to sample failures later
if TARGET_NAME not in df.columns:
    if "Churn" in df.columns:
        print(f"‚öôÔ∏è Creating {TARGET_NAME} from 'Churn' ‚Ä¶")
        s = df["Churn"]

        # Build a textual yes/no series using both bools and strings numerics safely
        if pd.api.types.is_bool_dtype(s):
            churn_text = s.map({True: "yes", False: "no"}).astype("string")
        else:
            # First map actual Python bools present in object dtype, else NA
            churn_text = s.map({True: "yes", False: "no"})
            # Where unmapped, use normalized string
            needs_text = churn_text.isna()
            churn_text = churn_text.astype("string")
            churn_text.loc[needs_text] = (
                s.astype("string").str.strip().str.lower()
            )

        # Normalize typical variants ‚Üí canonical yes/no
        churn_text = churn_text.map({
            "yes":"yes","y":"yes","1":"yes","true":"yes",
            "no":"no","n":"no","0":"no","false":"no"
        })

        # Map to 0/1 nullable
        df[TARGET_NAME] = churn_text.map({"no": 0, "yes": 1}).astype("Int8")

        # Collect unmapped sample rows (NA after mapping)
        unmapped_mask = df[TARGET_NAME].isna()
        if unmapped_mask.any():
            unmapped_counts = (
                df.loc[unmapped_mask, "Churn"]
                  .astype("string").str.strip().str.lower()
                  .value_counts().to_dict()
            )
            sample_cols = ["customerID", "Churn"] if "customerID" in df.columns else ["Churn"]
            samples = _sample_rows(df, unmapped_mask, cols=sample_cols, n=5)
            _add_problem(
                rule="target_mapping_unmapped",
                severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
                message="Could not map some 'Churn' values to 0/1.",
                affected_cols=[TARGET_NAME],
                sample_rows=samples,
                extras={"unmapped_value_counts": unmapped_counts}
            )

        # If everything mapped, compact the dtype
        if df[TARGET_NAME].notna().all():
            df[TARGET_NAME] = df[TARGET_NAME].astype("int8")
            print(f"‚úÖ Created {TARGET_NAME} and verified dtype int8")
    else:
        _add_problem(
            rule="target_missing",
            severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
            message=f"Target '{TARGET_NAME}' missing and no 'Churn' to derive from.",
            affected_cols=[TARGET_NAME]
        )
        print(f"‚ùå Target '{TARGET_NAME}' missing and no 'Churn' to derive from.")

# -------- Binary integrity of target ---------
target_binary_problem = None
if TARGET_NAME in df.columns:
    u = sorted(df[TARGET_NAME].dropna().unique().tolist())
    if u != [0, 1]:
        target_binary_problem = f"Target '{TARGET_NAME}' must be binary 0/1. Found unique={u}"
        # sample rows where target is not 0/1
        bad_mask = df[TARGET_NAME].notna() & ~df[TARGET_NAME].isin([0, 1])
        samples = _sample_rows(
            df, bad_mask,
            cols=["customerID", TARGET_NAME] if "customerID" in df.columns else [TARGET_NAME],
            n=5
        )
        _add_problem(
            rule="target_not_binary",
            severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
            message=target_binary_problem,
            affected_cols=[TARGET_NAME],
            sample_rows=samples,
            extras={"unique_values": u}
        )
        print(f"‚ùå {target_binary_problem}")
    else:
        print(f"‚úÖ Target '{TARGET_NAME}' verified binary (0/1)")

# Fully NA target after mapping
if TARGET_NAME in df.columns and df[TARGET_NAME].isna().all():
    _add_problem(
        rule="target_all_na",
        severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
        message=f"Target '{TARGET_NAME}' is entirely NA after mapping.",
        affected_cols=[TARGET_NAME],
    )
    print(f"‚ùå Target '{TARGET_NAME}' is entirely NA after mapping.")

# --------- Expected columns from schema ----------
def _norm_name(name) -> str:
    """Case/whitespace-insensitive key used to spot near-miss column names.

    ``str()`` keeps the comparison from raising on non-string column labels.
    """
    return str(name).strip().lower()

groups = [g for g in ("binary","continuous","categorical") if g in schema]
expected = pd.Index([TARGET_NAME])
for g in groups:
    expected = expected.union(pd.Index(schema[g].get("columns", [])))

cols = pd.Index(df.columns)
missing_cols = expected.difference(cols).tolist()
unexpected_cols = cols.difference(expected).tolist()

# Near-miss suggestions (case/trim variants)
expected_lower = {_norm_name(c) for c in expected}
near_miss = [c for c in unexpected_cols if _norm_name(c) in expected_lower]
if near_miss:
    print(f"‚ÑπÔ∏è Unexpected but name-similar columns (check casing/whitespace): {near_miss}")

# Log missing columns w/ near-miss suggestions
if missing_cols:
    # normalized name -> actual column present in df
    lower_cols = {_norm_name(c): c for c in df.columns}
    suggestions = {
        m: lower_cols[_norm_name(m)]
        for m in missing_cols
        if _norm_name(m) in lower_cols
    }
    _add_problem(
        rule="missing_columns",
        severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
        message=f"Missing expected columns: {missing_cols}",
        affected_cols=missing_cols,
        extras={"near_miss_suggestions": suggestions}
    )

# Log unexpected columns (show quick first-rows sample)
if unexpected_cols:
    keep = unexpected_cols[:6]
    # slice(None) selects every row; _sample_rows then keeps the first 3
    samples = _sample_rows(df, mask=slice(None), cols=keep, n=3)
    _add_problem(
        rule="unexpected_columns",
        severity="WARN",
        message=f"Found {len(unexpected_cols)} unexpected columns.",
        affected_cols=unexpected_cols,
        sample_rows=samples
    )

# --------- Binary columns (should be 2-unique) ---------
binary_not_two = []
for c in schema.get("binary", {}).get("columns", []):
    if c in df.columns:
        nunq = int(df[c].dropna().nunique())
        if nunq != 2:
            binary_not_two.append((c, nunq))

# Every column in binary_not_two is known to exist in df (checked above).
for col, nunq in binary_not_two:
    normalized_counts = (
        df[col].astype("string").str.strip().str.lower()
          .value_counts(dropna=False)
    )
    # Stringify keys (a missing value would otherwise be a pd.NA key) and cast
    # counts to int so json.dumps(problems) can serialize this dict.
    counts = {str(value): int(count) for value, count in normalized_counts.items()}
    sample_cols = ["customerID", col] if "customerID" in df.columns else [col]
    samples = _sample_rows(df, mask=df[col].notna(), cols=sample_cols, n=5)
    _add_problem(
        rule="binary_not_two_unique",
        severity="CRITICAL" if RAISE_ON_CRITICAL else "WARN",
        message=f"Binary column '{col}' has {nunq} unique values (expected 2).",
        affected_cols=[col],
        sample_rows=samples,
        extras={"value_counts": counts}
    )

# --------- Enforce numeric dtype for continuous columns (with samples) ---------
bad_numeric = {}
coercion_samples = {}   # col -> list-of-row dicts (original values that became NaN)
# The identifier column (when present) is the only context the samples carry.
_id_cols = ["customerID"] if "customerID" in df.columns else []

for c in schema.get("continuous", {}).get("columns", []):
    if c not in df.columns:
        continue

    s_before = df[c].copy()
    s_after = pd.to_numeric(s_before, errors="coerce")
    df[c] = s_after

    # Values that were present before coercion but are NaN afterwards
    new_nan_mask = s_before.notna() & s_after.isna()
    new_nan_count = int(new_nan_mask.sum())
    if new_nan_count == 0:
        continue

    bad_numeric[c] = new_nan_count
    # Show the original (pre-coercion) values via the aux column "_before";
    # only the id column is carried along, not a copy of the whole frame.
    coercion_samples[c] = _sample_rows(
        df[_id_cols].assign(_before=s_before),
        new_nan_mask,
        n=5
    )

if bad_numeric:
    print(f"‚ö†Ô∏è Continuous columns coerced to numeric with new NaNs: {bad_numeric}")
    _add_problem(
        rule="continuous_coercion_new_nans",
        severity="WARN",
        message="Numeric coercion introduced NaNs in continuous columns.",
        affected_cols=list(bad_numeric.keys()),
        sample_rows=[],  # samples are large; keep in extras
        extras={"new_nan_counts": bad_numeric, "samples_per_column": coercion_samples}
    )

# --------- Name-similarity info ---------
if near_miss:
    _add_problem(
        rule="name_similarity_warnings",
        severity="INFO",
        message="Columns that are unexpected but look like case/whitespace variants.",
        affected_cols=near_miss
    )

# --------- Print concise summary (pre-raise) ----------
print("\nüìã Schema Consistency Check")
print(f"Expected columns: {len(expected)} | Found: {len(cols)}")
print(f"‚ùå Missing columns: {missing_cols}" if missing_cols else "‚úÖ No missing columns")
if unexpected_cols:
    N = 20
    head = unexpected_cols[:N]
    tail = unexpected_cols[-N:] if len(unexpected_cols) > N else []
    print(f"\n‚ö†Ô∏è Unexpected columns ({len(unexpected_cols)}):")
    for c in head: print(f"  - {c}")
    if tail and tail != head:
        print("  ...")
        for c in tail: print(f"  - {c}")
else:
    print("‚úÖ No unexpected columns")
print(f"‚ùå Binary columns not 2-unique: {binary_not_two}" if binary_not_two else "‚úÖ All binary columns are 2-unique")

# --------- Compute status ----------
critical_reasons = []
if missing_cols: critical_reasons.append("missing_cols")
if binary_not_two: critical_reasons.append("binary_not_two")
if target_binary_problem: critical_reasons.append("target_not_binary")
if TARGET_NAME in df.columns and df[TARGET_NAME].isna().all():
    critical_reasons.append("target_all_na")

# Any finding logged with CRITICAL severity must fail the run as well.
# Without this, e.g. "target_mapping_unmapped" was reported as CRITICAL while
# the status stayed WARN and nothing was raised. Rules already represented by
# the reasons above are skipped to avoid listing the same failure twice.
_rules_already_counted = {
    "missing_columns", "binary_not_two_unique", "target_not_binary", "target_all_na"
}
for _p in problems:
    _rule = _p["rule"]
    if (
        _p["severity"] == "CRITICAL"
        and _rule not in _rules_already_counted
        and _rule not in critical_reasons
    ):
        critical_reasons.append(_rule)

critical = bool(critical_reasons)
status = "FAIL" if critical else ("WARN" if unexpected_cols or problems else "OK")
print(f"\nStatus: {status}" + (f" | reasons: {', '.join(critical_reasons)}" if critical else ""))

# --------- Append to SECTION2_REPORT_PATH (BEFORE any raise) ----------
if "SECTION2_REPORT_PATH" in globals():
    report_path = Path(SECTION2_REPORT_PATH)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    if TARGET_NAME in df.columns:
        counts = df[TARGET_NAME].value_counts(dropna=False)
        ratio = float(counts.get(1, 0)) / float(counts.sum()) if counts.sum() else 0.0
        balance_str = f"{counts.to_dict()} | churn_rate={ratio:.4f}"
    else:
        balance_str = "target_missing"

    schema_chunk = pd.DataFrame([
        {"section":"0.1_schema_check","rule":"schema_version","value": schema_version},
        {"section":"0.1_schema_check","rule":"missing_cols","value":";".join(missing_cols) or "none"},
        {"section":"0.1_schema_check","rule":"unexpected_cols","value":";".join(unexpected_cols) or "none"},
        {"section":"0.1_schema_check","rule":"binary_not_two","value":";".join(f"{c}:{n}" for c,n in binary_not_two) or "none"},
        {"section":"0.1_schema_check","rule":"class_balance","value": balance_str},
        {"section":"0.1_schema_check","rule":"status","value": status}
    ])
    write_header = not report_path.exists()
    schema_chunk.to_csv(report_path, mode="a", header=write_header, index=False, encoding="utf-8")
    print(f"üìù Appended schema check to: {report_path}")

    # Write issues JSON alongside the CSV
    try:
        import json
        issues_json = report_path.with_name(report_path.stem.replace(".csv","") + "_issues.json")
        issues_json.write_text(json.dumps(problems, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"üíæ Wrote detailed issues JSON to: {issues_json}")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to write issues JSON: {e}")
else:
    print("‚ÑπÔ∏è SECTION2_REPORT_PATH not set ‚Äî skipping report & issues JSON append.")

# --------- Pretty-print problems now ----------
print(f"\nüßæ Problems collected: {len(problems)}")
for i, p in enumerate(problems, 1):
    print(f"\n[{i}] {p['severity']} ‚Äî {p['rule']}")
    print(f"    {p['message']}")
    if p.get("affected_cols"):
        print(f"    Affected cols: {p['affected_cols'][:8]}{' ‚Ä¶' if len(p['affected_cols'])>8 else ''}")
    if p.get("sample_rows"):
        print(f"    Samples (up to 5 rows):")
        for r in p["sample_rows"][:5]:
            print(f"      - {r}")
    if p.get("extras"):
        keys = list(p["extras"].keys())
        print(f"    Extras keys: {keys}")

# --------- Raise if critical AFTER logging ---------
if critical and RAISE_ON_CRITICAL:
    raise ValueError("Schema validation failed (critical issues above).")

# -------------------- Summary dict (print) --------------------
schema_summary = {
    "status": status,
    "target": TARGET_NAME,
    "schema_version": schema_version,
    "missing_cols": missing_cols,
    "unexpected_cols": unexpected_cols,
    "binary_not_two": binary_not_two,
    "rows": int(df.shape[0]),
    "cols": int(df.shape[1]),
}
print("\nüì¶ Schema summary:")
for k, v in schema_summary.items():
    print(f"  {k}: {v}")
```

Tip: if you want lowercased, underscore column names across the board, change the normalize line to:

```python
df.columns = (df.columns
              .str.strip()
              .str.replace(r"\s+", "_", regex=True)
              .str.lower())
```


In [None]:
# LIB_01_EDA_scripts
class DataQualityChecker:
    """Collect data-quality findings for the ``TotalCharges`` column.

    Findings accumulate in ``self.issues`` (a list of dicts). Check methods
    return ``self`` so a check can be chained straight into ``get_report()``.
    """

    def __init__(self, df):
        # df is expected to provide 'TotalCharges', 'MonthlyCharges' and
        # 'tenure' columns; they are read by check_total_charges().
        self.df = df
        self.issues = []

    def check_total_charges(self):
        """Run the type and logic checks on ``TotalCharges``.

        Appends up to two issue dicts to ``self.issues``:
        ``non_numeric_values`` (values that do not parse as numbers) and
        ``logical_inconsistency`` (TotalCharges below MonthlyCharges although
        tenure > 1). Returns ``self`` for chaining.
        """
        col = 'TotalCharges'
        n_rows = len(self.df)

        # Parse once; both checks work on the coerced values.
        numeric_charges = pd.to_numeric(self.df[col], errors='coerce')

        # Type check: applies to any non-numeric dtype (object as well as
        # pandas string dtypes), not only dtype == 'object'.
        if not pd.api.types.is_numeric_dtype(self.df[col]):
            invalid_mask = numeric_charges.isna()
            invalid_count = int(invalid_mask.sum())

            if invalid_count > 0:
                # Analyze pattern
                invalid_df = self.df[invalid_mask]

                self.issues.append({
                    'column': col,
                    'issue_type': 'non_numeric_values',
                    'count': invalid_count,
                    'percentage': invalid_count / n_rows * 100,
                    'unique_invalid_values': invalid_df[col].unique().tolist(),
                    'pattern': {
                        'all_tenure_zero': invalid_df['tenure'].eq(0).all(),
                        'avg_monthly_charges': invalid_df['MonthlyCharges'].mean()
                    },
                    'recommendation': 'Replace with MonthlyCharges for tenure=0 customers'
                })

        # Logic check
        logic_issues = (numeric_charges < self.df['MonthlyCharges']) & (self.df['tenure'] > 1)

        if logic_issues.any():
            logic_count = int(logic_issues.sum())
            self.issues.append({
                'column': col,
                'issue_type': 'logical_inconsistency',
                'count': logic_count,
                # n_rows > 0 here because at least one row matched
                'percentage': logic_count / n_rows * 100,
                'description': 'TotalCharges less than MonthlyCharges for tenure > 1'
            })

        return self

    def get_report(self):
        """Render ``self.issues`` as a human-readable multi-line string.

        Optional keys ('percentage', 'description', 'unique_invalid_values',
        'pattern', 'recommendation') are printed only when present, so an
        issue dict lacking one of them no longer raises ``KeyError``.
        """
        if not self.issues:
            return "‚úÖ No issues found in TotalCharges"

        parts = ["üîç TotalCharges Quality Report\n" + "="*40 + "\n"]
        for issue in self.issues:
            parts.append(f"\n‚ö†Ô∏è Issue: {issue['issue_type']}\n")
            if 'percentage' in issue:
                parts.append(f"   Count: {issue['count']} ({issue['percentage']:.1f}%)\n")
            else:
                parts.append(f"   Count: {issue['count']}\n")
            if 'description' in issue:
                parts.append(f"   Description: {issue['description']}\n")
            if 'unique_invalid_values' in issue:
                parts.append(f"   Values found: {issue['unique_invalid_values']}\n")
            if 'pattern' in issue:
                parts.append(f"   Pattern: All tenure=0: {issue['pattern']['all_tenure_zero']}\n")
            if 'recommendation' in issue:
                parts.append(f"   ‚ú® Fix: {issue['recommendation']}\n")

        return "".join(parts)

# Usage
checker = DataQualityChecker(df)
print(checker.check_total_charges().get_report())

<details>
<summary><h2> 

### üîß `Operators in Python` </h2></summary>

<h4>

### 🔧 Bitwise Assignment Operators in Python
</h4>
<ul>
  <li>Create virtual environment, install dependencies</li>
  <li>Load data from BigQuery / local CSV</li>
</ul>

| Operator | Name                       | Example     | Equivalent To    |
|----------|----------------------------|-------------|------------------|
| `&=`     | Bitwise AND assignment     | `a &= b`    | `a = a & b`      |
| `|=`     | Bitwise OR assignment      | `a |= b`    | `a = a | b`      |
| `^=`     | Bitwise XOR assignment     | `a ^= b`    | `a = a ^ b`      |
| `<<=`    | Left shift assignment      | `a <<= b`   | `a = a << b`     |
| `>>=`    | Right shift assignment     | `a >>= b`   | `a = a >> b`     |

># 🔍 Focus on `^=`
**`^=`** toggles the bits of the left operand wherever the right operand has a 1.
- Common use: flipping a single bit (like toggling between 0 and 1 for parity checks).

|Example:|

```python
x = 10      # 0b1010
x ^= 3      # 0b0011
print(x)    # 9 (0b1001)
```

>### ✅ Tip: When to Use `^=`

- Toggling flags or parity (like even/odd checks)
- Swapping values without a temporary variable (though not recommended for readability)
- Bitmask operations
- Competitive programming or algorithm optimizations

Let me know if you'd like a deep dive into how these work with actual bit patterns!

## VERSION 2
def perform_complete_eda(df, save_figures=False):
    """
    Complete EDA workflow using our extracted functions.
    This is the Level 2 culmination:
    - Use functions instead of repetitive code
    - Systematic approach to analysis
    - Clear documentation of findings

    Parameters:
        df: the dataset to analyze.
        save_figures: when True, each analysis figure is written as a PNG
            under ``figures/`` (the directory is created if needed).

    Returns the frame produced by ``create_customer_segments(df)``.
    """
    import os  # function-scope import: no module-level import of os is in view here

    if save_figures:
        # savefig does not create missing directories, so make sure the
        # output folder exists before the first figure is written.
        os.makedirs('figures', exist_ok=True)

    # 1. Data Validation
    validate_dataset(df)

    # 2. Separate variable types
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

    # 3. Analyze each type systematically
    print("\n=== Categorical Variables Analysis ===")
    for col in categorical_cols:
        if col == 'customerID':  # Skip ID
            continue
        fig = analyze_categorical(df, col)
        if save_figures:
            fig.savefig(f'figures/categorical_{col}.png')

    print("\n=== Numerical Variables Analysis ===")
    for col in numerical_cols:
        fig = analyze_numerical(df, col)
        if save_figures:
            fig.savefig(f'figures/numerical_{col}.png')

    # 4. Feature Engineering
    df_enhanced = create_customer_segments(df)

    # 5. Final validation
    print("\n=== Enhanced Dataset ===")
    validate_dataset(df_enhanced)

    return df_enhanced
# utils.py (moved under /Users/b/DATA/PROJECTS/Telco/L2/src/)
from __future__ import annotations
import os
import textwrap
from pathlib import Path
from typing import Iterable, Optional, Tuple, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Project paths
# ---------------------------
# Always resolve project root relative to this utils.py file
PROJECT_ROOT = Path(__file__).resolve().parents[1]   # /Users/b/DATA/PROJECTS/Telco/L2
FIGURES_DIR = PROJECT_ROOT / "figures"

def ensure_dir(path: str | Path) -> None:
    """Create ``path`` as a directory, including parents; no-op if it exists."""
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)

def memory_report(df: pd.DataFrame) -> str:
    """Return the deep memory footprint of ``df`` formatted as ``"<x.xx> MB"``."""
    total_bytes = df.memory_usage(deep=True).sum()
    megabytes = total_bytes / (1024 * 1024)
    return "{:.2f} MB".format(megabytes)

# ---------------------------
# (your existing data prep + validation code unchanged)
# ---------------------------
# ... load_telco_data, validate_dataset, etc. ...

# ---------------------------
# Complete EDA workflow
# ---------------------------
def perform_complete_eda(
    df: pd.DataFrame,
    figures_dir: Path = FIGURES_DIR,  # default fixed to project root /figures
    save_figures: bool = False,
    skip_cols: Optional[Iterable[str]] = ("customerID",),
    target: str = "Churn",
) -> pd.DataFrame:
    """
    Run the Level-2 EDA end-to-end:
      1) Validate the raw frame
      2) Analyze categoricals
      3) Analyze numerics
      4) Apply business logic
      5) Feature engineering
      6) Final validation

    Parameters:
        df: dataset to analyze.
        figures_dir: output directory for figures (str or Path); it is created
            on entry even when ``save_figures`` is False.
        save_figures: when True, every analysis figure is saved as a PNG.
        skip_cols: column names excluded from both analyses. May be any
            iterable (including a one-shot iterator), a single name, or None.
        target: passed through to the analyze_* helpers.

    Returns the frame produced by ``create_customer_segments`` after
    ``apply_business_logic``.
    """
    # Accept plain strings too: the "/" joins below need a Path.
    figures_dir = Path(figures_dir)
    ensure_dir(figures_dir)

    print("1) Validation (raw)")
    validate_dataset(df)

    # Separate types
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Materialize once: the names are tested against two column lists, which
    # would exhaust an iterator after the first pass; a bare string must be
    # treated as one column name rather than matched by substring.
    if skip_cols is None:
        skip = set()
    elif isinstance(skip_cols, str):
        skip = {skip_cols}
    else:
        skip = set(skip_cols)

    if skip:
        categorical_cols = [c for c in categorical_cols if c not in skip]
        numerical_cols = [c for c in numerical_cols if c not in skip]

    print("\n2) Categorical analysis")
    for c in categorical_cols:
        fig = analyze_categorical(df, c, target=target)
        if save_figures:
            fig.savefig(figures_dir / f"categorical_{c}.png")
        plt.close(fig)  # release the figure whether or not it was saved

    print("\n3) Numerical analysis")
    for c in numerical_cols:
        fig = analyze_numerical(df, c, target=target)
        if save_figures:
            fig.savefig(figures_dir / f"numerical_{c}.png")
        plt.close(fig)

    print("\n4) Business logic corrections")
    df2 = apply_business_logic(df)

    print("\n5) Feature engineering")
    df3 = create_customer_segments(df2)

    print("\n6) Validation (enhanced)")
    validate_dataset(df3)

    return df3


In [None]:

---

Parameters
Great question 👌 — let's break down exactly what's happening in that function definition:
def load_telco_data(filepath, optimize_memory=True):

1. Function definition
def ‚Üí defines a new function.
load_telco_data ‚Üí the name of the function (you‚Äôll call it later).

2. Parameters
Inside the parentheses are the parameters (inputs the function expects):
filepath
A positional parameter.
When you call the function, you must supply this argument.
 load_telco_data("data/telco.csv")
In this case, it‚Äôs probably expected to be a string path to a CSV file.
optimize_memory=True
This is a keyword parameter with a default value.
=True means that if you don‚Äôt pass anything for it, Python automatically sets it to True.
You can override it when calling the function:

load_telco_data("data/telco.csv")               # optimize_memory defaults to True
load_telco_data("data/telco.csv", False)        # pass by position
load_telco_data("data/telco.csv", optimize_memory=False)  # pass by keyword
So:
filepath is required.
optimize_memory is optional, defaulting to True.

3. The trailing colon :
The colon ends the function header and tells Python that what follows (indented) will be the function body.
Example body (just illustrative):
def load_telco_data(filepath, optimize_memory=True):
    """Illustrative example: read the CSV at ``filepath`` into a DataFrame.

    When ``optimize_memory`` is true (the default), the ``customerID`` column
    is cast to the ``category`` dtype before the frame is returned.
    """
    import pandas as pd
    df = pd.read_csv(filepath)

    if optimize_memory:
        # do some memory optimization steps
        df = df.astype({"customerID": "category"})
    return df

4. Why use a default argument?
Convenience: Most of the time, you may want memory optimization → so True is the default.
Flexibility: If you don't want optimization (e.g., for debugging), you can set it to False.

‚úÖ Summary:
filepath ‚Üí required argument.
optimize_memory=True ‚Üí optional argument, defaults to True unless you override it.
The colon : starts the function‚Äôs body.

---

Absolutely! Moving from a monolithic to a modular project structure is a smart goal. There are several excellent resources to guide you through planning, organizing, and executing that transformation‚Äîespecially in Python. Here‚Äôs a curated guide to get you started:

1. Python Structure & Modular Refactoring
Python Guide: Structuring Your Project
A foundational primer on organizing Python projects with clean modules, packages, clear layouts, and import structure.
 Provides solid guidance on how modules and filesystem layout affect maintainability and clarity.
 (docs.python-guide.org)
Dagster Blog: Best Practices for Structuring Python Projects
Covers nine practical best practices‚Äîfrom organizing folders, naming conventions, to modularization. Great if you're aiming to apply modular design with maintainable patterns.
 (Dagster)

2. Modular Monolith Architecture
Breadcrumbs Collector: Modular Monolith in Python
Explores how to divide a monolith into thoughtfully separated, encapsulated components‚Äîeach with its own API and internal structure.
 Discusses leveraging Clean Architecture selectively per component for pragmatic modularization.
 (breadcrumbscollector.tech)
Dev.to: Structuring Modular Monoliths
Emphasizes domain-based modules (e.g., Product, Order) with clear boundaries, own persistence, and controlled inter-module communication‚Äîeven within a single codebase.
 (DEV Community)
Medium: Modular Folder Organization
Proposes a modular folder layout aligned with Clean or Hexagonal Architecture‚Äîeach module containing UI, business logic, and data access enabling easier testing and future extraction.
 (Medium)

3. Transition Path & Reasoning
Medium ‚Äì Modular Monolithic Architecture
 A practical architecture pattern that divides the monolith into independent modules with explicit boundaries. Useful as an intermediate step before microservices.
 (Medium)


Milan Jovanović – What Is a Modular Monolith?
 A recent overview of modular monolith benefits and why it's often better to start here rather than with microservices. Includes cautions drawn from Martin Fowler's notes and points on scalability.
 (milanjovanovic.tech)



4. Community Wisdom
From a popular Reddit discussion on structuring monoliths before splitting into services:
“Get the monolith right first… if you can't get one repo in the right state, … you'll have lots of things in a bad state instead of just one…”
 “Put your focus first on finding bounded contexts—what is service A, service B…”
 Very practical advice: refactor internally, identify domain boundaries, then modularize.
 (Reddit)

5. Academic / Tool-supported Methods
For long-term or large-scale projects, automated tools may help:
Hierarchical-DBSCAN for Extracting Microservices (research paper)
 Uses clustering to suggest how classes could form microservices. Useful in mapping dependencies before refactor.
 (arXiv)


Feature Table Approach for Decomposition
 Semi-automated methodology to help breakdown monolith into service candidates based on feature correlations.
 (arXiv)


CARGO: AI-Guided Dependency Analysis
 Advanced ML tool that analyzes code dependencies to guide microservice extraction with minimal cross-service coupling.
 (arXiv)



Summary Table: Resources at a Glance
Stage
Resource & Focus
Basic Structuring
Python Guide, Dagster's Best Practices
Python Modular Monolith
Breadcrumbs Collector, Dev.to, Medium tutorial
Why Modular First?
Medium architecture overview, modular monolith benefits
Domain Modeling/Planning
Reddit discussion on bounded contexts and modular breakdown
Tool-assisted Refactoring
Academic research (DBSCAN, Feature Table, CARGO)


Next Steps for You
Start Small
 Begin by reorganizing your monolith into modules by feature or domain. Use the Breadcrumbs Collector or Dev.to guides as templates.


Enforce Boundaries
 Ensure each module only exposes public APIs. Avoid breaking encapsulation‚Äîsee the Clean Architecture distinction in some parts of your system.


Test Rigorously
 Move functions into modules gradually and validate via tests (unit/integration).


Iterate Toward Modularity
 Don‚Äôt aim for perfection upfront. Refactor iteratively, using domain clarity and testing as your guardrails.


Prepare for Potential Extraction
 When modules are clean, they‚Äôre easier to ‚Äúextract‚Äù into services later‚Äîthanks to good boundaries and folder layout.

# Quick Reference Commands
>### Once everything is set up, you can download any Kaggle dataset with:
```py
## Command line
kaggle datasets download -d [dataset-slug] -p [destination-path]

# Python
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
api.dataset_download_files(dataset='[dataset-slug]', path='[destination]', unzip=True)
```

**Best practices for setting up a project in Windsurf (your IDE on Mac)** so it’s clean, reproducible, and scalable. Here’s a checklist that blends **data science / software engineering conventions** with the Windsurf environment:

---

# 🔹 1. Create a Clean Project Structure

Inside Windsurf, set up a folder like:

```
my_project/
├── src/             # source code (modules, utils, pipelines)
│   └── my_project/
│       ├── __init__.py
│       ├── data_utils.py
│       ├── model_utils.py
├── notebooks/       # Jupyter notebooks (exploration, EDA)
├── data/            # (gitignored) raw & processed datasets
│   ├── raw/
│   └── processed/
├── logs/            # logging outputs
├── tests/           # pytest unit tests
├── .gitignore
├── requirements.txt # or pyproject.toml
├── README.md
└── venv/ or .venv/  # virtual environment (gitignored)
```

---

# 🔹 2. Set Up Git for Version Control

In Windsurf terminal:

```bash
git init
# Ignore local-only artifacts from day one. Both venv/ and .venv/ are listed
# because either name may be used for the virtual environment directory.
echo "venv/" >> .gitignore
echo ".venv/" >> .gitignore
echo "data/" >> .gitignore
echo "logs/" >> .gitignore
echo "__pycache__/" >> .gitignore
git add .
git commit -m "Initial commit"
```

Then link to GitHub if needed:

```bash
git remote add origin https://github.com/username/repo.git
git branch -M main
git push -u origin main
```

---

# 🔹 3. Create & Activate a Virtual Environment

From Windsurf terminal:

```bash
python3 -m venv .venv
source .venv/bin/activate   # Mac/Linux
```

Then install basics:

```bash
pip install -U pip wheel setuptools
pip install jupyter pandas numpy matplotlib scikit-learn
pip freeze > requirements.txt
```

---

# 🔹 4. Configure Windsurf to Use the Venv

* In **Command Palette** (`Cmd+Shift+P`), search for:
  **Python: Select Interpreter** → choose your `.venv`.
* For notebooks, install kernel:

  ```bash
  python -m ipykernel install --user --name=my_project
  ```

---

# 🔹 5. Add Essential Config Files

* **`.gitignore`** → ignore `venv/`, `data/`, `logs/`, `__pycache__/`.
* **`README.md`** → quick overview (purpose, setup, usage).
* **`requirements.txt`** or **`pyproject.toml`** → environment reproducibility.

---

# 🔹 6. Logging & Output Management

* Direct logs to `/logs/` (not `src/`).
* Save notebooks outputs to `/notebooks/`.
* Save raw vs processed data to `/data/` (with subfolders).

---

# 🔹 7. Testing Setup (Optional but Powerful)

* Use `pytest`:

  ```bash
  pip install pytest
  ```
* Add tests under `/tests/`:

  ```
  tests/
    test_data_utils.py
  ```

Run with:

```bash
pytest
```

---

# 🔹 8. Keep Notebooks Monolithic → then Modularize

* Do quick experiments in `/notebooks/`.
* Once stable, move reusable code into `/src/my_project/`.
* Import it back into notebooks with:

  ```python
  from my_project import data_utils
  ```

---

# 🔹 9. Optional (but useful)

* **Pre-commit hooks** for formatting/linting (`black`, `flake8`).
* **.env file** for secrets/keys (don’t commit it).
* **Makefile** or **task runner** for common commands (`make clean`, `make run`).

---

✅ **TL;DR**:
When setting up a project in Windsurf, focus on:

1. Clean folder layout.
2. Virtual environment tied to the IDE.
3. Git + `.gitignore` from day one.
4. Reproducibility (`requirements.txt`).
5. Separation of code (`src/`), data (`data/`), logs (`logs/`), and notebooks.

---

Would you like me to make you a **ready-to-run Windsurf project template** (folders, `.gitignore`, `requirements.txt`, and a sample `src/utils.py`) that you can drop into your next project?


# How to Organize Your Code Snippets Library

## Option 1: Markdown Files by Category (Recommended for Beginners)

Create separate `.md` files for each category:

### File Structure:
```
code_library/
├── missing_values.md
├── data_types.md
├── anomaly_detection.md
├── data_cleaning.md
├── basic_exploration.md
└── visualization.md
```

### Example: `missing_values.md`
```markdown
# Missing Values Code Snippets

## Quick Count
```python
# Most concise
df.isna().sum().sum()
```

## Detailed Analysis
```python
# Show which columns have missing values
missing_data = df.isnull().sum()
print(missing_data[missing_data > 0])
```

## Business Context Check
```python
# Check if missing values make business sense
missing_mask = df['TotalCharges'].isnull()
print(df[missing_mask][['customerID', 'tenure', 'MonthlyCharges']])
```

## When to Use Each:
- **Quick count**: Initial dataset assessment
- **Detailed analysis**: When you find missing values
- **Business context**: Understanding WHY values are missing
```

---

## Option 2: Jupyter Notebook Library

Create `code_snippets.ipynb` with sections:

```python
# =============================================================================
# MISSING VALUES TOOLKIT
# =============================================================================

# Quick check
def quick_missing_check(df):
    """Count the missing cells in ``df`` and return the grand total."""
    per_column_missing = df.isna().sum()
    return per_column_missing.sum()

# Detailed check  
def detailed_missing_check(df):
    """Report per-column missing counts, keeping only columns that have any."""
    null_counts = df.isnull().sum()
    columns_with_gaps = null_counts > 0
    return null_counts[columns_with_gaps]

# Usage examples and when to use each method
```

---

## Option 3: Python Module (Advanced)

Create `data_toolkit.py`:

```python
import pandas as pd

class DataExplorer:
    """Collection of data exploration utilities.

    Every helper is a static method that takes a pandas DataFrame and only
    reads from it.
    """

    @staticmethod
    def missing_values_quick(df):
        """Return the total number of missing cells across the whole frame."""
        return df.isna().sum().sum()

    @staticmethod
    def missing_values_detailed(df):
        """Return per-column missing counts for the columns that have any.

        When nothing is missing, prints a notice and returns ``None``.
        """
        missing = df.isnull().sum()
        if missing.sum() > 0:
            return missing[missing > 0]
        print("No missing values found")
        return None

    @staticmethod
    def find_anomalies_in_column(df, column):
        """Find non-numeric values in a supposedly numeric column.

        Args:
            df: DataFrame to inspect.
            column: Label of the column to check.

        Returns:
            A list of ``(value, count)`` tuples, in order of first appearance,
            for every distinct value that ``pd.to_numeric`` cannot convert.
            Missing values are reported once, with the number of missing cells.
        """
        series = df[column]
        anomalies = []
        missing_reported = False
        for val in series.unique():
            if not pd.isna(pd.to_numeric(val, errors='coerce')):
                continue  # converts cleanly, not an anomaly
            if pd.isna(val):
                # NaN never compares equal to itself, so an equality test
                # would always count 0; count the missing cells directly.
                if missing_reported:
                    continue
                missing_reported = True
                count = int(series.isna().sum())
            else:
                count = int((series == val).sum())
            anomalies.append((val, count))
        return anomalies

# Usage: from data_toolkit import DataExplorer
# DataExplorer.missing_values_quick(df)
```

---

## Option 4: GitHub Repository Structure

```
data_analysis_toolkit/
├── README.md
├── missing_values/
│   ├── README.md
│   ├── quick_checks.py
│   └── detailed_analysis.py
├── data_cleaning/
│   ├── README.md
│   └── text_to_numeric.py
├── anomaly_detection/
│   ├── README.md
│   └── find_weird_values.py
└── examples/
    ├── telco_dataset_examples.ipynb
    └── general_examples.ipynb
```

---

## My Recommendation for You: Start with Option 1

### Why Markdown Files Work Best Initially:
1. **Easy to search** - Ctrl+F across files
2. **Copy-paste friendly** - Code blocks ready to use
3. **Documentation built-in** - Context and when-to-use notes
4. **Version control friendly** - Works great with Git
5. **Portable** - Works on any platform

### Sample Categories for Your Library:

#### `missing_values.md`
- Quick count methods
- Detailed analysis
- Business logic checks

#### `data_types.md`  
- Check data types
- Convert text to numeric
- Handle conversion errors

#### `anomaly_detection.md`
- Find weird values
- Spot empty spaces
- Identify non-convertible entries

#### `data_cleaning.md`
- Replace missing values
- Handle empty spaces
- Fix data type issues

#### `quick_exploration.md`
- Dataset shape and info
- Sample data viewing
- Basic statistics

#### `visualization.md`
- Simple plots
- Quick correlation checks
- Distribution analysis

---

## Template for Each Category File:

```markdown
# [Category Name] - Code Snippets

## Quick Reference
[One-liner solutions]

## Detailed Methods  
[More comprehensive approaches]

## When to Use Each
[Decision guide]

## Real Examples
[From actual datasets you've worked with]

## Common Pitfalls
[Mistakes to avoid]

## Related Categories
[Links to other relevant snippet categories]
```

---

## Pro Tips:

### 1. Include Context
Don't just store code - store the **why** and **when**:
```markdown
## Quick Missing Values Check
```python
df.isna().sum().sum()
```
**When to use:** Initial dataset assessment, want just a number
**Returns:** Total count of missing values across entire dataset
**Example output:** `42` (means 42 missing values total)
```

### 2. Tag by Use Case
```markdown
**Tags:** #initial-exploration #data-quality #quick-check
```

### 3. Include Failed Attempts
```markdown
## What Doesn't Work
```python
# DON'T DO THIS - checks length, not missing count
if len(missing_data) > 0:  # Always true!
```
**Why it fails:** Length is number of columns, not missing values
```

### 4. Version Your Learning
```markdown
## Evolution of My Approach
- **Level 0:** `df.isnull().sum()`  
- **Level 1:** `df.isnull().sum()[df.isnull().sum() > 0]`
- **Level 2:** Business context analysis with missing patterns
```

This way, you're not just collecting code - you're building a **learning journey** that shows your progression and thinking process to hiring managers!

Start with simple markdown files, and as your library grows, you can always reorganize into more sophisticated structures.

You're building a **personal code library** - this is exactly what professional data scientists do. Here are several effective approaches:

## 🎯 **My Recommendation: Start Simple with Markdown Files**

Create a folder structure like this:
```
my_code_library/
├── missing_values.md
├── data_types.md
├── anomaly_detection.md
├── data_cleaning.md
└── quick_exploration.md
```

## 📝 **Why Markdown Files Work Best:**

1. **Easy to search** - Ctrl+F to find what you need
2. **Copy-paste ready** - Code blocks ready to use  
3. **Context included** - Notes on when/why to use each method
4. **GitHub friendly** - Looks professional in your portfolio

## 💡 **Example: `missing_values.md`**

```markdown
# Missing Values Toolkit

## Quick Count (Most Used)
```python
df.isna().sum().sum()  # most concise one-liner
```

## Show Details When Needed  
```python
df.isnull().sum()[df.isnull().sum() > 0]  # Only non-zero columns
```

## When to Use Each:
- **Quick count**: Initial assessment, just need total
- **Detailed view**: When you find missing values and need to know where
```

## 🚀 **Start Your Library Today:**

1. **Create the folder structure**
2. **Start with one file** - `missing_values.md`
3. **Add your variations** from today's session
4. **Include context** - when to use each method
5. **Build as you learn** - add new categories in future levels

## 📈 **Benefits for Your Goals:**

- **Goal #2**: Documents your learning progression
- **Goal #4**: Creates your personal reference library
- **Portfolio value**: Shows systematic thinking to hiring managers

This approach scales perfectly - start simple, add complexity as you grow!

Here are several ways to check for empty values across the entire dataset:

## **Quick One-Liners for Different Types of Empty:**

### **Check NaN/Null values:**
```python
df.isnull().sum().sum()  # Total count
df.isnull().sum()        # By column
```

### **Check empty strings (''):**
```python
(df == '').sum().sum()   # Total count
(df == '').sum()         # By column
```

### **Check empty spaces (' '):**
```python
(df == ' ').sum().sum()  # Total count
(df == ' ').sum()        # By column
```

### **Ultimate one-liner for all types:**
```python
print(f"NaN: {df.isnull().sum().sum()}, Empty: {(df == '').sum().sum()}, Spaces: {(df == ' ').sum().sum()}")
```

## **Most Comprehensive Check:**

```python
# Check every column for all types of empty values
for col in df.columns:
    nan_count = df[col].isnull().sum()
    empty_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
    space_count = (df[col] == ' ').sum() if df[col].dtype == 'object' else 0
    
    total = nan_count + empty_count + space_count
    if total > 0:
        print(f"{col}: {total} empty values")
```

## **Show Actual Problem Rows:**

```python
# Find and display rows with any empty values
problem_rows = df[df.isnull().any(axis=1) | (df == '').any(axis=1) | (df == ' ').any(axis=1)]
print(f"Rows with empty values: {len(problem_rows)}")
if len(problem_rows) > 0:
    print(problem_rows[['customerID']].head())  # Show first few problem customer IDs
```

**After your TotalCharges fix, you should see:**
- **NaN: 0**
- **Empty strings: 0** 
- **Spaces: 0**

If you see any non-zero counts, that tells you where you still have empty value issues!

---

># **#2: Keep a snippet library handy across projects/workspaces**

Here are three “professional and portable” ways to keep a snippet library handy across projects/workspaces on your Mac—ordered from most editor-native to most universal. Pick one or combine B+C for a killer setup.

# A) VS Code user snippets + built-in Sync (simple, editor-native)

**Best if you primarily use VS Code.**

1. Open Command Palette → “Preferences: Configure User Snippets”.
2. Create language or global snippets. Example (`python.json`):

```json
{
  "Py docstring": {
    "prefix": "pydoc",
    "body": [
      "\"\"\"${1:Summary}",
      "",
      "Args:",
      "    ${2:param}: ${3:desc}",
      "",
      "Returns:",
      "    ${4:type}: ${5:desc}",
      "\"\"\""
    ],
    "description": "Docstring scaffold"
  }
}
```

3. Turn on **Settings Sync** (Account icon → Turn On Sync). Your snippets follow you on any machine you sign into VS Code with.

**Pro tip (shared across repos):** keep your snippet JSON files in a Git repo and **symlink** them into VS Code’s snippets folder so you can version-control them:

```bash
# Paths (macOS)
VS_SNIPS="$HOME/Library/Application Support/Code/User/snippets"
mkdir -p ~/snippets/vscode
ln -s ~/snippets/vscode/python.json "$VS_SNIPS/python.json"
```

# B) Git + Markdown snippets + fzf CLI (portable, editor-agnostic)

**Best if you hop between tools (VS Code, Windsurf, terminals, notebooks).**

1. Make a repo:

```
~/snippets/
  python/
  sql/
  dbt/
  shell/
  README.md
```

Each snippet is a small `.md` with a clear title and a fenced code block.

2. Install tools (free):

```bash
brew install fzf ripgrep
```

3. Add a tiny **`snip`** helper to your `~/.zshrc`:

````zsh
# snip [pattern...] -- pick a snippet file interactively and copy its code to the clipboard.
#   pattern: all arguments are passed to ripgrep as one pattern; with no arguments "." is used.
snip() {
  local file
  # List the files under ~/snippets whose contents match the pattern, let fzf
  # choose one of them, and return early if that pipeline fails.
  file=$(rg -l --hidden --glob "!*.git/*" "${*:-.}" "$HOME/snippets" | fzf --prompt="Snip> ") || return
  # Copy the lines between ``` fence markers to the clipboard: every fenced
  # block in the file is included, and the fence lines themselves are dropped.
  awk '/^```/{f=!f; next} f' "$file" | pbcopy
  echo "üìã Copied from: $file"
}
````

Usage:

````bash
snip pandas groupby
# copies the code inside every ``` block of the chosen file
# ⌘+V (paste) in any editor/terminal
````

**Why this rocks:** works everywhere (terminal, notebooks, different editors), versioned with Git, searchable via fuzzy find. You can keep private snippets in a private repo.

# C) Text expander (Espanso) for instant boilerplate (cross-app)

**Great for short, frequently typed patterns.** Free & open-source.

```bash
brew install espanso
espanso start
espanso edit
```

Add triggers in `default.yml`:

```yaml
matches:
  - trigger: ":sqlsel"
    replace: |
      SELECT ${1:*}
      FROM ${2:table}
      WHERE ${3:cond};
    vars:
      - name: cursor
        type: cursor
  - trigger: ":dbtmodel"
    replace: |
      {{ config(materialized='table') }}
      WITH src AS (
          SELECT * FROM {{ ref('${1:stg_table}') }}
      )
      SELECT * FROM src;
    propagate_case: true
filters:
  apps:
    only: ["com.microsoft.VSCode"]  # limit expansions to VS Code
```

Now typing `:sqlsel` in VS Code expands into a SQL skeleton with tabstops—without touching your editor’s snippet system.

---

## What I recommend for you

Given you’re on macOS + zsh and bounce across projects:

* **Primary:** VS Code user snippets + Sync (A) for language-aware tab-trigger snippets.
* **Universal:** Git + `snip` (B) so you can search/paste any snippet in *any* workspace, terminal, or notebook.
* **Boilerplate:** Espanso (C) for lightning-fast expansions you want everywhere.

---

## Starter pack you can paste in today

**Repo skeleton**

```bash
mkdir -p ~/snippets/{python,sql,dbt,shell}
git init ~/snippets
```

**Example files**
`~/snippets/sql/select-where.md`

````md
# Select with WHERE & limit
```sql
SELECT ${1:cols}
FROM ${2:schema.table}
WHERE ${3:cond}
LIMIT ${4:100};
````

````

`~/snippets/python/pandas-null-check.md`
```md
# Null audit (top offenders)
```python
na = df.isna().sum().sort_values(ascending=False)
print(na[na>0].head(10))
````

````

**VS Code snippet (dbt model)**
`~/snippets/vscode/sql.json`
```json
{
  "dbt model": {
    "prefix": "dbtmodel",
    "body": [
      "{{ config(materialized='table') }}",
      "WITH src AS (",
      "  SELECT * FROM {{ ref('${1:stg_table}') }}",
      ")",
      "SELECT * FROM src;"
    ],
    "description": "dbt model scaffold"
  }
}
````

Then symlink it as shown above.

---

### Guardrails

* **Secrets:** never store tokens/keys in snippets; use placeholders like `${SECRET_NAME}`.
* **Licensing:** if you paste from blogs/StackOverflow, keep a note/link at the top of the snippet file.
* **Naming:** use a consistent prefix (e.g., `:sql*`, `:py*`) for Espanso triggers to avoid accidental expansion.

>If you want, tell me your top 10 snippet needs (Python/pandas, SQL/dbt, shell, GCP/AWS), and I’ll generate the files to drop into `~/snippets/` plus the matching VS Code/Espanso entries.

---

># **#3: Windsurf with VS Code–style snippets**

Yep—Windsurf plays nicely with **VS Code–style snippets**, plus you can layer on a git repo and a text expander for portability. Here’s a clean setup that works across projects/workspaces.

# 1) Built-in (VS Code–style) user & workspace snippets

* Open the **Command Palette → “Preferences: Configure User Snippets.”** Create a language file (e.g., `python.json`) or a **global** snippets file. Snippet format is the same as VS Code’s JSON (prefix/body/description). ([Arsturn][1], [Visual Studio Code][2])
* Bringing snippets from VS Code? On first run Windsurf lets you **import settings from VS Code/Cursor**, which includes snippets. ([DEV Community][3])
* For repo-scoped snippets, add a workspace file (e.g., `.vscode/snippets.code-snippets`) so they travel with the project in Git. (Same mechanism as VS Code.) ([Stack Overflow][4])

Example snippet body (works in Windsurf since it’s VS Code syntax):

```json
{
  "Docstring scaffold": {
    "prefix": "pydoc",
    "body": [
      "\"\"\"${1:Summary}",
      "",
      "Args:",
      "    ${2:param}: ${3:desc}",
      "",
      "Returns:",
      "    ${4:type}: ${5:desc}",
      "\"\"\""
    ],
    "description": "Python docstring"
  }
}
```

# 2) Manage snippets with extensions (Open VSX)

Windsurf uses the **Open VSX** marketplace, so install snippet helpers from there in the Extensions view:

* **Snippets Manager** – simple UI to create/edit snippets. ([open-vsx.org][5])
* **Snippets Viewer** – browse built-in, extension, and user snippets in a tree view. ([open-vsx.org][6])
* **Snippets Ranger** – create/select snippet files quickly. ([open-vsx.org][7])
  (If you’re used to VS Code’s marketplace, note the store difference—Windsurf surfaces Open VSX by default.) ([Reddit][8])

# 3) Keep them portable with Git (+ optional symlink)

* Put all snippet JSON (and markdown examples) in `~/snippets/` under Git.
* In Windsurf, open one of your snippet files and copy its on-disk path; then **symlink** your repo files to that location so edits are versioned automatically (works on Mac/Linux).
  This avoids guessing paths and keeps snippets consistent across workspaces.

# 4) Universal quick-paste option (works in any app)

If you want snippets outside the editor too:

* **Espanso** (free text expander): type `:sqlsel` → expands to a SQL skeleton anywhere.
* Or a tiny **fzf** picker in your terminal that copies code blocks from `~/snippets/*.md` to clipboard (great inside Windsurf’s integrated terminal).

# 5) Bonus: make snippets “searchable” by the AI

If you keep a **Google Doc** of your favorite patterns, Windsurf’s **Knowledge Base / docs context** can ingest docs for retrieval in chat. It’s not an inserter like snippets, but handy for “remind me of the dbt model scaffold” prompts. ([Windsurf Docs][9])

---

## A setup I’d recommend for you

1. **User & workspace snippets** for tab-trigger speed. ([Arsturn][1], [Visual Studio Code][2])
2. **Git repo** at `~/snippets/` + **symlink** to Windsurf’s snippet files so changes sync across workspaces.
3. Install **Snippets Manager** (UI), plus **Snippets Viewer** (browse). ([open-vsx.org][5])
4. Add **Espanso** for cross-app boilerplate (optional).
5. (Nice-to-have) A **Google Doc** of patterns connected to Windsurf’s knowledge base for quick AI recall. ([Windsurf Docs][9])

If you want, tell me your top 10 snippets (Python/pandas, SQL/dbt, shell/GCP) and I’ll generate the JSON files + a starter `~/snippets/` repo structure to drop in.

[1]: https://www.arsturn.com/blog/creating-custom-code-snippets-in-windsurf-for-reusability "Mastering Code Snippets in Windsurf: Reusability Techniques"
[2]: https://code.visualstudio.com/docs/editing/userdefinedsnippets?utm_source=chatgpt.com "Snippets in Visual Studio Code"
[3]: https://dev.to/proflead/this-ai-ide-can-code-for-you-windsurf-ai-full-tutorial-4p94 "This AI IDE Can Code For You ‚Äì Windsurf AI Full Tutorial - DEV Community"
[4]: https://stackoverflow.com/questions/29995863/how-to-add-custom-code-snippets-in-vscode?utm_source=chatgpt.com "How to add custom code snippets in VSCode?"
[5]: https://open-vsx.org/extension/zjffun/snippetsmanager?utm_source=chatgpt.com "Snippets Manager"
[6]: https://open-vsx.org/extension/RandomFractalsInc/snippets-viewer?utm_source=chatgpt.com "Snippets Viewer"
[7]: https://open-vsx.org/extension/robole/snippets-ranger?utm_source=chatgpt.com "Snippets Ranger"
[8]: https://www.reddit.com/r/RooCode/comments/1i6ns94/new_users_discovery_of_roocode_on_vscode/?utm_source=chatgpt.com "New users' discovery of RooCode on vscode marketplace ..."
[9]: https://docs.windsurf.com/context-awareness/overview?utm_source=chatgpt.com "Overview - Windsurf Docs"

---

># **#4: create a custom library that gives me suggestions in a jupyter cell or python file when I type # to call the library then call a specific snippet by name**

Yes—you can make a tiny IPython/Jupyter extension that:

* lets you write a **comment directive** like `# snip <name>` which gets **expanded into code** before execution, and
* gives you **Tab-completion suggestions** for snippet names after you type `# snip ` in a Jupyter cell (works in JupyterLab/Notebook and VS Code’s Jupyter).

Below is a minimal, drop-in “snippet library” you can keep in `~/.snippets/`. It uses a YAML file for your snippets, a transformer to expand `# snip ...` lines, and a custom completer to suggest names.

---

### 1) Snippet storage (YAML)

`~/.snippets/snippets.yaml`

```yaml
pandas.null_audit: |
  na = df.isna().sum().sort_values(ascending=False)
  print(na[na>0].head(10))
sql.select_where: |
  SELECT ${1:cols}
  FROM ${2:schema.table}
  WHERE ${3:cond}
  LIMIT ${4:100};
dbt.model_scaffold: |
  {{ config(materialized='table') }}
  WITH src AS (
    SELECT * FROM {{ ref('${1:stg_table}') }}
  )
  SELECT * FROM src;
```

> You can keep multiple YAML files under `~/.snippets/`—the loader will merge them.

---

### 2) The extension (`snipline.py`)

Put this file anywhere on your PYTHONPATH (e.g., `~/snipline/snipline.py`) and `pip install pyyaml` if you don’t have it.

```python
# snipline.py
from __future__ import annotations
import os, re, glob, time, yaml, textwrap
from pathlib import Path
from typing import Dict, Optional
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import Javascript, display

SNIP_DIR = Path(os.environ.get("SNIP_DIR", "~/.snippets")).expanduser()

class SnipStore:
    """In-memory cache of the snippets stored as YAML files in one directory.

    Each ``*.yml`` / ``*.yaml`` file is expected to hold a mapping of snippet
    name to snippet body. Files are merged in sorted path order, so on a
    duplicate name the later file wins.
    """

    def __init__(self, directory: Path):
        self.dir = Path(directory)
        self.cache: Dict[str, str] = {}      # snippet name -> snippet body
        self._mtimes: Dict[str, float] = {}  # YAML path -> mtime at last load
        self.reload()

    def _scan(self) -> Dict[str, float]:
        """Return ``{path: mtime}`` for the YAML files currently on disk."""
        found = {}
        for yml in sorted(glob.glob(str(self.dir / "*.y*ml"))):
            try:
                found[yml] = os.path.getmtime(yml)
            except OSError:
                # The file vanished between listing and stat; treat it as absent.
                continue
        return found

    def reload(self):
        """Discard the cache and re-read every YAML file in the directory."""
        self.cache.clear()
        self._mtimes.clear()
        for yml, mtime in self._scan().items():
            with open(yml, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            # Record the mtime observed *before* reading so that an edit made
            # while we were reading is still detected by the next check.
            self._mtimes[yml] = mtime
            if not isinstance(data, dict):
                import warnings

                warnings.warn(f"{yml}: expected a mapping of name -> snippet; file ignored")
                continue
            for k, v in data.items():
                self.cache[str(k)] = str(v)

    def maybe_reload(self):
        """Reload when any YAML file was added, removed, or modified."""
        if self._scan() != self._mtimes:
            self.reload()

    def names(self, prefix: str = ""):
        """Return the sorted snippet names that start with ``prefix``."""
        self.maybe_reload()
        return sorted([k for k in self.cache if k.startswith(prefix)])

    def get(self, name: str) -> Optional[str]:
        """Return the body of snippet ``name``, or ``None`` if it is unknown."""
        self.maybe_reload()
        return self.cache.get(name)

STORE = SnipStore(SNIP_DIR)

# --- Input transformer: replace lines like "# snip <name>" with the snippet text
# Only spaces/tabs are allowed between the tokens so a directive can never span lines.
_SNIP_LINE = re.compile(r"^[ \t]*#[ \t]*snip[ \t]+([A-Za-z0-9_.\-/]+)[ \t]*$", re.MULTILINE)

def _expand_snips(cell):
    """Expand every ``# snip <name>`` directive line into its snippet body.

    Args:
        cell: The cell source, either as one string or as a list of lines
            with their line endings (the form IPython hands to the functions
            in ``input_transformers_post``).

    Returns:
        The expanded source in the same form as the input. Each snippet is
        indented to match its directive line; an unknown name is replaced by
        a ``# [snip: '<name>' not found]`` comment.
    """
    def repl(m):
        name = m.group(1)
        directive = m.group(0)
        # The match starts at the beginning of the line, so the directive's
        # own leading whitespace is the indentation to preserve.
        lead = directive[: len(directive) - len(directive.lstrip(" \t"))]
        body = STORE.get(name)
        if body is None:
            # Leave a visible marker if missing
            return f"{lead}# [snip: '{name}' not found]"
        return textwrap.indent(body.rstrip("\n"), lead)

    if isinstance(cell, str):
        return _SNIP_LINE.sub(repl, cell)
    # List-of-lines input: expand on the joined text, then split it back.
    return _SNIP_LINE.sub(repl, "".join(cell)).splitlines(keepends=True)

# --- Tab completion for "# snip " or "%snip " prefixes
def _snip_completer(self, event):
    """Suggest snippet names while the user types ``# snip <partial>`` or ``%snip <partial>``.

    Args:
        self: The object IPython passes to a ``complete_command`` hook (unused).
        event: Completion event. ``text_until_cursor`` is used when present;
            otherwise ``line`` is used, cut at ``cursor_position`` if given.

    Returns:
        The sorted snippet names starting with the partial name, or ``[]``
        when the text before the cursor is not a snip directive.
    """
    typed = getattr(event, "text_until_cursor", None)
    if typed is None:
        line = getattr(event, "line", "")
        typed = line[:getattr(event, "cursor_position", len(line))]
    # capture partial name user is typing
    m = re.search(r"(#\s*snip|%snip)\s+([A-Za-z0-9_.\-/]*)$", typed)
    if not m:
        return []
    return STORE.names(m.group(2))

@magics_class
class SnipMagics(Magics):
    """Line magics for browsing the snippet store."""

    @line_magic("snip")
    def snip(self, line):
        """Usage: %snip <name>  -> prints the snippet body.

        With no argument, prints every known snippet name instead. When the
        classic Notebook ``Jupyter`` JavaScript object is available, the text
        of the selected cell is also replaced with the snippet body.
        """
        import json  # local import: only needed to build the JavaScript payload

        name = line.strip()
        if not name:
            print("\n".join(STORE.names()))
            return
        body = STORE.get(name)
        if body is None:
            print(f"[snip: '{name}' not found]")
            return
        # Print for copy/paste
        print(body)
        # Optional: try to replace current cell in classic Notebook UI.
        # json.dumps emits a quoted, escaped string that is also a valid
        # JavaScript string literal, so the body can be embedded directly.
        try:
            js = Javascript(
                "if (typeof Jupyter !== 'undefined') {\n"
                "  var cell = Jupyter.notebook.get_selected_cell();\n"
                "  cell.set_text(%s);\n"
                "}\n" % json.dumps(body)
            )
            display(js)
        except Exception:
            # Best effort only: the printed body above is the primary output.
            pass

def load_ipython_extension(ip):
    """Register the snip transformer, completer hook and magics on shell ``ip``."""
    # Expand "# snip ..." lines before execution. Guard against a second
    # registration so reloading the extension does not stack transformers.
    if _expand_snips not in ip.input_transformers_post:
        ip.input_transformers_post.append(_expand_snips)
    # Add completer (works in Jupyter and VS Code's Jupyter)
    try:
        ip.set_hook('complete_command', _snip_completer, re_key='.*')
    except Exception as exc:
        # Completion is a convenience; expansion and %snip still work without it.
        import warnings

        warnings.warn(f"snip: tab-completion hook not installed ({exc})")
    ip.register_magics(SnipMagics)

def unload_ipython_extension(ip):
    """Remove the snip input transformer from shell ``ip``.

    Best-effort cleanup: the ``%snip`` magic and the completion hook are left
    registered; new sessions usually just restart.
    """
    while _expand_snips in ip.input_transformers_post:
        ip.input_transformers_post.remove(_expand_snips)
```

---

### 3) Use it in Jupyter/VS Code’s Jupyter

In a cell:

```python
%load_ext snipline   # run once per session (or auto-load, see below)
```

Now you can:

* Type: `# snip pandas.null_audit` then press **Tab** after `# snip ` to see snippet **name suggestions**.
* Run the cell: that line is **replaced** with the snippet’s code before execution.
* Or: `%snip pandas.null_audit` to print/insert the snippet body.

**Example:**

```python
# snip pandas.null_audit
```

…expands to:

```python
na = df.isna().sum().sort_values(ascending=False)
print(na[na>0].head(10))
```

---

### 4) Auto-load every session (optional)

Create (or edit) `~/.ipython/profile_default/ipython_config.py`:

```python
c = get_config()
c.InteractiveShellApp.extensions = ['snipline']
```

---

### Notes & limits

* **Where suggestions work:** in **Jupyter contexts** (classic, Lab, or VS Code’s Jupyter) because completion comes from the kernel. In a plain `.py` file editor view, completion is handled by the editor’s language server, so you won’t get the `# snip` suggestions there. For `.py` editing, pair this with **editor snippets** (VS Code/Windsurf) or a text expander (Espanso).
* **Variables/placeholders:** You can embed `${1:placeholder}` in snippets for editor tabstops (useful when you paste into VS Code). The transformer leaves those as-is; your editor’s snippet engine will handle them if present.
* **Versioning:** Keep `~/.snippets/` under Git so you can sync across machines. The loader auto-reloads when YAML files change.

If you want, tell me your top 10 snippet names and I’ll generate the YAML ready to drop into `~/.snippets/snippets.yaml`.


----



----


Heck yes—let’s make **snipples** your cross-workspace snippet engine. Below are two files:

1. **`snipples.py`** – an IPython/Jupyter extension so you can type `# snipples <name>` in a cell and it expands before execution (with tab-complete for names).
2. **`~/.snipples/snippets.yaml`** – a chunky library covering the Telco Churn project end-to-end (ingest → EDA → cleaning → features → ML → explainability → dbt/SQL → ops).

---

## 1) `snipples.py` (drop anywhere on your PYTHONPATH)

```python
# snipples.py
from __future__ import annotations
import os, re, glob, yaml, textwrap
from pathlib import Path
from typing import Dict, Optional
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import Javascript, display

SNIPS_DIR = Path(os.environ.get("SNIPPLES_DIR", "~/.snipples")).expanduser()

class SnipStore:
    """In-memory cache of the snippets stored as YAML files in one directory.

    Each ``*.yml`` / ``*.yaml`` file is expected to hold a mapping of snippet
    name to snippet body. Files are merged in sorted path order, so on a
    duplicate name the later file wins.
    """

    def __init__(self, directory: Path):
        self.dir = Path(directory)
        self.cache: Dict[str, str] = {}      # snippet name -> snippet body
        self._mtimes: Dict[str, float] = {}  # YAML path -> mtime at last load
        self.reload()

    def _scan(self) -> Dict[str, float]:
        """Return ``{path: mtime}`` for the YAML files currently on disk."""
        found = {}
        for yml in sorted(self.dir.glob("*.y*ml")):
            try:
                found[str(yml)] = yml.stat().st_mtime
            except OSError:
                # The file vanished between listing and stat; treat it as absent.
                continue
        return found

    def reload(self):
        """Discard the cache and re-read every YAML file, creating the directory if needed."""
        self.cache.clear()
        self._mtimes.clear()
        self.dir.mkdir(parents=True, exist_ok=True)
        for yml, mtime in self._scan().items():
            with open(yml, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            # Record the mtime observed *before* reading so that an edit made
            # while we were reading is still detected by the next check.
            self._mtimes[yml] = mtime
            if not isinstance(data, dict):
                import warnings

                warnings.warn(f"{yml}: expected a mapping of name -> snippet; file ignored")
                continue
            for k, v in data.items():
                self.cache[str(k)] = str(v)

    def maybe_reload(self):
        """Reload when any YAML file was added, removed, or modified."""
        if self._scan() != self._mtimes:
            self.reload()

    def names(self, prefix: str = ""):
        """Return the sorted snippet names that start with ``prefix``."""
        self.maybe_reload()
        return sorted([k for k in self.cache if k.startswith(prefix)])

    def get(self, name: str) -> Optional[str]:
        """Return the body of snippet ``name``, or ``None`` if it is unknown."""
        self.maybe_reload()
        return self.cache.get(name)

STORE = SnipStore(SNIPS_DIR)

_SNIPLINE = re.compile(r"^[ \t]*#\s*snipples\s+([A-Za-z0-9_.\-/]+)[ \t]*$", re.MULTILINE)

def _expand_snips(cell: "str | list[str]") -> "str | list[str]":
    """Replace each ``# snipples <name>`` directive line with its snippet body.

    Accepts either the whole cell as one string or a list of lines (each with
    its line ending), which is what IPython 7+ passes to functions registered
    in ``input_transformers_post``. The result has the same kind as the input.

    The expanded body is indented with the directive line's own leading
    whitespace, so a directive inside an indented block stays inside it.
    Unknown names are replaced by a ``# [snipples: '<name>' not found]``
    comment at the same indentation.
    """
    def repl(m):
        # The match starts at the beginning of the directive line, so its
        # leading whitespace is part of the matched text itself.
        directive = m.group(0)
        prefix = directive[: len(directive) - len(directive.lstrip(" \t"))]
        name = m.group(1)
        body = STORE.get(name)
        if body is None:
            return f"{prefix}# [snipples: '{name}' not found]"
        return textwrap.indent(body.rstrip("\n"), prefix)

    if isinstance(cell, str):
        return _SNIPLINE.sub(repl, cell)
    # List-of-lines protocol: expand on the joined text, then split back.
    return _SNIPLINE.sub(repl, "".join(cell)).splitlines(keepends=True)

def _snipples_completer(self, event):
    """Suggest snippet names for a partially typed snipples directive.

    Looks at the text of ``event.line`` up to ``event.cursor_position``
    (the whole line when the event has no such attribute). If that text ends
    with ``# snipples <partial>`` or ``%snipples <partial>``, returns the
    stored names starting with ``<partial>``; otherwise returns ``[]``.
    ``self`` is not used here (presumably the shell supplied by IPython's
    hook dispatch — confirm against the IPython version in use).
    """
    text = getattr(event, "line", "")
    cursor = getattr(event, "cursor_position", len(text))
    before_cursor = text[:cursor]
    found = re.search(
        r"(#\s*snipples|%snipples)\s+([A-Za-z0-9_.\-/]*)$", before_cursor
    )
    if found is None:
        return []
    partial_name = found.group(2)
    return STORE.names(partial_name)

@magics_class
class SnipplesMagics(Magics):
    """IPython magics for browsing and inserting stored snippets."""

    @line_magic("snipples")
    def snipples(self, line):
        """%snipples <name>  -> prints/optionally inserts the snippet body

        With no argument, prints every known snippet name, one per line.
        With a name, prints the snippet body and, when the classic Notebook
        ``Jupyter`` JavaScript object is available in the frontend, also
        replaces the text of the selected cell with that body. An unknown
        name prints a "not found" notice.
        """
        name = line.strip()
        if not name:
            print("\n".join(STORE.names()))
            return
        body = STORE.get(name)
        if body is None:
            print(f"[snipples: '{name}' not found]")
            return
        print(body)
        try:
            import json  # local import: only needed for the JS payload

            # Embed the body as a JSON string literal, which is also a valid
            # JavaScript string literal and keeps quotes, newlines and
            # non-ASCII text intact. (Hex-encoding the body and decoding it
            # with atob(), which expects base64, produced garbage.)
            # "</" is escaped so the payload cannot close a <script> element.
            payload = json.dumps(body).replace("</", "<\\/")
            js = Javascript(
                "if (typeof Jupyter !== 'undefined') {\n"
                "  var cell = Jupyter.notebook.get_selected_cell();\n"
                "  cell.set_text(%s);\n"
                "}\n" % payload
            )
            display(js)
        except Exception:
            # Best effort: inserting into the cell is optional, the body has
            # already been printed.
            pass

def load_ipython_extension(ip):
    """Entry point for ``%load_ext snipples``.

    Registers the directive expander as a post input transformer, installs
    the name completer (best effort), and registers the ``%snipples`` magic.
    """
    # %reload_ext calls this again; avoid stacking the same transformer twice.
    if _expand_snips not in ip.input_transformers_post:
        ip.input_transformers_post.append(_expand_snips)
    try:
        ip.set_hook('complete_command', _snipples_completer, re_key='.*')
    except Exception:
        # Completion is a convenience; the extension still works without it.
        pass
    ip.register_magics(SnipplesMagics)

def unload_ipython_extension(ip):
    """Entry point for ``%unload_ext snipples``: stop expanding directives.

    Removes every registration of the directive expander from the shell's
    post input transformers. The completer hook and the ``%snipples`` magic
    are left in place.
    """
    transformers = getattr(ip, "input_transformers_post", [])
    while _expand_snips in transformers:
        transformers.remove(_expand_snips)
```

**Use:** in a Jupyter cell run once per session:

```python
%load_ext snipples
# then type:  # snipples telco.qa.null_audit   (Tab to see suggestions)
```

To autoload every time, add to `~/.ipython/profile_default/ipython_config.py`:

```python
c = get_config()
c.InteractiveShellApp.extensions = ['snipples']
```

---

## 2) `~/.snipples/snippets.yaml` (Telco Churn end-to-end)

Create the folder and file:

```bash
mkdir -p ~/.snipples
# paste the following into ~/.snipples/snippets.yaml
```

```yaml
# =========================
# Project scaffolding / setup
# =========================
proj.readme.telco: |
  # Telco Churn — Analytics & ML
  ## Stack
  - Python 3.11, pandas, scikit-learn, xgboost, catboost, imbalanced-learn, shap
  - BigQuery + dbt (analytics)
  - Great Expectations or Pandera (ingest DQ)
  - MLflow (experiment tracking)

  ## Structure
  .
  ├─ data/{raw,interim,processed}
  ├─ notebooks/
  ├─ src/telco/...
  ├─ models/ (dbt)
  └─ reports/

  ## Targets
  - Clean EDA + feature marts
  - Baseline + tree models
  - Explainability (SHAP)
  - Reproducible pipelines + tests

env.venv.setup: |
  python3 -m venv .venv
  source .venv/bin/activate
  python -m pip install --upgrade pip
  pip install pandas numpy scikit-learn imbalanced-learn xgboost catboost shap mlflow matplotlib seaborn pandera great-expectations python-dotenv pandas-gbq google-cloud-bigquery pyarrow

nb.header.imports: |
  import os, sys, math, json, textwrap, warnings
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
  from pathlib import Path
  warnings.filterwarnings("ignore")
  pd.set_option("display.max_columns", 100)
  plt.rcParams["figure.figsize"] = (10,5)

# =========================
# Ingest / IO
# =========================
ingest.csv.read_telco: |
  dtype_map = {
      "customerID": "string",
      "gender": "string",
      "SeniorCitizen": "Int64",
      "Partner": "string",
      "Dependents": "string",
      "tenure": "Int64",
      "PhoneService": "string",
      "MultipleLines": "string",
      "InternetService": "string",
      "OnlineSecurity": "string",
      "OnlineBackup": "string",
      "DeviceProtection": "string",
      "TechSupport": "string",
      "StreamingTV": "string",
      "StreamingMovies": "string",
      "Contract": "string",
      "PaperlessBilling": "string",
      "PaymentMethod": "string",
      "MonthlyCharges": "float64",
      "TotalCharges": "string",   # coercion later
      "Churn": "string"
  }
  df = pd.read_csv("data/raw/Telco-Customer-Churn.csv", dtype=dtype_map)
  print(df.shape)

ingest.bigquery.read_table: |
  # pip install pandas-gbq google-cloud-bigquery
  from pandas_gbq import read_gbq
  df = read_gbq("""
      SELECT * FROM `PROJECT.DATASET.telco_customers`
  """, project_id=os.environ.get("GCP_PROJECT"))

save.outputs.standard_paths: |
  Path("data/interim").mkdir(parents=True, exist_ok=True)
  Path("data/processed").mkdir(parents=True, exist_ok=True)
  df.to_csv("data/interim/telco_cleaned.csv", index=False)

# =========================
# Data Quality (ingest)
# =========================
qa.null_audit: |
  na = df.isna().sum().sort_values(ascending=False)
  print("Missing by column:")
  print(na[na>0])

qa.dup_pkey_check: |
  dups = df["customerID"].value_counts()
  print("Dup primary keys:", (dups > 1).sum())

qa.pandera.schema_telco: |
  import pandera as pa
  from pandera import Column, Check
  TelcoSchema = pa.DataFrameSchema({
      "customerID": Column(str, nullable=False),
      "tenure": Column(int, Check.in_range(0, 84), nullable=False),
      "MonthlyCharges": Column(float, Check.in_range(0, 200), nullable=False),
      "TotalCharges": Column(object, nullable=True),  # coerced later
      "Churn": Column(str, Check.isin(["Yes","No"]), nullable=False),
  })
  TelcoSchema.validate(df, lazy=True)

qa.business_rule_totalcharges_tenure: |
  bad = df[df["tenure"].fillna(0) > 0].copy()
  bad = bad[np.abs(pd.to_numeric(bad["TotalCharges"], errors="coerce") - bad["tenure"] * bad["MonthlyCharges"]) > 10]
  print(f"Rows outside $10 tolerance: {len(bad)}")

# =========================
# Cleaning / Imputation
# =========================
clean.totalcharges_fix: |
  df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].astype(str).str.strip(), errors="coerce")
  mask0 = df["TotalCharges"].isna() & (df["tenure"].fillna(0) == 0)
  df.loc[mask0, "TotalCharges"] = 0.0
  mask_other = df["TotalCharges"].isna() & df["tenure"].notna() & df["MonthlyCharges"].notna()
  df.loc[mask_other, "TotalCharges"] = df.loc[mask_other, "tenure"] * df.loc[mask_other, "MonthlyCharges"]
  df["TotalCharges"] = df["TotalCharges"].astype("float64")

clean.category_normalize: |
  yesno = ["Partner","Dependents","PhoneService","PaperlessBilling","Churn"]
  for c in yesno:
      df[c] = df[c].str.strip().str.title()
  df["MultipleLines"]   = df["MultipleLines"].str.strip().str.replace("No phone service","No Phone Service", regex=False)
  internet_cols = ["OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
  for c in internet_cols:
      df[c] = df[c].str.strip().str.replace("No internet service","No Internet Service", regex=False)

# =========================
# EDA
# =========================
eda.quick_overview: |
  print(df.shape)
  display(df.head())
  print(df.describe(include="all").T)

eda.target_balance: |
  ax = (df["Churn"].value_counts(normalize=True)*100).plot.bar()
  ax.set_title("Churn class balance (%)"); plt.show()

eda.corr_heatmap_numeric: |
  num = df.select_dtypes(include=["number"])
  sns.heatmap(num.corr(numeric_only=True), annot=False, linewidths=.5)
  plt.title("Numeric correlations"); plt.show()

eda.churn_rate_by_col_template: |
  col = "Contract"  # <- change me
  rate = (df.groupby(col)["Churn"].apply(lambda s: (s=="Yes").mean()).sort_values()*100)
  print(rate.round(2))

# =========================
# Feature Engineering
# =========================
feat.tenure_buckets: |
  bins = [-1, 0, 6, 12, 24, 48, 84, 999]
  labels = ["0","1-6","7-12","13-24","25-48","49-84","85+"]
  df["tenure_bucket"] = pd.cut(df["tenure"], bins=bins, labels=labels)

feat.boolean_target: |
  df["y"] = (df["Churn"].str.upper() == "YES").astype(int)

feat.split_train_test: |
  from sklearn.model_selection import train_test_split
  target = "y"
  y = df[target]
  feature_drop = ["customerID","Churn","y"]
  X = df.drop(columns=feature_drop, errors="ignore")
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# =========================
# Modeling – shared utilities
# =========================
model.columns_splitter: |
  num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
  cat_cols = X_train.select_dtypes(exclude=["number"]).columns.tolist()
  from sklearn.preprocessing import OneHotEncoder, StandardScaler
  from sklearn.compose import ColumnTransformer
  pre = ColumnTransformer(
      transformers=[
          ("num", StandardScaler(with_mean=False), num_cols),
          ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
      ],
      remainder="drop"
  )

model.metrics_helpers: |
  from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, confusion_matrix
  def print_metrics(y_true, prob, thr=0.5):
      y_pred = (prob >= thr).astype(int)
      print(f"ROC AUC: {roc_auc_score(y_true, prob):.4f}")
      print(f"PR AUC : {average_precision_score(y_true, prob):.4f}")
      print(f"F1     : {f1_score(y_true, y_pred):.4f}")
      print(confusion_matrix(y_true, y_pred))
      print(classification_report(y_true, y_pred, digits=3))

# =========================
# Baseline model (LogReg)
# =========================
model.baseline_logreg: |
  from sklearn.linear_model import LogisticRegression
  from sklearn.pipeline import Pipeline
  clf = Pipeline(steps=[
      ("pre", pre),
      ("lr", LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None))
  ])
  clf.fit(X_train, y_train)
  prob = clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

# =========================
# Tree models (XGBoost / CatBoost)
# =========================
model.xgboost_cv: |
  import xgboost as xgb
  from sklearn.model_selection import StratifiedKFold
  from sklearn.pipeline import Pipeline
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  clf = Pipeline(steps=[
      ("pre", pre),
      ("xgb", xgb.XGBClassifier(
          n_estimators=600, max_depth=6, learning_rate=0.05,
          subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
          tree_method="hist", reg_lambda=1.0, n_jobs=-1
      ))
  ])
  aucs = []
  for tr, va in skf.split(X_train, y_train):
      clf.fit(X_train.iloc[tr], y_train.iloc[tr])
      p = clf.predict_proba(X_train.iloc[va])[:,1]
      aucs.append(roc_auc_score(y_train.iloc[va], p))
  print("CV ROC AUC:", np.mean(aucs).round(4), "+/-", np.std(aucs).round(4))
  clf.fit(X_train, y_train)
  prob = clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

model.catboost_simple: |
  from catboost import CatBoostClassifier
  from sklearn.model_selection import train_test_split
  # For CatBoost you can pass categorical columns directly (works best on raw categories)
  Xc = df.drop(columns=["customerID","Churn","y"], errors="ignore")
  # Treat every non-numeric column as categorical. Testing dtype == "object" misses the
  # pandas "string" columns created by ingest.csv.read_telco and the "category"
  # tenure_bucket column, which would then reach CatBoost undeclared.
  cat_cols = Xc.select_dtypes(exclude=["number"]).columns.tolist()
  # Cast to plain Python strings so pd.NA / Categorical values are not passed through.
  Xc = Xc.astype({c: str for c in cat_cols})
  Xtr, Xte, ytr, yte = train_test_split(Xc, df["y"], test_size=0.2, stratify=df["y"], random_state=42)
  cat = CatBoostClassifier(
      depth=6, iterations=1500, learning_rate=0.03, loss_function="Logloss",
      eval_metric="AUC", verbose=200, random_seed=42, auto_class_weights="Balanced"
  )
  cat.fit(Xtr, ytr, cat_features=cat_cols, eval_set=(Xte, yte), use_best_model=True)
  prob = cat.predict_proba(Xte)[:,1]
  print_metrics(yte, prob)

# =========================
# Imbalance strategies (optional)
# =========================
imb.smote_pipeline_xgb: |
  from imblearn.pipeline import Pipeline as ImbPipeline
  from imblearn.over_sampling import SMOTE
  import xgboost as xgb
  imb_clf = ImbPipeline(steps=[
      ("pre", pre),
      ("smote", SMOTE(random_state=42)),
      ("xgb", xgb.XGBClassifier(
          n_estimators=500, max_depth=5, learning_rate=0.05,
          subsample=0.9, colsample_bytree=0.9, eval_metric="logloss", n_jobs=-1
      ))
  ])
  imb_clf.fit(X_train, y_train)
  prob = imb_clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

# =========================
# Threshold tuning & calibration
# =========================
eval.threshold_opt_pr: |
  from sklearn.metrics import precision_recall_curve
  pr, rc, thr = precision_recall_curve(y_test, prob)
  f = 2*pr*rc/(pr+rc+1e-9)
  best = np.nanargmax(f)
  print("Best F1 threshold:", thr[best].round(4), "F1:", f[best].round(4))

eval.calibration_plot: |
  from sklearn.calibration import calibration_curve
  prob_true, prob_pred = calibration_curve(y_test, prob, n_bins=10)
  plt.plot(prob_pred, prob_true, marker="o"); plt.plot([0,1],[0,1],"--")
  plt.title("Calibration"); plt.xlabel("Predicted"); plt.ylabel("Observed"); plt.show()

# =========================
# SHAP explainability
# =========================
exp.shap_tree: |
  import shap
  shap.initjs()
  # Works with tree-based models like XGBoost/CatBoost
  booster = clf.named_steps["xgb"] if "xgb" in dict(clf.steps) else None
  if booster is None:
      raise RuntimeError("This snippet expects a fitted Pipeline with step 'xgb'.")
  Xs = clf.named_steps["pre"].transform(X_test)
  explainer = shap.TreeExplainer(booster)
  shap_values = explainer.shap_values(Xs)
  shap.summary_plot(shap_values, Xs)

exp.shap_linear: |
  import shap
  linear = clf.named_steps.get("lr", None)
  if linear is None:
      raise RuntimeError("This snippet expects a fitted Pipeline with step 'lr'.")
  Xs = clf.named_steps["pre"].transform(X_test)
  explainer = shap.LinearExplainer(linear, Xs, feature_dependence="independent")
  shap_values = explainer.shap_values(Xs)
  shap.summary_plot(shap_values, Xs)

# =========================
# MLflow tracking (optional)
# =========================
mlflow.start_run_and_log: |
  import mlflow, mlflow.sklearn
  mlflow.set_experiment("telco-churn")
  with mlflow.start_run(run_name="xgb_baseline"):
      mlflow.log_params({"n_estimators":600,"max_depth":6,"lr":0.05})
      mlflow.log_metric("roc_auc", roc_auc_score(y_test, prob))
      mlflow.sklearn.log_model(clf, "model")

# =========================
# SQL (analysis & sanity)
# =========================
sql.churn_rate_by_segment: |
  SELECT Contract,
         ROUND(100 * AVG(CASE WHEN Churn='Yes' THEN 1 ELSE 0 END), 2) AS churn_pct,
         COUNT(*) AS n
  FROM analytics.telco_customers
  GROUP BY Contract
  ORDER BY churn_pct DESC;

sql.monthly_retention: |
  -- Requires start_date and end_date fields for each subscription
  -- The month spine steps by MONTH so it holds exactly one row per month.
  -- A daily spine truncated to month repeats each month once per day and
  -- multiplies every active_customers count by the number of days.
  WITH months AS (
    SELECT month
    FROM UNNEST(GENERATE_DATE_ARRAY('2017-01-01','2018-12-31', INTERVAL 1 MONTH)) AS month
  ),
  active AS (
    SELECT m.month,
           COUNTIF(t.start_date <= m.month AND (t.end_date IS NULL OR t.end_date >= m.month)) AS active_customers
    FROM months m CROSS JOIN analytics.telco_subscriptions t
    GROUP BY m.month
  )
  SELECT * FROM active ORDER BY month;

# =========================
# dbt scaffolds
# =========================
dbt.model_scaffold: |
  {{ config(materialized='table') }}
  WITH src AS (
    SELECT * FROM {{ ref('stg_telco_customers') }}
  )
  SELECT * FROM src;

dbt.schema_tests_min: |
  version: 2
  models:
    - name: stg_telco_customers
      columns:
        - name: customerID
          tests: [not_null, unique]
        - name: tenure
          tests:
            - dbt_expectations.expect_column_values_to_be_between:
                min_value: 0
                max_value: 84
        - name: Churn
          tests:
            - accepted_values:
                values: ['Yes','No']
      tests:
        - dbt_utils.expression_is_true:
            expression: "case when tenure = 0 then coalesce(TotalCharges,0) = 0 else true end"
        - dbt_utils.expression_is_true:
            expression: "abs(coalesce(TotalCharges,0) - coalesce(tenure,0)*coalesce(MonthlyCharges,0)) <= 10"
            where: "tenure > 0"

dbt.freshness_recency: |
  models:
    - name: stg_telco_customers
      tests:
        - dbt_utils.recency:
            field: ingested_at
            datepart: hour
            interval: 24

# =========================
# Reporting / export
# =========================
report.top_drivers_summary: |
  # After SHAP: simple text summary
  import numpy as np
  def top_features(shap_vals, k=10):
      imp = np.abs(shap_vals).mean(axis=0)
      idx = np.argsort(imp)[::-1][:k]
      return list(zip(idx, imp[idx]))
  print("Top features by mean |SHAP| (indices):")
  print(top_features(shap_values, 10))

export.predictions_csv: |
  from pathlib import Path
  out = X_test.copy()
  out["prob_churn"] = prob
  out["y_true"] = y_test.values
  # to_csv does not create missing folders, so make sure reports/ exists first
  Path("reports").mkdir(parents=True, exist_ok=True)
  out.to_csv("reports/predictions.csv", index=False)
  print("Saved reports/predictions.csv")
```

---

### Quick start flow (what to run in a fresh notebook)

```python
%load_ext snipples

# snipples nb.header.imports
# snipples ingest.csv.read_telco
# snipples qa.null_audit
# snipples clean.totalcharges_fix
# snipples clean.category_normalize
# snipples feat.boolean_target
# snipples feat.tenure_buckets
# snipples feat.split_train_test
# snipples model.columns_splitter
# snipples model.metrics_helpers
# snipples model.baseline_logreg       # => baseline
# snipples model.xgboost_cv            # => stronger model
# snipples eval.threshold_opt_pr
# snipples exp.shap_tree               # if using the XGB pipeline
# snipples export.predictions_csv
```

# Generate **workspace snippets** (VS Code/Windsurf JSON) 

    Here are two drop-in artifacts for Windsurf/VS Code:

---

# 1) Workspace snippets

Save as **`.vscode/snippets.code-snippets`** in your repo.

```json
{
  "Snipples: expand line": {
    "prefix": "snipples",
    "description": "Insert a # snipples directive (works in Jupyter cells via snipples.py)",
    "body": ["# snipples ${1:namespace.snippet_name}"]
  },

  "Imports: data analyst notebook header": {
    "prefix": "py-imports-notebook",
    "scope": "python",
    "description": "Standard imports for analysis notebooks",
    "body": [
      "import os, sys, math, json, textwrap, warnings",
      "import numpy as np",
      "import pandas as pd",
      "import matplotlib.pyplot as plt",
      "import seaborn as sns",
      "from pathlib import Path",
      "warnings.filterwarnings('ignore')",
      "pd.set_option('display.max_columns', 100)",
      "plt.rcParams['figure.figsize'] = (10,5)"
    ]
  },

  "IO: read Telco CSV with dtypes": {
    "prefix": "py-read-telco",
    "scope": "python",
    "description": "Read IBM Telco Churn CSV with explicit dtypes",
    "body": [
      "dtype_map = {",
      "  'customerID':'string','gender':'string','SeniorCitizen':'Int64','Partner':'string','Dependents':'string',",
      "  'tenure':'Int64','PhoneService':'string','MultipleLines':'string','InternetService':'string','OnlineSecurity':'string',",
      "  'OnlineBackup':'string','DeviceProtection':'string','TechSupport':'string','StreamingTV':'string','StreamingMovies':'string',",
      "  'Contract':'string','PaperlessBilling':'string','PaymentMethod':'string','MonthlyCharges':'float64','TotalCharges':'string','Churn':'string'",
      "}",
      "df = pd.read_csv('${1:data/raw/Telco-Customer-Churn.csv}', dtype=dtype_map)",
      "print(df.shape)"
    ]
  },

  "Clean: TotalCharges fix (tenure==0 => 0 else tenure*MonthlyCharges)": {
    "prefix": "py-clean-totalcharges",
    "scope": "python",
    "description": "Coerce TotalCharges to numeric and impute correctly",
    "body": [
      "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].astype(str).str.strip(), errors='coerce')",
      "mask0 = df['TotalCharges'].isna() & (df['tenure'].fillna(0) == 0)",
      "df.loc[mask0, 'TotalCharges'] = 0.0",
      "mask_other = df['TotalCharges'].isna() & df['tenure'].notna() & df['MonthlyCharges'].notna()",
      "df.loc[mask_other, 'TotalCharges'] = df.loc[mask_other, 'tenure'] * df.loc[mask_other, 'MonthlyCharges']",
      "df['TotalCharges'] = df['TotalCharges'].astype('float64')"
    ]
  },

  "EDA: churn class balance plot": {
    "prefix": "py-eda-balance",
    "scope": "python",
    "description": "Bar chart of churn class balance",
    "body": [
      "ax = (df['Churn'].value_counts(normalize=True)*100).plot.bar()",
      "ax.set_title('Churn class balance (%)'); plt.show()"
    ]
  },

  "Feature: target + tenure buckets": {
    "prefix": "py-feat-target-buckets",
    "scope": "python",
    "description": "Create y and tenure_bucket features",
    "body": [
      "df['y'] = (df['Churn'].str.upper() == 'YES').astype(int)",
      "bins = [-1,0,6,12,24,48,84,999]",
      "labels = ['0','1-6','7-12','13-24','25-48','49-84','85+']",
      "df['tenure_bucket'] = pd.cut(df['tenure'], bins=bins, labels=labels)"
    ]
  },

  "Split: train/test": {
    "prefix": "py-split",
    "scope": "python",
    "description": "Train/test split with drop of ID/label columns",
    "body": [
      "from sklearn.model_selection import train_test_split",
      "target = 'y'",
      "y = df[target]",
      "X = df.drop(columns=['customerID','Churn','y'], errors='ignore')",
      "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)"
    ]
  },

  "Preprocess: ColumnTransformer (num scale + OHE cat)": {
    "prefix": "py-pre-coltx",
    "scope": "python",
    "description": "StandardScaler for numeric and OneHotEncoder for categories",
    "body": [
      "num_cols = X_train.select_dtypes(include=['number']).columns.tolist()",
      "cat_cols = X_train.select_dtypes(exclude=['number']).columns.tolist()",
      "from sklearn.preprocessing import OneHotEncoder, StandardScaler",
      "from sklearn.compose import ColumnTransformer",
      "pre = ColumnTransformer([",
      "  ('num', StandardScaler(with_mean=false), num_cols),",
      "  ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=true), cat_cols)",
      "])"
    ]
  },

  "Model: Logistic Regression baseline": {
    "prefix": "py-model-logreg",
    "scope": "python",
    "description": "Baseline classifier pipeline + quick metrics",
    "body": [
      "from sklearn.pipeline import Pipeline",
      "from sklearn.linear_model import LogisticRegression",
      "from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, confusion_matrix",
      "def print_metrics(y_true, prob, thr=0.5):",
      "  y_pred = (prob >= thr).astype(int)",
      "  print(f'ROC AUC: {roc_auc_score(y_true, prob):.4f}')",
      "  print(f'PR AUC : {average_precision_score(y_true, prob):.4f}')",
      "  print(f'F1     : {f1_score(y_true, y_pred):.4f}')",
      "  print(confusion_matrix(y_true, y_pred))",
      "  print(classification_report(y_true, y_pred, digits=3))",
      "clf = Pipeline([('pre', pre), ('lr', LogisticRegression(max_iter=200, class_weight='balanced'))])",
      "clf.fit(X_train, y_train)",
      "prob = clf.predict_proba(X_test)[:,1]",
      "print_metrics(y_test, prob)"
    ]
  },

  "Model: XGBoost pipeline (hist)": {
    "prefix": "py-model-xgb",
    "scope": "python",
    "description": "XGBoost classifier with ColumnTransformer preprocessing",
    "body": [
      "import xgboost as xgb",
      "from sklearn.pipeline import Pipeline",
      "from sklearn.metrics import roc_auc_score",
      "clf = Pipeline([",
      "  ('pre', pre),",
      "  ('xgb', xgb.XGBClassifier(n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, eval_metric='logloss', tree_method='hist', n_jobs=-1))",
      "])",
      "clf.fit(X_train, y_train)",
      "prob = clf.predict_proba(X_test)[:,1]",
      "print_metrics(y_test, prob)"
    ]
  },

  "Explain: SHAP (tree models)": {
    "prefix": "py-shap-tree",
    "scope": "python",
    "description": "SHAP summary for tree-based model within a Pipeline",
    "body": [
      "import shap",
      "shap.initjs()",
      "booster = clf.named_steps.get('xgb')",
      "Xs = clf.named_steps['pre'].transform(X_test)",
      "explainer = shap.TreeExplainer(booster)",
      "shap_values = explainer.shap_values(Xs)",
      "shap.summary_plot(shap_values, Xs)"
    ]
  },

  "Quality: Pandera mini schema": {
    "prefix": "py-pandera-telco",
    "scope": "python",
    "description": "Quick Pandera schema to validate key columns",
    "body": [
      "import pandera as pa",
      "from pandera import Column, Check",
      "TelcoSchema = pa.DataFrameSchema({",
      "  'customerID': Column(str, nullable=False),",
      "  'tenure': Column(int, Check.in_range(0,84), nullable=False),",
      "  'MonthlyCharges': Column(float, Check.in_range(0,200), nullable=False),",
      "  'TotalCharges': Column(float, nullable=False),",
      "  'Churn': Column(str, Check.isin(['Yes','No']), nullable=False)",
      "})",
      "TelcoSchema.validate(df, lazy=True)"
    ]
  },

  "dbt: model scaffold": {
    "prefix": "dbt-model",
    "scope": "sql",
    "description": "dbt table model scaffold",
    "body": [
      "{{ config(materialized='table') }}",
      "WITH src AS (",
      "  SELECT * FROM {{ ref('${1:stg_telco_customers}') }}",
      ")",
      "SELECT * FROM src;"
    ]
  },

  "SQL: churn by segment": {
    "prefix": "sql-churn-seg",
    "scope": "sql",
    "description": "Segment churn rates by Contract",
    "body": [
      "SELECT Contract,",
      "       ROUND(100 * AVG(CASE WHEN Churn='Yes' THEN 1 ELSE 0 END), 2) AS churn_pct,",
      "       COUNT(*) AS n",
      "FROM ${1:analytics.telco_customers}",
      "GROUP BY Contract",
      "ORDER BY churn_pct DESC;"
    ]
  }
}
```

---
># Tiny **Makefile** to lint/test/run your dbt & notebooks?

# 2) Makefile

Save as **`Makefile`** at the repo root. It’s opinionated but tidy for AE/ML + dbt.

```make
# -------- Config --------
PY          ?= python3
VENV        ?= .venv
ACTIVATE    = . $(VENV)/bin/activate
REQS        ?= requirements.txt

DBT_TARGET  ?= dev
DBT_PROJ    ?= .           # path to dbt project (has dbt_project.yml)
RAW_CSV     ?= data/raw/Telco-Customer-Churn.csv

# -------- Helpers --------
.PHONY: help
help:
	@echo "Common targets:"
	@echo "  make venv           # create venv and install requirements"
	@echo "  make install        # install/upgrade packages in venv"
	@echo "  make lint           # ruff lint + format check"
	@echo "  make fmt            # ruff format"
	@echo "  make test           # run pytest"
	@echo "  make nb-run         # run notebooks with papermill (paramizable)"
	@echo "  make dbt-deps/run/test/freshness  # dbt workflow"
	@echo "  make qa-quick       # quick CSV sanity checks (row count, nulls, dup IDs)"
	@echo "  make clean          # remove build artifacts"

# -------- Environment --------
$(VENV):
	$(PY) -m venv $(VENV)
	$(ACTIVATE) && python -m pip install --upgrade pip

.PHONY: venv
venv: $(VENV) install

.PHONY: install
install:
	@if [ -f "$(REQS)" ]; then \
		$(ACTIVATE) && pip install -r $(REQS); \
	else \
		echo "No requirements.txt found; installing a sane default set..."; \
		$(ACTIVATE) && pip install pandas numpy scikit-learn imbalanced-learn xgboost catboost shap mlflow matplotlib seaborn pandera great-expectations jupyter papermill ruff pytest dbt-bigquery pandas-gbq google-cloud-bigquery pyarrow; \
	fi

# -------- Lint & Test --------
.PHONY: lint
lint:
	$(ACTIVATE) && ruff check .

.PHONY: fmt
fmt:
	$(ACTIVATE) && ruff format .

.PHONY: test
test:
	$(ACTIVATE) && pytest -q

# -------- Notebooks (papermill) --------
# Usage: make nb-run NB=notebooks/01_eda.ipynb OUT=reports/01_eda.out.ipynb
NB ?= notebooks/01_eda.ipynb
OUT ?= reports/01_eda.out.ipynb
.PHONY: nb-run
nb-run:
	$(ACTIVATE) && papermill $(NB) $(OUT)

# -------- Quick CSV QA (no dbt) --------
# The Python program is kept in a make variable and handed to the shell through
# the environment. A heredoc cannot be used here: its body lines are not
# tab-prefixed recipe lines, so make stops with "missing separator".
define QA_QUICK_PY
import pandas as pd
path = "$(RAW_CSV)"
df = pd.read_csv(path)
print("Shape:", df.shape)
print("Nulls (top 10):")
print(df.isna().sum().sort_values(ascending=False).head(10))
if "customerID" in df.columns:
    dup = (df["customerID"].value_counts() > 1).sum()
    print("Duplicate customerID:", dup)
endef
export QA_QUICK_PY

.PHONY: qa-quick
qa-quick:
	@echo "QA: $(RAW_CSV)"
	$(ACTIVATE) && python -c "$$QA_QUICK_PY"

# -------- dbt workflow --------
.PHONY: dbt-deps
dbt-deps:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt deps

.PHONY: dbt-run
dbt-run:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt run --target $(DBT_TARGET)

.PHONY: dbt-test
dbt-test:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt test --target $(DBT_TARGET)

.PHONY: dbt-freshness
dbt-freshness:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt source freshness --target $(DBT_TARGET)

# -------- Clean --------
.PHONY: clean
clean:
	rm -rf $(VENV) .pytest_cache .ruff_cache
	# "**" is not recursive in /bin/sh, so use find to reach nested __pycache__ dirs
	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
	find . -name "*.ipynb_checkpoints" -type d -prune -exec rm -rf {} +
```

---

## How to wire this up fast

1. **Put your extension + snippets in place**

```bash
mkdir -p ~/.snipples
# (Paste your large ~/.snipples/snippets.yaml we created earlier)
# Put snipples.py somewhere on PYTHONPATH, or `pip install -e .` if you package it.
```

In notebooks:

```python
%load_ext snipples
# then type:  # snipples model.xgboost_cv   (Tab to browse)
```

2. **Add the workspace snippets**

* Create `.vscode/snippets.code-snippets` and paste the JSON above.

3. **Makefile**

```bash
make venv
make qa-quick
make dbt-deps dbt-run dbt-test
```

---

># **generate a tiny `requirements.txt` and a `tasks.json` (Windsurf/VS Code tasks that wrap `make` targets so you can ⌘⇧B to run “DBT: test”)**



># CLAUDE SUGGESTION: CODE LIBRARY

Great question! Code organization and storage strategy is crucial for professional development and portfolio presentation. Here are the most appropriate approaches for different aspects of your projects:

## 🗂️ **Repository Structure & Organization**

### **Primary Code Storage: GitHub**
```
your-username/
├── telco-churn-analysis-level-0/
│   ├── README.md
│   ├── requirements.txt
│   ├── src/
│   │   ├── __init__.py
│   │   ├── data_loader.py
│   │   ├── analysis.py
│   │   └── utils.py
│   ├── notebooks/
│   │   └── exploratory_analysis.ipynb
│   ├── data/
│   │   └── telco.csv
│   ├── docs/
│   │   ├── methodology.md
│   │   └── findings.md
│   ├── tests/
│   │   └── test_analysis.py
│   └── results/
│       ├── figures/
│       └── reports/
```

### **Multi-Level Portfolio Structure**
```
data-analytics-journey/
├── README.md (Portfolio overview)
├── level-00-bare-bones/
├── level-01-structured-eda/
├── level-02-data-cleaning/
├── ...
├── level-10-enterprise/
├── docs/
│   ├── learning-progression.md
│   ├── technologies-mastered.md
│   └── business-impact.md
└── portfolio-assets/
    ├── presentations/
    ├── publications/
    └── certifications/
```

## 💻 **Code Workflow Storage Options**

### **1. GitHub (Primary - Essential)**
**Best for:**
- Version control and collaboration
- Portfolio presentation to employers
- Open source contributions
- CI/CD integration

**Structure:**
```
├── .github/
│   ├── workflows/
│   │   └── ci.yml
│   └── ISSUE_TEMPLATE/
├── src/
├── tests/
├── docs/
└── examples/
```

### **2. Jupyter Notebooks (Development)**
**Best for:**
- Exploratory analysis documentation
- Step-by-step methodology explanation
- Interactive presentations
- Educational content

**Organization:**
```
notebooks/
├── 01-data-exploration.ipynb
├── 02-data-cleaning.ipynb
├── 03-feature-engineering.ipynb
├── 04-modeling.ipynb
└── 05-results-analysis.ipynb
```

### **3. Professional Platforms**

#### **Kaggle Notebooks**
```python
# Kaggle-optimized structure
"""
TELCO CUSTOMER CHURN - LEVEL 0 ANALYSIS
=======================================
Author: Your Name
Created: Date
Updated: Date

Table of Contents:
1. Introduction & Business Problem
2. Data Loading & Initial Exploration  
3. Data Quality Assessment
4. Key Insights Discovery
5. Business Recommendations
6. Technical Implementation
7. Next Steps & Learning Journey
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')

# Load data
df = pd.read_csv('../input/telco-customer-churn/telco.csv')
```

#### **Google Colab (Shareable)**
```python
# Mount Google Drive for data persistence
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
!pip install -q pandas numpy matplotlib seaborn scikit-learn

# Professional notebook structure with clear sections
```

## 🏗️ **Advanced Code Organization (Level 5+)**

### **Modular Python Package Structure**
```
telco_churn_analysis/
├── setup.py
├── pyproject.toml
├── telco_churn/
│   ├── __init__.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── loader.py
│   │   ├── cleaner.py
│   │   └── validator.py
│   ├── features/
│   │   ├── __init__.py
│   │   ├── engineering.py
│   │   └── selection.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── training.py
│   │   └── evaluation.py
│   ├── visualization/
│   │   ├── __init__.py
│   │   └── plots.py
│   └── utils/
│       ├── __init__.py
│       └── helpers.py
├── tests/
├── docs/
├── examples/
└── data/
```

### **Configuration Management**
```python
# config.py
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Config:
    """Central project settings: input data locations, split parameters, output folders.

    All fields have defaults, so ``Config()`` is usable as-is; override any
    field by keyword, e.g. ``Config(DATA_DIR=Path("/mnt/data"))``.

    The derived ``data_path`` / ``processed_data_path`` properties join the
    data directory with the corresponding file name so callers do not have to
    assemble the path themselves.
    """

    # Data paths
    DATA_DIR: Path = Path("data")
    RAW_DATA_FILE: str = "telco.csv"
    PROCESSED_DATA_FILE: str = "telco_processed.csv"

    # Model parameters
    TEST_SIZE: float = 0.2
    RANDOM_STATE: int = 42

    # Output paths
    RESULTS_DIR: Path = Path("results")
    FIGURES_DIR: Path = Path("results/figures")
    REPORTS_DIR: Path = Path("results/reports")

    @property
    def data_path(self) -> Path:
        """Full path of the raw input file (``DATA_DIR / RAW_DATA_FILE``)."""
        # Path(...) tolerates DATA_DIR being passed as a plain string.
        return Path(self.DATA_DIR) / self.RAW_DATA_FILE

    @property
    def processed_data_path(self) -> Path:
        """Full path of the processed output file (``DATA_DIR / PROCESSED_DATA_FILE``)."""
        return Path(self.DATA_DIR) / self.PROCESSED_DATA_FILE
```

## 📊 **Documentation & Workflow Storage**

### **README.md Template**
````markdown
# Level X: Telco Customer Churn Analysis

## Quick Start
```bash
pip install -r requirements.txt
python src/analysis.py
```

## Project Structure
- `src/`: Core analysis code
- `notebooks/`: Jupyter exploration
- `data/`: Dataset files  
- `results/`: Output artifacts
- `tests/`: Unit tests
- `docs/`: Documentation

## Key Findings
- Finding 1: Contract type drives churn
- Finding 2: Payment method indicates engagement  
- Finding 3: Premium services show paradox

## Business Impact
- $2.4M annual revenue at risk identified
- 3 strategic initiatives recommended
- Clear ROI projections provided

## Technologies Used
- Python, Pandas, NumPy
- Matplotlib, Seaborn
- Scikit-learn (for advanced levels)

## Next Steps
See Level X+1 for progression...
````

### **Code Documentation Standards**
```python
def analyze_churn_by_contract(df: pd.DataFrame) -> pd.Series:
    """
    Compute the churn rate of each contract type.

    For every distinct value in the 'Contract' column, the rate is the
    fraction of rows in that group whose 'Churn' value equals 'Yes'.

    Args:
        df (pd.DataFrame): Customer dataset with 'Contract' and 'Churn' columns

    Returns:
        pd.Series: One churn rate per contract type (indexed by contract),
        ordered from the highest rate to the lowest
    """
    def _share_of_yes(churn_flags):
        # Mean of a boolean mask == proportion of 'Yes' entries in the group.
        return (churn_flags == 'Yes').mean()

    churn_by_group = df.groupby('Contract')['Churn']
    rates = churn_by_group.apply(_share_of_yes)
    return rates.sort_values(ascending=False)
```

## 🔧 **Workflow Management Tools**

### **For Individual Projects (Levels 0-3)**
- **Primary**: GitHub repository per level
- **Development**: Jupyter notebooks + VS Code
- **Documentation**: Markdown files + README
- **Sharing**: Kaggle notebooks for community

### **For Advanced Projects (Levels 4-7)**
- **Code**: Modular Python packages
- **Environment**: Poetry/Conda for dependency management
- **Testing**: pytest framework
- **Documentation**: Sphinx for API docs
- **Workflow**: GitHub Actions for CI/CD

### **For Enterprise Projects (Levels 8-10)**
- **Infrastructure**: Infrastructure as Code (Terraform)
- **Containerization**: Docker + Kubernetes
- **Orchestration**: Apache Airflow
- **Monitoring**: Logging and metrics collection
- **Deployment**: Blue-green deployment strategies

## 💡 **Best Practices by Level**

### **Level 0-2: Foundation**
```python
# Simple, clear, well-commented code
import pandas as pd

# Load data with error handling.
# Only the read itself sits inside the try, so an unrelated failure later on
# is never misreported as a missing file.
try:
    df = pd.read_csv('data/telco.csv')
except FileNotFoundError:
    print("‚ùå Data file not found")
    # SystemExit works in every interpreter; the bare exit() helper is only
    # injected by the `site` module and is meant for interactive sessions.
    raise SystemExit(1)
print(f"‚úÖ Data loaded: {df.shape}")

# Clear business-focused analysis.
# 'Churn' is treated as 'Yes'/'No' text (assumption: same encoding as the rest
# of this guide), so it is turned into a boolean flag first; averaging that
# flag per contract gives the churn rate. Calling .mean() on the raw text
# column would fail.
contract_churn = df['Churn'].eq('Yes').groupby(df['Contract']).mean()
print("üìä Churn by Contract Type:")
print(contract_churn.sort_values(ascending=False))
```

### **Level 5-7: Professional**
```python
# Type hints, docstrings, error handling
from typing import Tuple, Dict, Any
import logging

logger = logging.getLogger(__name__)

class ChurnAnalyzer:
    """Customer churn analysis driven by a ``Config`` object.

    The dataset is not read at construction time; call ``load_data`` to
    populate ``self.df``.
    """

    def __init__(self, config: Config):
        # Nothing is loaded yet: df stays None until load_data() succeeds.
        self.df = None
        self.config = config

    def load_data(self) -> None:
        """Read the CSV at ``config.data_path`` into ``self.df``.

        Success is logged at INFO level with the frame's shape. Any failure
        is logged at ERROR level and then re-raised unchanged to the caller.
        """
        try:
            loaded = pd.read_csv(self.config.data_path)
            self.df = loaded
            logger.info(f"Data loaded successfully: {self.df.shape}")
        except Exception as exc:
            logger.error(f"Failed to load data: {exc}")
            raise
```

### **Level 8-10: Enterprise**
```python
# Production-ready with monitoring
from dataclasses import dataclass
from abc import ABC, abstractmethod
import structlog

logger = structlog.get_logger()

@dataclass
class PredictionMetrics:
    """Four float measurements reported together with a prediction.

    No validation is performed here; the fields are stored exactly as given.
    """
    # NOTE(review): accuracy/precision/recall are presumably fractions in
    # [0, 1] — nothing in this class enforces or states that; confirm with
    # whatever produces these values.
    accuracy: float
    precision: float
    recall: float
    # Presumably elapsed time in milliseconds, going by the field name — confirm.
    latency_ms: float

class ChurnPredictor(ABC):
    """Abstract base class for churn prediction models.

    Subclasses must override ``predict``; because it is marked with
    ``@abstractmethod``, a subclass that does not override it cannot be
    instantiated.
    """
    
    @abstractmethod
    async def predict(self, customer_data: Dict) -> Tuple[float, PredictionMetrics]:
        """Predict churn probability with performance metrics.

        Declared as a coroutine, so callers must ``await`` it.

        Args:
            customer_data: Dictionary describing one customer. The expected
                keys are not defined in this class — see the concrete
                implementation.

        Returns:
            A ``(value, metrics)`` tuple: a float (the churn probability, per
            the summary line) and a ``PredictionMetrics`` instance.
        """
        pass
```

## 📈 **Storage Strategy Recommendations**

### **For Learning Journey Documentation**
1. **GitHub**: Primary portfolio repository
2. **Kaggle**: Public notebooks for community engagement  
3. **Personal Blog**: Medium/LinkedIn articles
4. **Cloud Storage**: AWS S3/Google Cloud for large datasets
5. **Local Development**: VS Code + Jupyter for daily work

### **For Professional Presentation**
1. **Clean GitHub repos** with professional README files
2. **Comprehensive documentation** explaining methodology
3. **Business-focused presentations** with clear ROI
4. **Code quality standards** with proper testing
5. **Progression documentation** showing skill development

The key is to maintain **consistency across platforms** while optimizing each for its specific audience and purpose. Your Level 0 should be simple and accessible, while your Level 10 should demonstrate enterprise-scale thinking and implementation.

---


"""
__init__.py is one of those little Python things that feels mysterious at first.

What __init__.py is
=====================

It’s a special file that tells Python:
“This folder should be treated as a package.”

Without it, Python may not recognize your src/ directory as importable.

In practice, having an __init__.py means you can do:

from src import utils


instead of hacking sys.path.

What goes inside?
==================

For your Level 2 project, you can keep it super simple:

# __init__.py is one of those little Python things that feels mysterious at first.

📦 What __init__.py is

It’s a special file that tells Python:
“This folder should be treated as a package.”

Without it, Python may not recognize your src/ directory as importable.

In practice, having an __init__.py means you can do:

from src import utils


instead of hacking sys.path.

üõ† What goes inside?

For your Level 2 project, you can keep it super simple:

# /src/__init__.py
"""
src package for Telco Churn Level 2 project.

Contains:
- utils: data loading, validation, EDA helpers
"""

from . import utils


This way, when you import src, you automatically expose src.utils.

ü§î Do you need it?

Python 3.3+ technically supports ‚Äúimplicit namespace packages‚Äù without __init__.py.

But most teams (and tools like pytest, linters, IDEs) still expect it.

So it‚Äôs best practice to always include one, even if it‚Äôs empty.

‚úÖ For your project

In your repo:

Telco-L2/
├─ src/
│  ├─ __init__.py   ← just add this file (can even be empty)
│  └─ utils.py


This will make your src folder behave like a proper Python package.

Do you want me to show you the two import styles (monolithic vs. package-style with __init__.py) side by side, so you see how this little file changes how you call utils from notebooks?

You're building a **personal code library** - this is exactly what professional data scientists do. Here are several effective approaches:

## 🎯 **My Recommendation: Start Simple with Markdown Files**

Create a folder structure like this:
```
my_code_library/
├── missing_values.md
├── data_types.md
├── anomaly_detection.md
├── data_cleaning.md
└── quick_exploration.md
```

## üìù **Why Markdown Files Work Best:**

1. **Easy to search** - Ctrl+F to find what you need
2. **Copy-paste ready** - Code blocks ready to use  
3. **Context included** - Notes on when/why to use each method
4. **GitHub friendly** - Looks professional in your portfolio

## üí° **Example: `missing_values.md`**

````markdown
# Missing Values Toolkit

## Quick Count (Most Used)
```python
df.isna().sum().sum()  # 21 characters, shortest to type
```

## Show Details When Needed  
```python
df.isnull().sum()[df.isnull().sum() > 0]  # Only non-zero columns
```

## When to Use Each:
- **Quick count**: Initial assessment, just need total
- **Detailed view**: When you find missing values and need to know where
````

## üöÄ **Start Your Library Today:**

1. **Create the folder structure**
2. **Start with one file** - `missing_values.md`
3. **Add your variations** from today's session
4. **Include context** - when to use each method
5. **Build as you learn** - add new categories in future levels

## üìà **Benefits for Your Goals:**

- **Goal #2**: Documents your learning progression
- **Goal #4**: Creates your personal reference library
- **Portfolio value**: Shows systematic thinking to hiring managers

This approach scales perfectly - start simple, add complexity as you grow!

Here are several ways to check for empty values across the entire dataset:

## **Quick One-Liners for Different Types of Empty:**

### **Check NaN/Null values:**
```python
df.isnull().sum().sum()  # Total count
df.isnull().sum()        # By column
```

### **Check empty strings (''):**
```python
(df == '').sum().sum()   # Total count
(df == '').sum()         # By column
```

### **Check empty spaces (' '):**
```python
(df == ' ').sum().sum()  # Total count
(df == ' ').sum()        # By column
```

### **Ultimate one-liner for all types:**
```python
print(f"NaN: {df.isnull().sum().sum()}, Empty: {(df == '').sum().sum()}, Spaces: {(df == ' ').sum().sum()}")
```

## **Most Comprehensive Check:**

```python
# Check every column for all types of empty values:
#   - NaN / None
#   - text cells that are empty or contain only whitespace ('', ' ', '   ', '\t', ...)
for col in df.columns:
    column = df[col]
    nan_count = column.isnull().sum()
    if column.dtype == 'object':
        # Drop NaN first so missing cells are not counted twice, then strip
        # so whitespace-only cells of ANY length count as blank — not just
        # the exact strings '' and ' '.
        blank_count = column.dropna().astype(str).str.strip().eq('').sum()
    else:
        blank_count = 0

    total = nan_count + blank_count
    if total > 0:
        print(f"{col}: {total} empty values")
```

## **Show Actual Problem Rows:**

```python
# Find and display rows with any empty values
problem_rows = df[df.isnull().any(axis=1) | (df == '').any(axis=1) | (df == ' ').any(axis=1)]
print(f"Rows with empty values: {len(problem_rows)}")
if len(problem_rows) > 0:
    print(problem_rows[['customerID']].head())  # Show first few problem customer IDs
```

**After your TotalCharges fix, you should see:**
- **NaN: 0**
- **Empty strings: 0** 
- **Spaces: 0**

If you see any non-zero counts, that tells you where you still have empty value issues!

---

># **#2: Keep a snippet library handy across projects/workspaces**

Here are three “professional and portable” ways to keep a snippet library handy across projects/workspaces on your Mac—ordered from most editor-native to most universal. Pick one or combine B+C for a killer setup.

# A) VS Code user snippets + built-in Sync (simple, editor-native)

**Best if you primarily use VS Code.**

1. Open Command Palette ‚Üí ‚ÄúPreferences: Configure User Snippets‚Äù.
2. Create language or global snippets. Example (`python.json`):

```json
{
  "Py docstring": {
    "prefix": "pydoc",
    "body": [
      "\"\"\"${1:Summary}",
      "",
      "Args:",
      "    ${2:param}: ${3:desc}",
      "",
      "Returns:",
      "    ${4:type}: ${5:desc}",
      "\"\"\""
    ],
    "description": "Docstring scaffold"
  }
}
```

3. Turn on **Settings Sync** (Account icon ‚Üí Turn On Sync). Your snippets follow you on any machine you sign into VS Code with.

**Pro tip (shared across repos):** keep your snippet JSON files in a Git repo and **symlink** them into VS Code‚Äôs snippets folder so you can version-control them:

```bash
# Paths (macOS)
VS_SNIPS="$HOME/Library/Application Support/Code/User/snippets"
mkdir -p ~/snippets/vscode
ln -s ~/snippets/vscode/python.json "$VS_SNIPS/python.json"
```

# B) Git + Markdown snippets + fzf CLI (portable, editor-agnostic)

**Best if you hop between tools (VS Code, Windsurf, terminals, notebooks).**

1. Make a repo:

```
~/snippets/
  python/
  sql/
  dbt/
  shell/
  README.md
```

Each snippet is a small `.md` with a clear title and a fenced code block.

2. Install tools (free):

```bash
brew install fzf ripgrep
```

3. Add a tiny **`snip`** helper to your `~/.zshrc`:

````zsh
snip() {
  local file
  file=$(rg -l --hidden --glob "!*.git/*" "${*:-.}" "$HOME/snippets" | fzf --prompt="Snip> ") || return
  # copy the code block contents to clipboard
  awk '/^```/{f=!f; next} f' "$file" | pbcopy
  echo "üìã Copied from: $file"
}
````

Usage:

````bash
snip pandas groupby
# copies the code inside the first ``` block from the chosen file
# ‚åò+V (paste) in any editor/terminal
````

**Why this rocks:** works everywhere (terminal, notebooks, different editors), versioned with Git, searchable via fuzzy find. You can keep private snippets in a private repo.

# C) Text expander (Espanso) for instant boilerplate (cross-app)

**Great for short, frequently typed patterns.** Free & open-source.

```bash
brew install espanso
espanso start
espanso edit
```

Add triggers in `default.yml`:

```yaml
matches:
  - trigger: ":sqlsel"
    replace: |
      SELECT ${1:*}
      FROM ${2:table}
      WHERE ${3:cond};
    vars:
      - name: cursor
        type: cursor
  - trigger: ":dbtmodel"
    replace: |
      {{ config(materialized='table') }}
      WITH src AS (
          SELECT * FROM {{ ref('${1:stg_table}') }}
      )
      SELECT * FROM src;
    propagate_case: true
filters:
  apps:
    only: ["com.microsoft.VSCode"]  # limit expansions to VS Code
```

Now typing `:sqlsel` in VS Code expands into a SQL skeleton with tabstops‚Äîwithout touching your editor‚Äôs snippet system.

---

## What I recommend for you

Given you‚Äôre on macOS + zsh and bounce across projects:

* **Primary:** VS Code user snippets + Sync (A) for language-aware tab-trigger snippets.
* **Universal:** Git + `snip` (B) so you can search/paste any snippet in *any* workspace, terminal, or notebook.
* **Boilerplate:** Espanso (C) for lightning-fast expansions you want everywhere.

---

## Starter pack you can paste in today

**Repo skeleton**

```bash
mkdir -p ~/snippets/{python,sql,dbt,shell}
git init ~/snippets
```

**Example files**
`~/snippets/sql/select-where.md`

````md
# Select with WHERE & limit
```sql
SELECT ${1:cols}
FROM ${2:schema.table}
WHERE ${3:cond}
LIMIT ${4:100};
```
````

`~/snippets/python/pandas-null-check.md`

````md
# Null audit (top offenders)
```python
na = df.isna().sum().sort_values(ascending=False)
print(na[na>0].head(10))
```
````


**VS Code snippet (dbt model)**
`~/snippets/vscode/sql.json`
```json
{
  "dbt model": {
    "prefix": "dbtmodel",
    "body": [
      "{{ config(materialized='table') }}",
      "WITH src AS (",
      "  SELECT * FROM {{ ref('${1:stg_table}') }}",
      ")",
      "SELECT * FROM src;"
    ],
    "description": "dbt model scaffold"
  }
}
````

Then symlink it as shown above.

---

### Guardrails

* **Secrets:** never store tokens/keys in snippets; use placeholders like `${SECRET_NAME}`.
* **Licensing:** if you paste from blogs/StackOverflow, keep a note/link at the top of the snippet file.
* **Naming:** use a consistent prefix (e.g., `:sql*`, `:py*`) for Espanso triggers to avoid accidental expansion.

>If you want, tell me your top 10 snippet needs (Python/pandas, SQL/dbt, shell, GCP/AWS), and I‚Äôll generate the files to drop into `~/snippets/` plus the matching VS Code/Espanso entries.

---

># **#3: Windsurf with VS Code‚Äìstyle snippets**

Yep‚ÄîWindsurf plays nicely with **VS Code‚Äìstyle snippets**, plus you can layer on a git repo and a text expander for portability. Here‚Äôs a clean setup that works across projects/workspaces.

# 1) Built-in (VS Code‚Äìstyle) user & workspace snippets

* Open the **Command Palette ‚Üí ‚ÄúPreferences: Configure User Snippets.‚Äù** Create a language file (e.g., `python.json`) or a **global** snippets file. Snippet format is the same as VS Code‚Äôs JSON (prefix/body/description). ([Arsturn][1], [Visual Studio Code][2])
* Bringing snippets from VS Code? On first run Windsurf lets you **import settings from VS Code/Cursor**, which includes snippets. ([DEV Community][3])
* For repo-scoped snippets, add a workspace file (e.g., `.vscode/snippets.code-snippets`) so they travel with the project in Git. (Same mechanism as VS Code.) ([Stack Overflow][4])

Example snippet body (works in Windsurf since it‚Äôs VS Code syntax):

```json
{
  "Docstring scaffold": {
    "prefix": "pydoc",
    "body": [
      "\"\"\"${1:Summary}",
      "",
      "Args:",
      "    ${2:param}: ${3:desc}",
      "",
      "Returns:",
      "    ${4:type}: ${5:desc}",
      "\"\"\""
    ],
    "description": "Python docstring"
  }
}
```

# 2) Manage snippets with extensions (Open VSX)

Windsurf uses the **Open VSX** marketplace, so install snippet helpers from there in the Extensions view:

* **Snippets Manager** ‚Äì simple UI to create/edit snippets. ([open-vsx.org][5])
* **Snippets Viewer** ‚Äì browse built-in, extension, and user snippets in a tree view. ([open-vsx.org][6])
* **Snippets Ranger** ‚Äì create/select snippet files quickly. ([open-vsx.org][7])
  (If you‚Äôre used to VS Code‚Äôs marketplace, note the store difference‚ÄîWindsurf surfaces Open VSX by default.) ([Reddit][8])

# 3) Keep them portable with Git (+ optional symlink)

* Put all snippet JSON (and markdown examples) in `~/snippets/` under Git.
* In Windsurf, open one of your snippet files and copy its on-disk path; then **symlink** your repo files to that location so edits are versioned automatically (works on Mac/Linux).
  This avoids guessing paths and keeps snippets consistent across workspaces.

# 4) Universal quick-paste option (works in any app)

If you want snippets outside the editor too:

* **Espanso** (free text expander): type `:sqlsel` ‚Üí expands to a SQL skeleton anywhere.
* Or a tiny **fzf** picker in your terminal that copies code blocks from `~/snippets/*.md` to clipboard (great inside Windsurf‚Äôs integrated terminal).

# 5) Bonus: make snippets ‚Äúsearchable‚Äù by the AI

If you keep a **Google Doc** of your favorite patterns, Windsurf‚Äôs **Knowledge Base / docs context** can ingest docs for retrieval in chat. It‚Äôs not an inserter like snippets, but handy for ‚Äúremind me of the dbt model scaffold‚Äù prompts. ([Windsurf Docs][9])

---

## A setup I‚Äôd recommend for you

1. **User & workspace snippets** for tab-trigger speed. ([Arsturn][1], [Visual Studio Code][2])
2. **Git repo** at `~/snippets/` + **symlink** to Windsurf‚Äôs snippet files so changes sync across workspaces.
3. Install **Snippets Manager** (UI), plus **Snippets Viewer** (browse). ([open-vsx.org][5])
4. Add **Espanso** for cross-app boilerplate (optional).
5. (Nice-to-have) A **Google Doc** of patterns connected to Windsurf‚Äôs knowledge base for quick AI recall. ([Windsurf Docs][9])

If you want, tell me your top 10 snippets (Python/pandas, SQL/dbt, shell/GCP) and I‚Äôll generate the JSON files + a starter `~/snippets/` repo structure to drop in.

[1]: https://www.arsturn.com/blog/creating-custom-code-snippets-in-windsurf-for-reusability "Mastering Code Snippets in Windsurf: Reusability Techniques"
[2]: https://code.visualstudio.com/docs/editing/userdefinedsnippets?utm_source=chatgpt.com "Snippets in Visual Studio Code"
[3]: https://dev.to/proflead/this-ai-ide-can-code-for-you-windsurf-ai-full-tutorial-4p94 "This AI IDE Can Code For You ‚Äì Windsurf AI Full Tutorial - DEV Community"
[4]: https://stackoverflow.com/questions/29995863/how-to-add-custom-code-snippets-in-vscode?utm_source=chatgpt.com "How to add custom code snippets in VSCode?"
[5]: https://open-vsx.org/extension/zjffun/snippetsmanager?utm_source=chatgpt.com "Snippets Manager"
[6]: https://open-vsx.org/extension/RandomFractalsInc/snippets-viewer?utm_source=chatgpt.com "Snippets Viewer"
[7]: https://open-vsx.org/extension/robole/snippets-ranger?utm_source=chatgpt.com "Snippets Ranger"
[8]: https://www.reddit.com/r/RooCode/comments/1i6ns94/new_users_discovery_of_roocode_on_vscode/?utm_source=chatgpt.com "New users' discovery of RooCode on vscode marketplace ..."
[9]: https://docs.windsurf.com/context-awareness/overview?utm_source=chatgpt.com "Overview - Windsurf Docs"

---

># **#4: Create a custom library that gives me suggestions in a Jupyter cell or Python file when I type # to call the library, then call a specific snippet by name**

Yes—you can make a tiny IPython/Jupyter extension that:

* lets you write a **comment directive** like `# snip <name>` which gets **expanded into code** before execution, and
* gives you **Tab-completion suggestions** for snippet names after you type `# snip ` in a Jupyter cell (works in JupyterLab/Notebook and VS Code‚Äôs Jupyter).

Below is a minimal, drop-in ‚Äúsnippet library‚Äù you can keep in `~/.snippets/`. It uses a YAML file for your snippets, a transformer to expand `# snip ...` lines, and a custom completer to suggest names.

---

### 1) Snippet storage (YAML)

`~/.snippets/snippets.yaml`

```yaml
pandas.null_audit: |
  na = df.isna().sum().sort_values(ascending=False)
  print(na[na>0].head(10))
sql.select_where: |
  SELECT ${1:cols}
  FROM ${2:schema.table}
  WHERE ${3:cond}
  LIMIT ${4:100};
dbt.model_scaffold: |
  {{ config(materialized='table') }}
  WITH src AS (
    SELECT * FROM {{ ref('${1:stg_table}') }}
  )
  SELECT * FROM src;
```

> You can keep multiple YAML files under `~/.snippets/`‚Äîthe loader will merge them.

---

### 2) The extension (`snipline.py`)

Put this file anywhere on your PYTHONPATH (e.g., `~/snipline/snipline.py`) and `pip install pyyaml` if you don‚Äôt have it.

```python
# snipline.py
from __future__ import annotations
import os, re, glob, time, yaml, textwrap
from pathlib import Path
from typing import Dict, Optional
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import Javascript, display

SNIP_DIR = Path(os.environ.get("SNIP_DIR", "~/.snippets")).expanduser()

class SnipStore:
    """In-memory cache of named snippets loaded from YAML files in one directory.

    Every ``*.yml`` / ``*.yaml`` file directly inside ``directory`` must hold
    a top-level mapping of snippet name -> snippet text. Files are merged in
    sorted path order, so on a name clash the later file wins.

    The cache refreshes itself lazily: each lookup compares the directory's
    current ``{path: mtime}`` snapshot with the one taken at the last load,
    so edited, newly added and deleted files are all picked up.
    """

    def __init__(self, directory: Path):
        self.dir = Path(directory)
        self.cache: Dict[str, str] = {}
        self._mtimes: Dict[str, float] = {}
        self.reload()

    def _scan(self) -> Dict[str, float]:
        """Return ``{path: mtime}`` for every YAML file currently present."""
        found: Dict[str, float] = {}
        for yml in sorted(glob.glob(str(self.dir / "*.y*ml"))):
            try:
                found[yml] = os.path.getmtime(yml)
            except OSError:
                # File vanished between listing and stat; treat as absent.
                continue
        return found

    def reload(self):
        """Re-read every YAML file and rebuild the cache from scratch.

        Raises:
            ValueError: if a file's top-level YAML value is not a mapping.
        """
        snapshot = self._scan()
        loaded: Dict[str, str] = {}
        for yml in snapshot:
            with open(yml, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            if not isinstance(data, dict):
                raise ValueError(
                    f"{yml}: expected a top-level mapping of snippet name -> text"
                )
            for k, v in data.items():
                loaded[str(k)] = str(v)
        # Mutate in place so existing references to .cache/._mtimes stay valid.
        self.cache.clear()
        self.cache.update(loaded)
        self._mtimes.clear()
        self._mtimes.update(snapshot)

    def maybe_reload(self):
        """Reload only if the set of files or any modification time changed."""
        if self._scan() != self._mtimes:
            self.reload()

    def names(self, prefix: str = ""):
        """Return the sorted snippet names that start with ``prefix``."""
        self.maybe_reload()
        return sorted(k for k in self.cache if k.startswith(prefix))

    def get(self, name: str) -> Optional[str]:
        """Return the snippet text for ``name``, or ``None`` if unknown."""
        self.maybe_reload()
        return self.cache.get(name)

STORE = SnipStore(SNIP_DIR)

# --- Input transformer: replace lines like "# snip <name>" with the snippet text
_SNIP_LINE = re.compile(r"^[ \t]*#\s*snip\s+([A-Za-z0-9_.\-/]+)[ \t]*$", re.MULTILINE)

def _expand_snips(cell):
    """Replace every ``# snip <name>`` directive line with its snippet body.

    Args:
        cell: Either the whole cell as one string, or a list of lines (each
            normally ending in a newline). The list form is what IPython 7+
            passes to functions registered in ``input_transformers_post``.

    Returns:
        The expanded source in the same form it was given: a string for
        string input, a list of lines for list input.

    Each expanded snippet is indented to match its directive line. An unknown
    name is replaced by a visible ``# [snip: '<name>' not found]`` comment.
    """
    if not isinstance(cell, str):
        # Line-list protocol: expand on the joined text, then split back
        # while keeping the line endings.
        return _expand_snips("".join(cell)).splitlines(keepends=True)

    def repl(m):
        directive = m.group(0)
        # The pattern starts at the beginning of the line, so the leading
        # whitespace of the match IS the directive's own indentation. Reading
        # it from the text before the match would give the previous line's
        # indent (and fail when the directive is the first line).
        ind = directive[: len(directive) - len(directive.lstrip(" \t"))]
        name = m.group(1)
        body = STORE.get(name)
        if body is None:
            # Leave a visible marker if missing
            return f"{ind}# [snip: '{name}' not found]"
        return textwrap.indent(body.rstrip("\n"), ind)

    return _SNIP_LINE.sub(repl, cell)

# --- Tab completion for "# snip " or "%snip " prefixes
def _snip_completer(self, event):
    """Completion hook: suggest snippet names after ``# snip `` or ``%snip ``.

    Reads ``event.line`` (and ``event.cursor_position`` when the event has
    one; otherwise the whole line is used). If the text up to the cursor ends
    with a snip directive followed by a partial name, returns every stored
    snippet name starting with that partial name; otherwise returns ``[]``.
    """
    text = getattr(event, "line", "")
    cursor = getattr(event, "cursor_position", len(text))
    typed = re.search(r"(#\s*snip|%snip)\s+([A-Za-z0-9_.\-/]*)$", text[:cursor])
    return STORE.names(typed.group(2)) if typed else []

@magics_class
class SnipMagics(Magics):
    """IPython magics for browsing and inserting stored snippets."""

    @line_magic("snip")
    def snip(self, line):
        """Usage: %snip <name>  -> prints the snippet body.

        With no name, prints every known snippet name (one per line).
        In the classic Notebook UI (where a global ``Jupyter`` object exists)
        the current cell's text is additionally replaced with the snippet;
        in other front ends that step is a no-op.
        """
        # Local import: only this method needs it.
        import base64

        name = line.strip()
        if not name:
            print("\n".join(STORE.names()))
            return
        body = STORE.get(name)
        if body is None:
            print(f"[snip: '{name}' not found]")
            return
        # Print for copy/paste
        print(body)
        # JavaScript's atob() decodes base64, so the payload must be base64
        # (not hex). Base64 output is also safe to embed in a quoted JS string.
        payload = base64.b64encode(body.encode("utf-8")).decode("ascii")
        # Optional: try to replace current cell in classic Notebook UI
        try:
            js = Javascript("""
                if (typeof Jupyter !== 'undefined') {
                  var cell = Jupyter.notebook.get_selected_cell();
                  var raw = atob('%s');
                  var bytes = Uint8Array.from(raw, function (c) { return c.charCodeAt(0); });
                  cell.set_text(new TextDecoder('utf-8').decode(bytes));
                }
            """ % payload)
            display(js)
        except Exception:
            # Deliberately best-effort: the snippet was already printed above,
            # so a front end that cannot display JavaScript loses nothing.
            pass

def load_ipython_extension(ip):
    """Entry point for ``%load_ext``: install the transformer, completer and magic.

    Args:
        ip: The running IPython shell instance.
    """
    # Expand "# snip ..." lines before execution. Guard against a second
    # registration so loading the extension again does not stack duplicates.
    if _expand_snips not in ip.input_transformers_post:
        ip.input_transformers_post.append(_expand_snips)
    # Add completer for snippet names. Completion is a convenience, so a
    # failure here is reported but never blocks the rest of the extension.
    # (The previous fallback inserted a matcher that took the wrong arguments
    # and always returned no matches, so it could not provide suggestions.)
    try:
        ip.set_hook('complete_command', _snip_completer, re_key='.*')
    except Exception as exc:
        import warnings
        warnings.warn(
            f"snipline: snippet-name tab completion is unavailable ({exc})",
            RuntimeWarning,
        )
    ip.register_magics(SnipMagics)

def unload_ipython_extension(ip):
    """Entry point for ``%unload_ext`` / ``%reload_ext``: undo what can be undone.

    Removes the snippet-expansion transformer so that reloading the extension
    does not leave several copies installed. The completer hook and the magic
    are left in place; new sessions usually just restart.
    """
    try:
        ip.input_transformers_post.remove(_expand_snips)
    except (AttributeError, ValueError):
        # Shell has no such list, or the transformer was never installed /
        # already removed — nothing to clean up.
        pass
```

---

### 3) Use it in Jupyter/VS Code‚Äôs Jupyter

In a cell:

```python
%load_ext snipline   # run once per session (or auto-load, see below)
```

Now you can:

* Type: `# snip pandas.null_audit` then press **Tab** after `# snip ` to see snippet **name suggestions**.
* Run the cell: that line is **replaced** with the snippet‚Äôs code before execution.
* Or: `%snip pandas.null_audit` to print/insert the snippet body.

**Example:**

```python
# snip pandas.null_audit
```

‚Ä¶expands to:

```python
na = df.isna().sum().sort_values(ascending=False)
print(na[na>0].head(10))
```

---

### 4) Auto-load every session (optional)

Create (or edit) `~/.ipython/profile_default/ipython_config.py`:

```python
c = get_config()
c.InteractiveShellApp.extensions = ['snipline']
```

---

### Notes & limits

* **Where suggestions work:** in **Jupyter contexts** (classic, Lab, or VS Code’s Jupyter) because completion comes from the kernel. In a plain `.py` file editor view, completion is handled by the editor’s language server, so you won’t get the `# snip` suggestions there. For `.py` editing, pair this with **editor snippets** (VS Code/Windsurf) or a text expander (Espanso).
* **Variables/placeholders:** You can embed `${1:placeholder}` in snippets for editor tabstops (useful when you paste into VS Code). The transformer leaves those as-is; your editor’s snippet engine will handle them if present.
* **Versioning:** Keep `~/.snippets/` under Git so you can sync across machines. The loader auto-reloads when YAML files change.

If you want, tell me your top 10 snippet names and I‚Äôll generate the YAML ready to drop into `~/.snippets/snippets.yaml`.


----



----


Heck yes—let’s make **snipples** your cross-workspace snippet engine. Below are two files:

1. **`snipples.py`** – an IPython/Jupyter extension so you can type `# snipples <name>` in a cell and it expands before execution (with tab-complete for names).
2. **`~/.snipples/snippets.yaml`** – a chunky library covering the Telco Churn project end-to-end (ingest → EDA → cleaning → features → ML → explainability → dbt/SQL → ops).

---

## 1) `snipples.py` (drop anywhere on your PYTHONPATH)

```python
# snipples.py
from __future__ import annotations
import os, re, glob, yaml, textwrap
from pathlib import Path
from typing import Dict, Optional
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import Javascript, display

SNIPS_DIR = Path(os.environ.get("SNIPPLES_DIR", "~/.snipples")).expanduser()

class SnipStore:
    """In-memory cache of named snippets loaded from YAML files in one directory.

    Every ``*.yml`` / ``*.yaml`` file directly inside ``directory`` must hold
    a top-level mapping of snippet name -> snippet text. Files are merged in
    sorted path order, so on a name clash the later file wins. The directory
    is created on (re)load if it does not exist yet.

    The cache refreshes itself lazily: each lookup compares the directory's
    current ``{path: mtime}`` snapshot with the one taken at the last load,
    so edited, newly added and deleted files are all picked up.
    """

    def __init__(self, directory: Path):
        self.dir = Path(directory)
        self.cache: Dict[str, str] = {}
        self._mtimes: Dict[str, float] = {}
        self.reload()

    def _scan(self) -> Dict[str, float]:
        """Return ``{path: mtime}`` for every YAML file currently present."""
        found: Dict[str, float] = {}
        for yml in sorted(self.dir.glob("*.y*ml")):
            try:
                found[str(yml)] = yml.stat().st_mtime
            except OSError:
                # File vanished between listing and stat; treat as absent.
                continue
        return found

    def reload(self):
        """Re-read every YAML file and rebuild the cache from scratch.

        Raises:
            ValueError: if a file's top-level YAML value is not a mapping.
        """
        self.dir.mkdir(parents=True, exist_ok=True)
        snapshot = self._scan()
        loaded: Dict[str, str] = {}
        for yml in snapshot:
            with open(yml, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            if not isinstance(data, dict):
                raise ValueError(
                    f"{yml}: expected a top-level mapping of snippet name -> text"
                )
            for k, v in data.items():
                loaded[str(k)] = str(v)
        # Mutate in place so existing references to .cache/._mtimes stay valid.
        self.cache.clear()
        self.cache.update(loaded)
        self._mtimes.clear()
        self._mtimes.update(snapshot)

    def maybe_reload(self):
        """Reload only if the set of files or any modification time changed."""
        if self._scan() != self._mtimes:
            self.reload()

    def names(self, prefix: str = ""):
        """Return the sorted snippet names that start with ``prefix``."""
        self.maybe_reload()
        return sorted(k for k in self.cache if k.startswith(prefix))

    def get(self, name: str) -> Optional[str]:
        """Return the snippet text for ``name``, or ``None`` if unknown."""
        self.maybe_reload()
        return self.cache.get(name)

STORE = SnipStore(SNIPS_DIR)

_SNIPLINE = re.compile(r"^[ \t]*#\s*snipples\s+([A-Za-z0-9_.\-/]+)[ \t]*$", re.MULTILINE)

def _expand_snips(cell):
    """Replace every ``# snipples <name>`` directive line with its snippet body.

    Args:
        cell: Either the whole cell as one string, or a list of lines (each
            normally ending in a newline). The list form is what IPython 7+
            passes to functions registered in ``input_transformers_post``.

    Returns:
        The expanded source in the same form it was given: a string for
        string input, a list of lines for list input.

    Each expanded snippet is indented to match its directive line. An unknown
    name is replaced by a visible ``# [snipples: '<name>' not found]`` comment.
    """
    if not isinstance(cell, str):
        # Line-list protocol: expand on the joined text, then split back
        # while keeping the line endings.
        return _expand_snips("".join(cell)).splitlines(keepends=True)

    def repl(m):
        directive = m.group(0)
        # The pattern starts at the beginning of the line, so the leading
        # whitespace of the match IS the directive's own indentation. The
        # text before the match ends at the previous line, whose indent is
        # unrelated.
        ind = directive[: len(directive) - len(directive.lstrip(" \t"))]
        name = m.group(1)
        body = STORE.get(name)
        if body is None:
            return f"{ind}# [snipples: '{name}' not found]"
        return textwrap.indent(body.rstrip("\n"), ind)

    return _SNIPLINE.sub(repl, cell)

def _snipples_completer(self, event):
    line = getattr(event, "line", "")
    cur = getattr(event, "cursor_position", len(line))
    m = re.search(r"(#\s*snipples|%snipples)\s+([A-Za-z0-9_.\-/]*)$", line[:cur])
    if not m:
        return []
    return STORE.names(m.group(2))

@magics_class
class SnipplesMagics(Magics):
    """IPython magics that expose the snippet store."""

    @line_magic("snipples")
    def snipples(self, line):
        """%snipples <name>  -> prints/optionally inserts the snippet body

        Without a name, prints all known snippet names, one per line. With a
        name, prints the snippet body and additionally emits JavaScript that
        replaces the text of the selected cell when a classic-Notebook
        ``Jupyter`` object is available in the front end.
        """
        import base64  # local import: not part of the module's import block

        name = line.strip()
        if not name:
            print("\n".join(STORE.names()))
            return
        body = STORE.get(name)
        if body is None:
            print(f"[snipples: '{name}' not found]")
            return
        print(body)
        # atob() decodes base64 (not hex), and yields one character per byte,
        # so the bytes are decoded as UTF-8 explicitly to keep non-ASCII text intact.
        payload = base64.b64encode(body.encode("utf-8")).decode("ascii")
        try:
            js = Javascript("""
                if (typeof Jupyter !== 'undefined') {
                  var cell = Jupyter.notebook.get_selected_cell();
                  var bytes = Uint8Array.from(atob('%s'), function (c) { return c.charCodeAt(0); });
                  cell.set_text(new TextDecoder('utf-8').decode(bytes));
                }
            """ % payload)
            display(js)
        except Exception:
            # Cell insertion is best-effort; the body has already been printed.
            pass

def load_ipython_extension(ip):
    ip.input_transformers_post.append(_expand_snips)
    try:
        ip.set_hook('complete_command', _snipples_completer, re_key='.*')
    except Exception:
        pass
    ip.register_magics(SnipplesMagics)

def unload_ipython_extension(ip):
    pass
```

**Use:** in a Jupyter cell run once per session:

```python
%load_ext snipples
# then type:  # snipples qa.null_audit   (Tab to see suggestions)
```

To autoload every time, add to `~/.ipython/profile_default/ipython_config.py`:

```python
c = get_config()
c.InteractiveShellApp.extensions = ['snipples']
```

---

## 2) `~/.snipples/snippets.yaml` (Telco Churn end-to-end)

Create the folder and file:

```bash
mkdir -p ~/.snipples
# paste the following into ~/.snipples/snippets.yaml
```

```yaml
# =========================
# Project scaffolding / setup
# =========================
proj.readme.telco: |
  # Telco Churn — Analytics & ML
  ## Stack
  - Python 3.11, pandas, scikit-learn, xgboost, catboost, imbalanced-learn, shap
  - BigQuery + dbt (analytics)
  - Great Expectations or Pandera (ingest DQ)
  - MLflow (experiment tracking)

  ## Structure
  .
  ├─ data/{raw,interim,processed}
  ├─ notebooks/
  ├─ src/telco/...
  ├─ models/ (dbt)
  └─ reports/

  ## Targets
  - Clean EDA + feature marts
  - Baseline + tree models
  - Explainability (SHAP)
  - Reproducible pipelines + tests

env.venv.setup: |
  python3 -m venv .venv
  source .venv/bin/activate
  python -m pip install --upgrade pip
  pip install pandas numpy scikit-learn imbalanced-learn xgboost catboost shap mlflow matplotlib seaborn pandera great-expectations python-dotenv pandas-gbq google-cloud-bigquery pyarrow

nb.header.imports: |
  import os, sys, math, json, textwrap, warnings
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
  from pathlib import Path
  warnings.filterwarnings("ignore")
  pd.set_option("display.max_columns", 100)
  plt.rcParams["figure.figsize"] = (10,5)

# =========================
# Ingest / IO
# =========================
ingest.csv.read_telco: |
  dtype_map = {
      "customerID": "string",
      "gender": "string",
      "SeniorCitizen": "Int64",
      "Partner": "string",
      "Dependents": "string",
      "tenure": "Int64",
      "PhoneService": "string",
      "MultipleLines": "string",
      "InternetService": "string",
      "OnlineSecurity": "string",
      "OnlineBackup": "string",
      "DeviceProtection": "string",
      "TechSupport": "string",
      "StreamingTV": "string",
      "StreamingMovies": "string",
      "Contract": "string",
      "PaperlessBilling": "string",
      "PaymentMethod": "string",
      "MonthlyCharges": "float64",
      "TotalCharges": "string",   # coercion later
      "Churn": "string"
  }
  df = pd.read_csv("data/raw/Telco-Customer-Churn.csv", dtype=dtype_map)
  print(df.shape)

ingest.bigquery.read_table: |
  # pip install pandas-gbq google-cloud-bigquery
  from pandas_gbq import read_gbq
  df = read_gbq("""
      SELECT * FROM `PROJECT.DATASET.telco_customers`
  """, project_id=os.environ.get("GCP_PROJECT"))

save.outputs.standard_paths: |
  Path("data/interim").mkdir(parents=True, exist_ok=True)
  Path("data/processed").mkdir(parents=True, exist_ok=True)
  df.to_csv("data/interim/telco_cleaned.csv", index=False)

# =========================
# Data Quality (ingest)
# =========================
qa.null_audit: |
  na = df.isna().sum().sort_values(ascending=False)
  print("Missing by column:")
  print(na[na>0])

qa.dup_pkey_check: |
  dups = df["customerID"].value_counts()
  print("Dup primary keys:", (dups > 1).sum())

qa.pandera.schema_telco: |
  import pandera as pa
  from pandera import Column, Check
  TelcoSchema = pa.DataFrameSchema({
      "customerID": Column(str, nullable=False),
      "tenure": Column(int, Check.in_range(0, 84), nullable=False),
      "MonthlyCharges": Column(float, Check.in_range(0, 200), nullable=False),
      "TotalCharges": Column(object, nullable=True),  # coerced later
      "Churn": Column(str, Check.isin(["Yes","No"]), nullable=False),
  })
  TelcoSchema.validate(df, lazy=True)

qa.business_rule_totalcharges_tenure: |
  bad = df[df["tenure"].fillna(0) > 0].copy()
  bad = bad[np.abs(pd.to_numeric(bad["TotalCharges"], errors="coerce") - bad["tenure"] * bad["MonthlyCharges"]) > 10]
  print(f"Rows outside $10 tolerance: {len(bad)}")

# =========================
# Cleaning / Imputation
# =========================
clean.totalcharges_fix: |
  df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].astype(str).str.strip(), errors="coerce")
  mask0 = df["TotalCharges"].isna() & (df["tenure"].fillna(0) == 0)
  df.loc[mask0, "TotalCharges"] = 0.0
  mask_other = df["TotalCharges"].isna() & df["tenure"].notna() & df["MonthlyCharges"].notna()
  df.loc[mask_other, "TotalCharges"] = df.loc[mask_other, "tenure"] * df.loc[mask_other, "MonthlyCharges"]
  df["TotalCharges"] = df["TotalCharges"].astype("float64")

clean.category_normalize: |
  yesno = ["Partner","Dependents","PhoneService","PaperlessBilling","Churn"]
  for c in yesno:
      df[c] = df[c].str.strip().str.title()
  df["MultipleLines"]   = df["MultipleLines"].str.strip().str.replace("No phone service","No Phone Service", regex=False)
  internet_cols = ["OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
  for c in internet_cols:
      df[c] = df[c].str.strip().str.replace("No internet service","No Internet Service", regex=False)

# =========================
# EDA
# =========================
eda.quick_overview: |
  print(df.shape)
  display(df.head())
  print(df.describe(include="all").T)

eda.target_balance: |
  ax = (df["Churn"].value_counts(normalize=True)*100).plot.bar()
  ax.set_title("Churn class balance (%)"); plt.show()

eda.corr_heatmap_numeric: |
  num = df.select_dtypes(include=["number"])
  sns.heatmap(num.corr(numeric_only=True), annot=False, linewidths=.5)
  plt.title("Numeric correlations"); plt.show()

eda.churn_rate_by_col_template: |
  col = "Contract"  # <- change me
  rate = (df.groupby(col)["Churn"].apply(lambda s: (s=="Yes").mean()).sort_values()*100)
  print(rate.round(2))

# =========================
# Feature Engineering
# =========================
feat.tenure_buckets: |
  bins = [-1, 0, 6, 12, 24, 48, 84, 999]
  labels = ["0","1-6","7-12","13-24","25-48","49-84","85+"]
  df["tenure_bucket"] = pd.cut(df["tenure"], bins=bins, labels=labels)

feat.boolean_target: |
  df["y"] = (df["Churn"].str.upper() == "YES").astype(int)

feat.split_train_test: |
  from sklearn.model_selection import train_test_split
  target = "y"
  y = df[target]
  feature_drop = ["customerID","Churn","y"]
  X = df.drop(columns=feature_drop, errors="ignore")
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# =========================
# Modeling – shared utilities
# =========================
model.columns_splitter: |
  num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
  cat_cols = X_train.select_dtypes(exclude=["number"]).columns.tolist()
  from sklearn.preprocessing import OneHotEncoder, StandardScaler
  from sklearn.compose import ColumnTransformer
  pre = ColumnTransformer(
      transformers=[
          ("num", StandardScaler(with_mean=False), num_cols),
          ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
      ],
      remainder="drop"
  )

model.metrics_helpers: |
  from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, confusion_matrix
  def print_metrics(y_true, prob, thr=0.5):
      y_pred = (prob >= thr).astype(int)
      print(f"ROC AUC: {roc_auc_score(y_true, prob):.4f}")
      print(f"PR AUC : {average_precision_score(y_true, prob):.4f}")
      print(f"F1     : {f1_score(y_true, y_pred):.4f}")
      print(confusion_matrix(y_true, y_pred))
      print(classification_report(y_true, y_pred, digits=3))

# =========================
# Baseline model (LogReg)
# =========================
model.baseline_logreg: |
  from sklearn.linear_model import LogisticRegression
  from sklearn.pipeline import Pipeline
  clf = Pipeline(steps=[
      ("pre", pre),
      ("lr", LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None))
  ])
  clf.fit(X_train, y_train)
  prob = clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

# =========================
# Tree models (XGBoost / CatBoost)
# =========================
model.xgboost_cv: |
  import xgboost as xgb
  # Imported here so the snippet does not depend on model.metrics_helpers for roc_auc_score
  from sklearn.metrics import roc_auc_score
  from sklearn.model_selection import StratifiedKFold
  from sklearn.pipeline import Pipeline
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  clf = Pipeline(steps=[
      ("pre", pre),
      ("xgb", xgb.XGBClassifier(
          n_estimators=600, max_depth=6, learning_rate=0.05,
          subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
          tree_method="hist", reg_lambda=1.0, n_jobs=-1
      ))
  ])
  # Cross-validate on the training split only; the test split stays untouched until the end
  aucs = []
  for tr, va in skf.split(X_train, y_train):
      clf.fit(X_train.iloc[tr], y_train.iloc[tr])
      p = clf.predict_proba(X_train.iloc[va])[:,1]
      aucs.append(roc_auc_score(y_train.iloc[va], p))
  print("CV ROC AUC:", np.mean(aucs).round(4), "+/-", np.std(aucs).round(4))
  # Refit on the full training split, then score the held-out test split
  clf.fit(X_train, y_train)
  prob = clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

model.catboost_simple: |
  from catboost import CatBoostClassifier
  from sklearn.model_selection import train_test_split
  # For CatBoost you can pass categorical columns directly (works best on raw categories)
  Xc = df.drop(columns=["customerID","Churn","y"], errors="ignore").copy()
  # Select "everything that is not numeric" instead of dtype == object:
  # ingest.csv.read_telco loads text columns as pandas "string" dtype and
  # feat.tenure_buckets adds a "category" column, neither of which is object.
  cat_cols = Xc.select_dtypes(exclude=["number"]).columns.tolist()
  # Categorical features must be plain strings without missing values
  Xc[cat_cols] = Xc[cat_cols].astype("object").fillna("missing").astype(str)
  Xtr, Xte, ytr, yte = train_test_split(Xc, df["y"], test_size=0.2, stratify=df["y"], random_state=42)
  cat = CatBoostClassifier(
      depth=6, iterations=1500, learning_rate=0.03, loss_function="Logloss",
      eval_metric="AUC", verbose=200, random_seed=42, auto_class_weights="Balanced"
  )
  # NOTE: the test split doubles as eval_set for best-iteration selection,
  # so the metrics printed below are optimistic; use a separate validation split for reporting.
  cat.fit(Xtr, ytr, cat_features=cat_cols, eval_set=(Xte, yte), use_best_model=True)
  prob = cat.predict_proba(Xte)[:,1]
  print_metrics(yte, prob)

# =========================
# Imbalance strategies (optional)
# =========================
imb.smote_pipeline_xgb: |
  from imblearn.pipeline import Pipeline as ImbPipeline
  from imblearn.over_sampling import SMOTE
  import xgboost as xgb
  imb_clf = ImbPipeline(steps=[
      ("pre", pre),
      ("smote", SMOTE(random_state=42)),
      ("xgb", xgb.XGBClassifier(
          n_estimators=500, max_depth=5, learning_rate=0.05,
          subsample=0.9, colsample_bytree=0.9, eval_metric="logloss", n_jobs=-1
      ))
  ])
  imb_clf.fit(X_train, y_train)
  prob = imb_clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

# =========================
# Threshold tuning & calibration
# =========================
eval.threshold_opt_pr: |
  from sklearn.metrics import precision_recall_curve
  pr, rc, thr = precision_recall_curve(y_test, prob)
  f = 2*pr*rc/(pr+rc+1e-9)
  best = np.nanargmax(f)
  print("Best F1 threshold:", thr[best].round(4), "F1:", f[best].round(4))

eval.calibration_plot: |
  from sklearn.calibration import calibration_curve
  prob_true, prob_pred = calibration_curve(y_test, prob, n_bins=10)
  plt.plot(prob_pred, prob_true, marker="o"); plt.plot([0,1],[0,1],"--")
  plt.title("Calibration"); plt.xlabel("Predicted"); plt.ylabel("Observed"); plt.show()

# =========================
# SHAP explainability
# =========================
exp.shap_tree: |
  import shap
  shap.initjs()
  # Works with tree-based models like XGBoost/CatBoost
  booster = clf.named_steps["xgb"] if "xgb" in dict(clf.steps) else None
  if booster is None:
      raise RuntimeError("This snippet expects a fitted Pipeline with step 'xgb'.")
  Xs = clf.named_steps["pre"].transform(X_test)
  explainer = shap.TreeExplainer(booster)
  shap_values = explainer.shap_values(Xs)
  shap.summary_plot(shap_values, Xs)

exp.shap_linear: |
  import shap
  # Expects `clf` to be the fitted Pipeline from model.baseline_logreg (steps "pre" and "lr")
  linear = clf.named_steps.get("lr", None)
  if linear is None:
      raise RuntimeError("This snippet expects a fitted Pipeline with step 'lr'.")
  Xs = clf.named_steps["pre"].transform(X_test)
  # feature_dependence="independent" is the old spelling of this option
  explainer = shap.LinearExplainer(linear, Xs, feature_perturbation="interventional")
  shap_values = explainer.shap_values(Xs)
  shap.summary_plot(shap_values, Xs)

# =========================
# MLflow tracking (optional)
# =========================
mlflow.start_run_and_log: |
  import mlflow, mlflow.sklearn
  mlflow.set_experiment("telco-churn")
  with mlflow.start_run(run_name="xgb_baseline"):
      mlflow.log_params({"n_estimators":600,"max_depth":6,"lr":0.05})
      mlflow.log_metric("roc_auc", roc_auc_score(y_test, prob))
      mlflow.sklearn.log_model(clf, "model")

# =========================
# SQL (analysis & sanity)
# =========================
sql.churn_rate_by_segment: |
  SELECT Contract,
         ROUND(100 * AVG(CASE WHEN Churn='Yes' THEN 1 ELSE 0 END), 2) AS churn_pct,
         COUNT(*) AS n
  FROM analytics.telco_customers
  GROUP BY Contract
  ORDER BY churn_pct DESC;

sql.monthly_retention: |
  -- Requires start_date and end_date fields for each subscription
  WITH months AS (
    -- One row per calendar month (its first day). Stepping by day and truncating
    -- would repeat each month once per day and inflate the counts after the CROSS JOIN.
    SELECT month
    FROM UNNEST(GENERATE_DATE_ARRAY('2017-01-01','2018-12-31', INTERVAL 1 MONTH)) AS month
  ),
  active AS (
    -- A subscription counts as active in a month when it started on or before the
    -- month start and has no end date or ends on/after the month start
    SELECT m.month,
           COUNTIF(t.start_date <= m.month AND (t.end_date IS NULL OR t.end_date >= m.month)) AS active_customers
    FROM months m CROSS JOIN analytics.telco_subscriptions t
    GROUP BY m.month
  )
  SELECT * FROM active ORDER BY month;

# =========================
# dbt scaffolds
# =========================
dbt.model_scaffold: |
  {{ config(materialized='table') }}
  WITH src AS (
    SELECT * FROM {{ ref('stg_telco_customers') }}
  )
  SELECT * FROM src;

dbt.schema_tests_min: |
  version: 2
  models:
    - name: stg_telco_customers
      columns:
        - name: customerID
          tests: [not_null, unique]
        - name: tenure
          tests:
            - dbt_expectations.expect_column_values_to_be_between:
                min_value: 0
                max_value: 84
        - name: Churn
          tests:
            - accepted_values:
                values: ['Yes','No']
      tests:
        - dbt_utils.expression_is_true:
            expression: "case when tenure = 0 then coalesce(TotalCharges,0) = 0 else true end"
        - dbt_utils.expression_is_true:
            expression: "abs(coalesce(TotalCharges,0) - coalesce(tenure,0)*coalesce(MonthlyCharges,0)) <= 10"
            where: "tenure > 0"

dbt.freshness_recency: |
  models:
    - name: stg_telco_customers
      tests:
        - dbt_utils.recency:
            field: ingested_at
            datepart: hour
            interval: 24

# =========================
# Reporting / export
# =========================
report.top_drivers_summary: |
  # After SHAP: simple text summary
  import numpy as np
  def top_features(shap_vals, k=10):
      imp = np.abs(shap_vals).mean(axis=0)
      idx = np.argsort(imp)[::-1][:k]
      return list(zip(idx, imp[idx]))
  print("Top features by mean |SHAP| (indices):")
  print(top_features(shap_values, 10))

export.predictions_csv: |
  from pathlib import Path
  # Create the output folder first; to_csv does not create missing directories
  Path("reports").mkdir(parents=True, exist_ok=True)
  out = X_test.copy()
  out["prob_churn"] = prob
  out["y_true"] = y_test.values
  out.to_csv("reports/predictions.csv", index=False)
  print("Saved reports/predictions.csv")
```

---

### Quick start flow (what to run in a fresh notebook)

```python
%load_ext snipples

# snipples nb.header.imports
# snipples ingest.csv.read_telco
# snipples qa.null_audit
# snipples clean.totalcharges_fix
# snipples clean.category_normalize
# snipples feat.boolean_target
# snipples feat.tenure_buckets
# snipples feat.split_train_test
# snipples model.columns_splitter
# snipples model.metrics_helpers
# snipples model.baseline_logreg       # => baseline
# snipples model.xgboost_cv            # => stronger model
# snipples eval.threshold_opt_pr
# snipples exp.shap_tree               # if using the XGB pipeline
# snipples export.predictions_csv
```

# Generate **workspace snippets** (VS Code/Windsurf JSON) 

Here are two drop-in artifacts for Windsurf/VS Code:

---

# 1) Workspace snippets

Save as **`.vscode/snippets.code-snippets`** in your repo.

```json
{
  "Snipples: expand line": {
    "prefix": "snipples",
    "description": "Insert a # snipples directive (works in Jupyter cells via snipples.py)",
    "body": ["# snipples ${1:namespace.snippet_name}"]
  },

  "Imports: data analyst notebook header": {
    "prefix": "py-imports-notebook",
    "scope": "python",
    "description": "Standard imports for analysis notebooks",
    "body": [
      "import os, sys, math, json, textwrap, warnings",
      "import numpy as np",
      "import pandas as pd",
      "import matplotlib.pyplot as plt",
      "import seaborn as sns",
      "from pathlib import Path",
      "warnings.filterwarnings('ignore')",
      "pd.set_option('display.max_columns', 100)",
      "plt.rcParams['figure.figsize'] = (10,5)"
    ]
  },

  "IO: read Telco CSV with dtypes": {
    "prefix": "py-read-telco",
    "scope": "python",
    "description": "Read IBM Telco Churn CSV with explicit dtypes",
    "body": [
      "dtype_map = {",
      "  'customerID':'string','gender':'string','SeniorCitizen':'Int64','Partner':'string','Dependents':'string',",
      "  'tenure':'Int64','PhoneService':'string','MultipleLines':'string','InternetService':'string','OnlineSecurity':'string',",
      "  'OnlineBackup':'string','DeviceProtection':'string','TechSupport':'string','StreamingTV':'string','StreamingMovies':'string',",
      "  'Contract':'string','PaperlessBilling':'string','PaymentMethod':'string','MonthlyCharges':'float64','TotalCharges':'string','Churn':'string'",
      "}",
      "df = pd.read_csv('${1:data/raw/Telco-Customer-Churn.csv}', dtype=dtype_map)",
      "print(df.shape)"
    ]
  },

  "Clean: TotalCharges fix (tenure==0 => 0 else tenure*MonthlyCharges)": {
    "prefix": "py-clean-totalcharges",
    "scope": "python",
    "description": "Coerce TotalCharges to numeric and impute correctly",
    "body": [
      "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].astype(str).str.strip(), errors='coerce')",
      "mask0 = df['TotalCharges'].isna() & (df['tenure'].fillna(0) == 0)",
      "df.loc[mask0, 'TotalCharges'] = 0.0",
      "mask_other = df['TotalCharges'].isna() & df['tenure'].notna() & df['MonthlyCharges'].notna()",
      "df.loc[mask_other, 'TotalCharges'] = df.loc[mask_other, 'tenure'] * df.loc[mask_other, 'MonthlyCharges']",
      "df['TotalCharges'] = df['TotalCharges'].astype('float64')"
    ]
  },

  "EDA: churn class balance plot": {
    "prefix": "py-eda-balance",
    "scope": "python",
    "description": "Bar chart of churn class balance",
    "body": [
      "ax = (df['Churn'].value_counts(normalize=True)*100).plot.bar()",
      "ax.set_title('Churn class balance (%)'); plt.show()"
    ]
  },

  "Feature: target + tenure buckets": {
    "prefix": "py-feat-target-buckets",
    "scope": "python",
    "description": "Create y and tenure_bucket features",
    "body": [
      "df['y'] = (df['Churn'].str.upper() == 'YES').astype(int)",
      "bins = [-1,0,6,12,24,48,84,999]",
      "labels = ['0','1-6','7-12','13-24','25-48','49-84','85+']",
      "df['tenure_bucket'] = pd.cut(df['tenure'], bins=bins, labels=labels)"
    ]
  },

  "Split: train/test": {
    "prefix": "py-split",
    "scope": "python",
    "description": "Train/test split with drop of ID/label columns",
    "body": [
      "from sklearn.model_selection import train_test_split",
      "target = 'y'",
      "y = df[target]",
      "X = df.drop(columns=['customerID','Churn','y'], errors='ignore')",
      "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)"
    ]
  },

  "Preprocess: ColumnTransformer (num scale + OHE cat)": {
    "prefix": "py-pre-coltx",
    "scope": "python",
    "description": "StandardScaler for numeric and OneHotEncoder for categories",
    "body": [
      "num_cols = X_train.select_dtypes(include=['number']).columns.tolist()",
      "cat_cols = X_train.select_dtypes(exclude=['number']).columns.tolist()",
      "from sklearn.preprocessing import OneHotEncoder, StandardScaler",
      "from sklearn.compose import ColumnTransformer",
      "pre = ColumnTransformer([",
      "  ('num', StandardScaler(with_mean=False), num_cols),",
      "  ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), cat_cols)",
      "])"
    ]
  },

  "Model: Logistic Regression baseline": {
    "prefix": "py-model-logreg",
    "scope": "python",
    "description": "Baseline classifier pipeline + quick metrics",
    "body": [
      "from sklearn.pipeline import Pipeline",
      "from sklearn.linear_model import LogisticRegression",
      "from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, confusion_matrix",
      "def print_metrics(y_true, prob, thr=0.5):",
      "  y_pred = (prob >= thr).astype(int)",
      "  print(f'ROC AUC: {roc_auc_score(y_true, prob):.4f}')",
      "  print(f'PR AUC : {average_precision_score(y_true, prob):.4f}')",
      "  print(f'F1     : {f1_score(y_true, y_pred):.4f}')",
      "  print(confusion_matrix(y_true, y_pred))",
      "  print(classification_report(y_true, y_pred, digits=3))",
      "clf = Pipeline([('pre', pre), ('lr', LogisticRegression(max_iter=200, class_weight='balanced'))])",
      "clf.fit(X_train, y_train)",
      "prob = clf.predict_proba(X_test)[:,1]",
      "print_metrics(y_test, prob)"
    ]
  },

  "Model: XGBoost pipeline (hist)": {
    "prefix": "py-model-xgb",
    "scope": "python",
    "description": "XGBoost classifier with ColumnTransformer preprocessing",
    "body": [
      "import xgboost as xgb",
      "from sklearn.pipeline import Pipeline",
      "from sklearn.metrics import roc_auc_score",
      "clf = Pipeline([",
      "  ('pre', pre),",
      "  ('xgb', xgb.XGBClassifier(n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, eval_metric='logloss', tree_method='hist', n_jobs=-1))",
      "])",
      "clf.fit(X_train, y_train)",
      "prob = clf.predict_proba(X_test)[:,1]",
      "print_metrics(y_test, prob)"
    ]
  },

  "Explain: SHAP (tree models)": {
    "prefix": "py-shap-tree",
    "scope": "python",
    "description": "SHAP summary for tree-based model within a Pipeline",
    "body": [
      "import shap",
      "shap.initjs()",
      "booster = clf.named_steps.get('xgb')",
      "Xs = clf.named_steps['pre'].transform(X_test)",
      "explainer = shap.TreeExplainer(booster)",
      "shap_values = explainer.shap_values(Xs)",
      "shap.summary_plot(shap_values, Xs)"
    ]
  },

  "Quality: Pandera mini schema": {
    "prefix": "py-pandera-telco",
    "scope": "python",
    "description": "Quick Pandera schema to validate key columns",
    "body": [
      "import pandera as pa",
      "from pandera import Column, Check",
      "TelcoSchema = pa.DataFrameSchema({",
      "  'customerID': Column(str, nullable=False),",
      "  'tenure': Column(int, Check.in_range(0,84), nullable=False),",
      "  'MonthlyCharges': Column(float, Check.in_range(0,200), nullable=False),",
      "  'TotalCharges': Column(float, nullable=False),",
      "  'Churn': Column(str, Check.isin(['Yes','No']), nullable=False)",
      "})",
      "TelcoSchema.validate(df, lazy=True)"
    ]
  },

  "dbt: model scaffold": {
    "prefix": "dbt-model",
    "scope": "sql",
    "description": "dbt table model scaffold",
    "body": [
      "{{ config(materialized='table') }}",
      "WITH src AS (",
      "  SELECT * FROM {{ ref('${1:stg_telco_customers}') }}",
      ")",
      "SELECT * FROM src;"
    ]
  },

  "SQL: churn by segment": {
    "prefix": "sql-churn-seg",
    "scope": "sql",
    "description": "Segment churn rates by Contract",
    "body": [
      "SELECT Contract,",
      "       ROUND(100 * AVG(CASE WHEN Churn='Yes' THEN 1 ELSE 0 END), 2) AS churn_pct,",
      "       COUNT(*) AS n",
      "FROM ${1:analytics.telco_customers}",
      "GROUP BY Contract",
      "ORDER BY churn_pct DESC;"
    ]
  }
}
```

---
># Tiny **Makefile** to lint/test/run your dbt & notebooks?

# 2) Makefile

Save as **`Makefile`** at the repo root. It’s opinionated but tidy for AE/ML + dbt.

```make
# -------- Config --------
# All `?=` values can be overridden from the command line or the environment.
PY          ?= python3
VENV        ?= .venv
ACTIVATE    = . $(VENV)/bin/activate
REQS        ?= requirements.txt

DBT_TARGET  ?= dev
# Path to the dbt project (the directory that has dbt_project.yml).
# Kept on its own line: make keeps the spaces before an inline `#` comment as part of the value.
DBT_PROJ    ?= .
RAW_CSV     ?= data/raw/Telco-Customer-Churn.csv

# -------- Helpers --------
.PHONY: help
help:
	@echo "Common targets:"
	@echo "  make venv           # create venv and install requirements"
	@echo "  make install        # install/upgrade packages in venv"
	@echo "  make lint           # ruff lint + format check"
	@echo "  make fmt            # ruff format"
	@echo "  make test           # run pytest"
	@echo "  make nb-run         # run notebooks with papermill (paramizable)"
	@echo "  make dbt-deps/run/test/freshness  # dbt workflow"
	@echo "  make qa-quick       # quick CSV sanity checks (row count, nulls, dup IDs)"
	@echo "  make clean          # remove build artifacts"

# -------- Environment --------
$(VENV):
	$(PY) -m venv $(VENV)
	$(ACTIVATE) && python -m pip install --upgrade pip

.PHONY: venv
venv: $(VENV) install

.PHONY: install
install:
	@if [ -f "$(REQS)" ]; then \
		$(ACTIVATE) && pip install -r $(REQS); \
	else \
		echo "No requirements.txt found; installing a sane default set..."; \
		$(ACTIVATE) && pip install pandas numpy scikit-learn imbalanced-learn xgboost catboost shap mlflow matplotlib seaborn pandera great-expectations jupyter papermill ruff pytest dbt-bigquery pandas-gbq google-cloud-bigquery pyarrow; \
	fi

# -------- Lint & Test --------
# Lint and verify formatting without rewriting files (matches the `help` text).
.PHONY: lint
lint:
	$(ACTIVATE) && ruff check . && ruff format --check .

.PHONY: fmt
fmt:
	$(ACTIVATE) && ruff format .

.PHONY: test
test:
	$(ACTIVATE) && pytest -q

# -------- Notebooks (papermill) --------
# Usage: make nb-run NB=notebooks/01_eda.ipynb OUT=reports/01_eda.out.ipynb
NB ?= notebooks/01_eda.ipynb
OUT ?= reports/01_eda.out.ipynb
.PHONY: nb-run
nb-run:
	$(ACTIVATE) && papermill $(NB) $(OUT)

# -------- Quick CSV QA (no dbt) --------
# The script lives in a make variable because a shell heredoc cannot span recipe
# lines: each recipe line runs in its own shell, and un-tabbed lines are parsed
# as makefile syntax ("missing separator"). The CSV path is passed as argv[1].
define QA_QUICK_PY
import sys
import pandas as pd
path = sys.argv[1]
df = pd.read_csv(path)
print("Shape:", df.shape)
print("Nulls (top 10):")
print(df.isna().sum().sort_values(ascending=False).head(10))
if "customerID" in df.columns:
    dup = (df["customerID"].value_counts() > 1).sum()
    print("Duplicate customerID:", dup)
endef
export QA_QUICK_PY

.PHONY: qa-quick
qa-quick:
	@echo "QA: $(RAW_CSV)"
	$(ACTIVATE) && python -c "$$QA_QUICK_PY" "$(RAW_CSV)"

# -------- dbt workflow --------
.PHONY: dbt-deps
dbt-deps:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt deps

.PHONY: dbt-run
dbt-run:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt run --target $(DBT_TARGET)

.PHONY: dbt-test
dbt-test:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt test --target $(DBT_TARGET)

.PHONY: dbt-freshness
dbt-freshness:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt source freshness --target $(DBT_TARGET)

# -------- Clean --------
# Remove the virtualenv, tool caches, bytecode caches and notebook checkpoints.
.PHONY: clean
clean:
	rm -rf $(VENV) .pytest_cache .ruff_cache
	# `**` does not recurse under /bin/sh (make's default shell), so use find instead
	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
	find . -name "*.ipynb_checkpoints" -type d -prune -exec rm -rf {} +
```

---

## How to wire this up fast

1. **Put your extension + snippets in place**

```bash
mkdir -p ~/.snipples
# (Paste your large ~/.snipples/snippets.yaml we created earlier)
# Put snipples.py somewhere on PYTHONPATH, or `pip install -e .` if you package it.
```

In notebooks:

```python
%load_ext snipples
# then type:  # snipples model.xgboost_cv   (Tab to browse)
```

2. **Add the workspace snippets**

* Create `.vscode/snippets.code-snippets` and paste the JSON above.

3. **Makefile**

```bash
make venv
make qa-quick
make dbt-deps dbt-run dbt-test
```

---

># **generate a tiny `requirements.txt` and a `tasks.json` (Windsurf/VS Code tasks that wrap `make` targets so you can ⌘⇧B to run “DBT: test”)**



># CLAUDE SUGGESTION: CODE LIBRARY

Great question! Code organization and storage strategy is crucial for professional development and portfolio presentation. Here are the most appropriate approaches for different aspects of your projects:

## 🗂️ **Repository Structure & Organization**

### **Primary Code Storage: GitHub**
```
your-username/
├── telco-churn-analysis-level-0/
│   ├── README.md
│   ├── requirements.txt
│   ├── src/
│   │   ├── __init__.py
│   │   ├── data_loader.py
│   │   ├── analysis.py
│   │   └── utils.py
│   ├── notebooks/
│   │   └── exploratory_analysis.ipynb
│   ├── data/
│   │   └── telco.csv
│   ├── docs/
│   │   ├── methodology.md
│   │   └── findings.md
│   ├── tests/
│   │   └── test_analysis.py
│   └── results/
│       ├── figures/
│       └── reports/
```

### **Multi-Level Portfolio Structure**
```
data-analytics-journey/
├── README.md (Portfolio overview)
├── level-00-bare-bones/
├── level-01-structured-eda/
├── level-02-data-cleaning/
├── ...
├── level-10-enterprise/
├── docs/
│   ├── learning-progression.md
│   ├── technologies-mastered.md
│   └── business-impact.md
└── portfolio-assets/
    ├── presentations/
    ├── publications/
    └── certifications/
```

## 💻 **Code Workflow Storage Options**

### **1. GitHub (Primary - Essential)**
**Best for:**
- Version control and collaboration
- Portfolio presentation to employers
- Open source contributions
- CI/CD integration

**Structure:**
```
├── .github/
│   ├── workflows/
│   │   └── ci.yml
│   └── ISSUE_TEMPLATE/
├── src/
├── tests/
├── docs/
└── examples/
```

### **2. Jupyter Notebooks (Development)**
**Best for:**
- Exploratory analysis documentation
- Step-by-step methodology explanation
- Interactive presentations
- Educational content

**Organization:**
```
notebooks/
├── 01-data-exploration.ipynb
├── 02-data-cleaning.ipynb
├── 03-feature-engineering.ipynb
├── 04-modeling.ipynb
└── 05-results-analysis.ipynb
```

### **3. Professional Platforms**

#### **Kaggle Notebooks**
```python
# Kaggle-optimized structure
"""
TELCO CUSTOMER CHURN - LEVEL 0 ANALYSIS
=======================================
Author: Your Name
Created: Date
Updated: Date

Table of Contents:
1. Introduction & Business Problem
2. Data Loading & Initial Exploration  
3. Data Quality Assessment
4. Key Insights Discovery
5. Business Recommendations
6. Technical Implementation
7. Next Steps & Learning Journey
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')

# Load data
df = pd.read_csv('../input/telco-customer-churn/telco.csv')
```

#### **Google Colab (Shareable)**
```python
# Mount Google Drive for data persistence
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
!pip install -q pandas numpy matplotlib seaborn scikit-learn

# Professional notebook structure with clear sections
```

## 🏗️ **Advanced Code Organization (Level 5+)**

### **Modular Python Package Structure**
```
telco_churn_analysis/
├── setup.py
├── pyproject.toml
├── telco_churn/
│   ├── __init__.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── loader.py
│   │   ├── cleaner.py
│   │   └── validator.py
│   ├── features/
│   │   ├── __init__.py
│   │   ├── engineering.py
│   │   └── selection.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── training.py
│   │   └── evaluation.py
│   ├── visualization/
│   │   ├── __init__.py
│   │   └── plots.py
│   └── utils/
│       ├── __init__.py
│       └── helpers.py
├── tests/
├── docs/
├── examples/
└── data/
```

### **Configuration Management**
```python
# config.py
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Config:
    """Settings container for the churn analysis: data file names, split parameters and output folders.

    All fields have defaults, so ``Config()`` yields a ready-to-use instance.
    """
    # Data paths
    # NOTE: the directory and the file names are stored separately; nothing in
    # this class joins them, so callers combine DATA_DIR with a file name.
    DATA_DIR: Path = Path("data")
    RAW_DATA_FILE: str = "telco.csv"
    PROCESSED_DATA_FILE: str = "telco_processed.csv"
    
    # Model parameters
    # presumably the held-out fraction and RNG seed for a train/test split —
    # confirm against the code that consumes them
    TEST_SIZE: float = 0.2
    RANDOM_STATE: int = 42
    
    # Output paths
    # The two sub-folders repeat the "results" prefix literally rather than
    # deriving from RESULTS_DIR; changing RESULTS_DIR does not move them.
    RESULTS_DIR: Path = Path("results")
    FIGURES_DIR: Path = Path("results/figures")
    REPORTS_DIR: Path = Path("results/reports")
```

## 📊 **Documentation & Workflow Storage**

### **README.md Template**
````markdown
# Level X: Telco Customer Churn Analysis

## Quick Start
```bash
pip install -r requirements.txt
python src/analysis.py
```

## Project Structure
- `src/`: Core analysis code
- `notebooks/`: Jupyter exploration
- `data/`: Dataset files
- `results/`: Output artifacts
- `tests/`: Unit tests
- `docs/`: Documentation

## Key Findings
- Finding 1: Contract type drives churn
- Finding 2: Payment method indicates engagement
- Finding 3: Premium services show paradox

## Business Impact
- $2.4M annual revenue at risk identified
- 3 strategic initiatives recommended
- Clear ROI projections provided

## Technologies Used
- Python, Pandas, NumPy
- Matplotlib, Seaborn
- Scikit-learn (for advanced levels)

## Next Steps
See Level X+1 for progression...
````

### **Code Documentation Standards**
```python
def analyze_churn_by_contract(df: pd.DataFrame) -> pd.Series:
    """
    Compute the share of churned customers for every contract type.

    A customer counts as churned when the 'Churn' value equals the string
    'Yes'; the share is the mean of that flag within each 'Contract' group.

    Args:
        df (pd.DataFrame): Customer dataset with 'Contract' and 'Churn' columns

    Returns:
        pd.Series: Churn rates indexed by contract type, highest rate first

    Example:
        >>> churn_rates = analyze_churn_by_contract(df)
        >>> print(churn_rates)
        Month-to-month    0.427
        One year          0.113
        Two year          0.028

    Business Impact:
        With the rates shown above, month-to-month customers churn at roughly
        15x the rate of two-year contract customers, which makes contract type
        the most actionable lever for retention strategy.
    """
    def _churned_share(flags):
        # Fraction of entries in this group that are exactly 'Yes'.
        return (flags == 'Yes').mean()

    churn_per_contract = df.groupby('Contract')['Churn']
    rates = churn_per_contract.apply(_churned_share)
    return rates.sort_values(ascending=False)
```

## 🔧 **Workflow Management Tools**

### **For Individual Projects (Levels 0-3)**
- **Primary**: GitHub repository per level
- **Development**: Jupyter notebooks + VS Code
- **Documentation**: Markdown files + README
- **Sharing**: Kaggle notebooks for community

### **For Advanced Projects (Levels 4-7)**
- **Code**: Modular Python packages
- **Environment**: Poetry/Conda for dependency management
- **Testing**: pytest framework
- **Documentation**: Sphinx for API docs
- **Workflow**: GitHub Actions for CI/CD

### **For Enterprise Projects (Levels 8-10)**
- **Infrastructure**: Infrastructure as Code (Terraform)
- **Containerization**: Docker + Kubernetes
- **Orchestration**: Apache Airflow
- **Monitoring**: Logging and metrics collection
- **Deployment**: Blue-green deployment strategies

## 💡 **Best Practices by Level**

### **Level 0-2: Foundation**
```python
# Simple, clear, well-commented code
import pandas as pd

# Load data with error handling
try:
    df = pd.read_csv('data/telco.csv')
except FileNotFoundError:
    print("‚ùå Data file not found")
    # SystemExit behaves the same in scripts and notebooks; the bare exit()
    # helper is only injected by the `site` module and may be missing.
    raise SystemExit(1)
else:
    print(f"‚úÖ Data loaded: {df.shape}")

# Clear business-focused analysis
# 'Churn' holds the strings 'Yes'/'No', so it cannot be averaged directly:
# turn it into a boolean flag first, then the group mean is the churn rate.
churned = df['Churn'].eq('Yes')
contract_churn = churned.groupby(df['Contract']).mean()
print("üìä Churn by Contract Type:")
print(contract_churn.sort_values(ascending=False))
```

### **Level 5-7: Professional**
```python
# Type hints, docstrings, error handling
from typing import Tuple, Dict, Any
import logging

logger = logging.getLogger(__name__)

class ChurnAnalyzer:
    """Professional customer churn analysis class.

    Holds a configuration object and, once ``load_data()`` has run, the
    customer dataset in ``self.df``.
    """

    def __init__(self, config: Config):
        self.config = config
        self.df = None  # set by load_data()

    def load_data(self) -> None:
        """Load the raw customer data into ``self.df``.

        The file location is ``config.data_path`` when the config provides
        one; otherwise it is built from ``config.DATA_DIR`` and
        ``config.RAW_DATA_FILE``.

        Raises:
            Exception: whatever ``pd.read_csv`` (or the path lookup) raised,
                re-raised unchanged after being logged.
        """
        try:
            # Config objects that only define DATA_DIR / RAW_DATA_FILE have no
            # `data_path` attribute; fall back to joining the two parts.
            data_path = getattr(self.config, "data_path", None)
            if data_path is None:
                data_path = self.config.DATA_DIR / self.config.RAW_DATA_FILE
            self.df = pd.read_csv(data_path)
            logger.info(f"Data loaded successfully: {self.df.shape}")
        except Exception as e:
            logger.error(f"Failed to load data: {e}")
            raise
```

### **Level 8-10: Enterprise**
```python
# Production-ready with monitoring
from dataclasses import dataclass
from abc import ABC, abstractmethod
import structlog

logger = structlog.get_logger()

@dataclass
class PredictionMetrics:
    """Figures returned together with a churn prediction."""
    accuracy: float
    precision: float
    recall: float
    latency_ms: float

class ChurnPredictor(ABC):
    """Abstract base class for churn prediction models.

    Subclasses must implement the coroutine ``predict``; the base class
    cannot be instantiated.
    """

    # Built-in generics (`dict`, `tuple[...]`) are used because this snippet
    # never imports `Dict` / `Tuple` from `typing`.
    @abstractmethod
    async def predict(self, customer_data: dict) -> tuple[float, PredictionMetrics]:
        """Predict churn probability with performance metrics.

        Args:
            customer_data: Mapping describing one customer.

        Returns:
            A ``(churn_probability, metrics)`` pair.
        """
```

## 📈 **Storage Strategy Recommendations**

### **For Learning Journey Documentation**
1. **GitHub**: Primary portfolio repository
2. **Kaggle**: Public notebooks for community engagement  
3. **Personal Blog**: Medium/LinkedIn articles
4. **Cloud Storage**: AWS S3/Google Cloud for large datasets
5. **Local Development**: VS Code + Jupyter for daily work

### **For Professional Presentation**
1. **Clean GitHub repos** with professional README files
2. **Comprehensive documentation** explaining methodology
3. **Business-focused presentations** with clear ROI
4. **Code quality standards** with proper testing
5. **Progression documentation** showing skill development

The key is to maintain **consistency across platforms** while optimizing each for its specific audience and purpose. Your Level 0 should be simple and accessible, while your Level 10 should demonstrate enterprise-scale thinking and implementation.

---

You're building a **personal code library** - this is exactly what professional data scientists do. Here are several effective approaches:

## 🎯 **My Recommendation: Start Simple with Markdown Files**

Create a folder structure like this:
```
my_code_library/
├── missing_values.md
├── data_types.md
├── anomaly_detection.md
├── data_cleaning.md
└── quick_exploration.md
```

## 📝 **Why Markdown Files Work Best:**

1. **Easy to search** - Ctrl+F to find what you need
2. **Copy-paste ready** - Code blocks ready to use  
3. **Context included** - Notes on when/why to use each method
4. **GitHub friendly** - Looks professional in your portfolio

## 💡 **Example: `missing_values.md`**

````markdown
# Missing Values Toolkit

## Quick Count (Most Used)
```python
df.isna().sum().sum()  # 21 characters, fastest
```

## Show Details When Needed
```python
df.isnull().sum()[df.isnull().sum() > 0]  # Only non-zero columns
```

## When to Use Each:
- **Quick count**: Initial assessment, just need total
- **Detailed view**: When you find missing values and need to know where
````

## 🚀 **Start Your Library Today:**

1. **Create the folder structure**
2. **Start with one file** - `missing_values.md`
3. **Add your variations** from today's session
4. **Include context** - when to use each method
5. **Build as you learn** - add new categories in future levels

## 📈 **Benefits for Your Goals:**

- **Goal #2**: Documents your learning progression
- **Goal #4**: Creates your personal reference library
- **Portfolio value**: Shows systematic thinking to hiring managers

This approach scales perfectly - start simple, add complexity as you grow!

Here are several ways to check for empty values across the entire dataset:

## **Quick One-Liners for Different Types of Empty:**

### **Check NaN/Null values:**
```python
df.isnull().sum().sum()  # Total count
df.isnull().sum()        # By column
```

### **Check empty strings (''):**
```python
(df == '').sum().sum()   # Total count
(df == '').sum()         # By column
```

### **Check empty spaces (' '):**
```python
(df == ' ').sum().sum()  # Total count
(df == ' ').sum()        # By column
```

### **Ultimate one-liner for all types:**
```python
print(f"NaN: {df.isnull().sum().sum()}, Empty: {(df == '').sum().sum()}, Spaces: {(df == ' ').sum().sum()}")
```

## **Most Comprehensive Check:**

```python
# Per column: count NaN values plus, for object-dtype columns only,
# values equal to '' and values equal to a single space ' '.
for col in df.columns:
    nan_count = df[col].isnull().sum()
    if df[col].dtype == 'object':
        empty_count = (df[col] == '').sum()
        space_count = (df[col] == ' ').sum()
    else:
        # Non-text columns cannot hold '' or ' '.
        empty_count = 0
        space_count = 0

    total = nan_count + empty_count + space_count
    if total > 0:
        print(f"{col}: {total} empty values")
```

## **Show Actual Problem Rows:**

```python
# Select every row that has a NaN, an empty string or a single-space value
# in any column, then report how many there are.
row_has_nan = df.isnull().any(axis=1)
row_has_empty = (df == '').any(axis=1)
row_has_space = (df == ' ').any(axis=1)
problem_rows = df[row_has_nan | row_has_empty | row_has_space]
print(f"Rows with empty values: {len(problem_rows)}")
if len(problem_rows) > 0:
    # Show first few problem customer IDs
    print(problem_rows[['customerID']].head())
```

**After your TotalCharges fix, you should see:**
- **NaN: 0**
- **Empty strings: 0** 
- **Spaces: 0**

If you see any non-zero counts, that tells you where you still have empty value issues!

---

># **#2: Keep a snippet library handy across projects/workspaces**

Here are three “professional and portable” ways to keep a snippet library handy across projects/workspaces on your Mac—ordered from most editor-native to most universal. Pick one or combine B+C for a killer setup.

# A) VS Code user snippets + built-in Sync (simple, editor-native)

**Best if you primarily use VS Code.**

1. Open Command Palette → “Preferences: Configure User Snippets”.
2. Create language or global snippets. Example (`python.json`):

```json
{
  "Py docstring": {
    "prefix": "pydoc",
    "body": [
      "\"\"\"${1:Summary}",
      "",
      "Args:",
      "    ${2:param}: ${3:desc}",
      "",
      "Returns:",
      "    ${4:type}: ${5:desc}",
      "\"\"\""
    ],
    "description": "Docstring scaffold"
  }
}
```

3. Turn on **Settings Sync** (Account icon → Turn On Sync). Your snippets follow you on any machine you sign into VS Code with.

**Pro tip (shared across repos):** keep your snippet JSON files in a Git repo and **symlink** them into VS Code’s snippets folder so you can version-control them:

```bash
# Paths (macOS)
VS_SNIPS="$HOME/Library/Application Support/Code/User/snippets"
mkdir -p ~/snippets/vscode
ln -s ~/snippets/vscode/python.json "$VS_SNIPS/python.json"
```

# B) Git + Markdown snippets + fzf CLI (portable, editor-agnostic)

**Best if you hop between tools (VS Code, Windsurf, terminals, notebooks).**

1. Make a repo:

```
~/snippets/
  python/
  sql/
  dbt/
  shell/
  README.md
```

Each snippet is a small `.md` with a clear title and a fenced code block.

2. Install tools (free):

```bash
brew install fzf ripgrep
```

3. Add a tiny **`snip`** helper to your `~/.zshrc`:

````zsh
# snip [query...] — find a snippet file under ~/snippets whose contents match
# the query, pick one interactively, and copy its fenced code to the clipboard.
snip() {
  local file
  # rg -l lists the files that match; with no arguments the pattern falls back
  # to "." so every non-empty file is offered. If the pipeline exits non-zero
  # (e.g. the fzf picker is cancelled) the function returns without copying.
  file=$(rg -l --hidden --glob "!*.git/*" "${*:-.}" "$HOME/snippets" | fzf --prompt="Snip> ") || return
  # copy the code block contents to clipboard
  # awk flips `f` on every line that starts with ``` and prints only the lines
  # seen while `f` is set, so the text of every fenced block is copied and the
  # fence lines themselves are dropped.
  awk '/^```/{f=!f; next} f' "$file" | pbcopy
  echo "üìã Copied from: $file"
}
````

Usage:

````bash
snip pandas groupby
# copies the code inside every fenced ``` block of the chosen file (fence lines excluded)
# ⌘+V (paste) in any editor/terminal
````

**Why this rocks:** works everywhere (terminal, notebooks, different editors), versioned with Git, searchable via fuzzy find. You can keep private snippets in a private repo.

# C) Text expander (Espanso) for instant boilerplate (cross-app)

**Great for short, frequently typed patterns.** Free & open-source.

```bash
brew install espanso
espanso start
espanso edit
```

Add triggers in `default.yml`:

```yaml
matches:
  - trigger: ":sqlsel"
    replace: |
      SELECT ${1:*}
      FROM ${2:table}
      WHERE ${3:cond};
    vars:
      - name: cursor
        type: cursor
  - trigger: ":dbtmodel"
    replace: |
      {{ config(materialized='table') }}
      WITH src AS (
          SELECT * FROM {{ ref('${1:stg_table}') }}
      )
      SELECT * FROM src;
    propagate_case: true
filters:
  apps:
    only: ["com.microsoft.VSCode"]  # limit expansions to VS Code
```

Now typing `:sqlsel` in VS Code expands into a SQL skeleton with tabstops—without touching your editor’s snippet system.

---

## What I recommend for you

Given you’re on macOS + zsh and bounce across projects:

* **Primary:** VS Code user snippets + Sync (A) for language-aware tab-trigger snippets.
* **Universal:** Git + `snip` (B) so you can search/paste any snippet in *any* workspace, terminal, or notebook.
* **Boilerplate:** Espanso (C) for lightning-fast expansions you want everywhere.

---

## Starter pack you can paste in today

**Repo skeleton**

```bash
mkdir -p ~/snippets/{python,sql,dbt,shell}
git init ~/snippets
```

**Example files**
`~/snippets/sql/select-where.md`

````md
# Select with WHERE & limit
```sql
SELECT ${1:cols}
FROM ${2:schema.table}
WHERE ${3:cond}
LIMIT ${4:100};
```
````

`~/snippets/python/pandas-null-check.md`

````md
# Null audit (top offenders)
```python
na = df.isna().sum().sort_values(ascending=False)
print(na[na>0].head(10))
```
````

**VS Code snippet (dbt model)**
`~/snippets/vscode/sql.json`
```json
{
  "dbt model": {
    "prefix": "dbtmodel",
    "body": [
      "{{ config(materialized='table') }}",
      "WITH src AS (",
      "  SELECT * FROM {{ ref('${1:stg_table}') }}",
      ")",
      "SELECT * FROM src;"
    ],
    "description": "dbt model scaffold"
  }
}
````

Then symlink it as shown above.

---

### Guardrails

* **Secrets:** never store tokens/keys in snippets; use placeholders like `${SECRET_NAME}`.
* **Licensing:** if you paste from blogs/StackOverflow, keep a note/link at the top of the snippet file.
* **Naming:** use a consistent prefix (e.g., `:sql*`, `:py*`) for Espanso triggers to avoid accidental expansion.

>If you want, tell me your top 10 snippet needs (Python/pandas, SQL/dbt, shell, GCP/AWS), and I‚Äôll generate the files to drop into `~/snippets/` plus the matching VS Code/Espanso entries.

---

># **#3: Windsurf with VS Code‚Äìstyle snippets**

Yep‚ÄîWindsurf plays nicely with **VS Code‚Äìstyle snippets**, plus you can layer on a git repo and a text expander for portability. Here‚Äôs a clean setup that works across projects/workspaces.

# 1) Built-in (VS Code‚Äìstyle) user & workspace snippets

* Open the **Command Palette ‚Üí ‚ÄúPreferences: Configure User Snippets.‚Äù** Create a language file (e.g., `python.json`) or a **global** snippets file. Snippet format is the same as VS Code‚Äôs JSON (prefix/body/description). ([Arsturn][1], [Visual Studio Code][2])
* Bringing snippets from VS Code? On first run Windsurf lets you **import settings from VS Code/Cursor**, which includes snippets. ([DEV Community][3])
* For repo-scoped snippets, add a workspace file (e.g., `.vscode/snippets.code-snippets`) so they travel with the project in Git. (Same mechanism as VS Code.) ([Stack Overflow][4])

Example snippet body (works in Windsurf since it‚Äôs VS Code syntax):

```json
{
  "Docstring scaffold": {
    "prefix": "pydoc",
    "body": [
      "\"\"\"${1:Summary}",
      "",
      "Args:",
      "    ${2:param}: ${3:desc}",
      "",
      "Returns:",
      "    ${4:type}: ${5:desc}",
      "\"\"\""
    ],
    "description": "Python docstring"
  }
}
```

# 2) Manage snippets with extensions (Open VSX)

Windsurf uses the **Open VSX** marketplace, so install snippet helpers from there in the Extensions view:

* **Snippets Manager** ‚Äì simple UI to create/edit snippets. ([open-vsx.org][5])
* **Snippets Viewer** ‚Äì browse built-in, extension, and user snippets in a tree view. ([open-vsx.org][6])
* **Snippets Ranger** ‚Äì create/select snippet files quickly. ([open-vsx.org][7])
  (If you‚Äôre used to VS Code‚Äôs marketplace, note the store difference‚ÄîWindsurf surfaces Open VSX by default.) ([Reddit][8])

# 3) Keep them portable with Git (+ optional symlink)

* Put all snippet JSON (and markdown examples) in `~/snippets/` under Git.
* In Windsurf, open one of your snippet files and copy its on-disk path; then **symlink** your repo files to that location so edits are versioned automatically (works on Mac/Linux).
  This avoids guessing paths and keeps snippets consistent across workspaces.

# 4) Universal quick-paste option (works in any app)

If you want snippets outside the editor too:

* **Espanso** (free text expander): type `:sqlsel` ‚Üí expands to a SQL skeleton anywhere.
* Or a tiny **fzf** picker in your terminal that copies code blocks from `~/snippets/*.md` to clipboard (great inside Windsurf‚Äôs integrated terminal).

# 5) Bonus: make snippets ‚Äúsearchable‚Äù by the AI

If you keep a **Google Doc** of your favorite patterns, Windsurf‚Äôs **Knowledge Base / docs context** can ingest docs for retrieval in chat. It‚Äôs not an inserter like snippets, but handy for ‚Äúremind me of the dbt model scaffold‚Äù prompts. ([Windsurf Docs][9])

---

## A setup I‚Äôd recommend for you

1. **User & workspace snippets** for tab-trigger speed. ([Arsturn][1], [Visual Studio Code][2])
2. **Git repo** at `~/snippets/` + **symlink** to Windsurf‚Äôs snippet files so changes sync across workspaces.
3. Install **Snippets Manager** (UI), plus **Snippets Viewer** (browse). ([open-vsx.org][5])
4. Add **Espanso** for cross-app boilerplate (optional).
5. (Nice-to-have) A **Google Doc** of patterns connected to Windsurf‚Äôs knowledge base for quick AI recall. ([Windsurf Docs][9])

If you want, tell me your top 10 snippets (Python/pandas, SQL/dbt, shell/GCP) and I‚Äôll generate the JSON files + a starter `~/snippets/` repo structure to drop in.

[1]: https://www.arsturn.com/blog/creating-custom-code-snippets-in-windsurf-for-reusability "Mastering Code Snippets in Windsurf: Reusability Techniques"
[2]: https://code.visualstudio.com/docs/editing/userdefinedsnippets?utm_source=chatgpt.com "Snippets in Visual Studio Code"
[3]: https://dev.to/proflead/this-ai-ide-can-code-for-you-windsurf-ai-full-tutorial-4p94 "This AI IDE Can Code For You ‚Äì Windsurf AI Full Tutorial - DEV Community"
[4]: https://stackoverflow.com/questions/29995863/how-to-add-custom-code-snippets-in-vscode?utm_source=chatgpt.com "How to add custom code snippets in VSCode?"
[5]: https://open-vsx.org/extension/zjffun/snippetsmanager?utm_source=chatgpt.com "Snippets Manager"
[6]: https://open-vsx.org/extension/RandomFractalsInc/snippets-viewer?utm_source=chatgpt.com "Snippets Viewer"
[7]: https://open-vsx.org/extension/robole/snippets-ranger?utm_source=chatgpt.com "Snippets Ranger"
[8]: https://www.reddit.com/r/RooCode/comments/1i6ns94/new_users_discovery_of_roocode_on_vscode/?utm_source=chatgpt.com "New users' discovery of RooCode on vscode marketplace ..."
[9]: https://docs.windsurf.com/context-awareness/overview?utm_source=chatgpt.com "Overview - Windsurf Docs"

---

># **#4: Create a custom library that gives me suggestions in a Jupyter cell or Python file when I type `#` to call the library, then call a specific snippet by name**

Yes—you can make a tiny IPython/Jupyter extension that:

* lets you write a **comment directive** like `# snip <name>` which gets **expanded into code** before execution, and
* gives you **Tab-completion suggestions** for snippet names after you type `# snip ` in a Jupyter cell (works in JupyterLab/Notebook and VS Code’s Jupyter).

Below is a minimal, drop-in ‚Äúsnippet library‚Äù you can keep in `~/.snippets/`. It uses a YAML file for your snippets, a transformer to expand `# snip ...` lines, and a custom completer to suggest names.

---

### 1) Snippet storage (YAML)

`~/.snippets/snippets.yaml`

```yaml
pandas.null_audit: |
  na = df.isna().sum().sort_values(ascending=False)
  print(na[na>0].head(10))
sql.select_where: |
  SELECT ${1:cols}
  FROM ${2:schema.table}
  WHERE ${3:cond}
  LIMIT ${4:100};
dbt.model_scaffold: |
  {{ config(materialized='table') }}
  WITH src AS (
    SELECT * FROM {{ ref('${1:stg_table}') }}
  )
  SELECT * FROM src;
```

> You can keep multiple YAML files under `~/.snippets/`—the loader will merge them.

---

### 2) The extension (`snipline.py`)

Put this file anywhere on your PYTHONPATH (e.g., `~/snipline/snipline.py`) and `pip install pyyaml` if you don’t have it.

```python
# snipline.py
from __future__ import annotations
import os, re, glob, time, yaml, textwrap
from pathlib import Path
from typing import Dict, Optional
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import Javascript, display

SNIP_DIR = Path(os.environ.get("SNIP_DIR", "~/.snippets")).expanduser()

class SnipStore:
    """In-memory index of the snippets stored as YAML files in one directory.

    Every file matching ``*.y*ml`` is expected to contain a top-level mapping
    of snippet name to snippet body. All files are merged into ``cache``;
    files are read in sorted path order, so when two files define the same
    name the later path wins.
    """

    def __init__(self, directory: Path):
        self.dir = Path(directory)
        self.cache: Dict[str, str] = {}      # snippet name -> snippet body
        self._mtimes: Dict[str, float] = {}  # YAML path -> mtime seen at last reload
        self.reload()

    def _scan(self) -> Dict[str, float]:
        """Return ``{path: mtime}`` for the YAML files currently on disk."""
        found: Dict[str, float] = {}
        for yml in sorted(glob.glob(str(self.dir / "*.y*ml"))):
            try:
                found[yml] = os.path.getmtime(yml)
            except OSError:
                # The file vanished between glob() and stat(); treat as absent.
                continue
        return found

    def reload(self):
        """Discard the cache and re-read every YAML file in the directory."""
        self.cache.clear()
        self._mtimes.clear()
        for yml, mtime in self._scan().items():
            with open(yml, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            # A file whose top level is not a mapping defines no snippets;
            # it is still tracked so it does not trigger endless reloads.
            if isinstance(data, dict):
                for k, v in data.items():
                    self.cache[str(k)] = str(v)
            self._mtimes[yml] = mtime

    def maybe_reload(self):
        """Reload if any YAML file was added, removed or modified since the last load."""
        # Comparing full snapshots (not just the known files) is what lets
        # newly created and deleted files be noticed.
        if self._scan() != self._mtimes:
            self.reload()

    def names(self, prefix: str = ""):
        """Return the sorted snippet names that start with ``prefix``."""
        self.maybe_reload()
        return sorted([k for k in self.cache if k.startswith(prefix)])

    def get(self, name: str) -> Optional[str]:
        """Return the body of snippet ``name``, or ``None`` if it is unknown."""
        self.maybe_reload()
        return self.cache.get(name)

STORE = SnipStore(SNIP_DIR)

# --- Input transformer: replace lines like "# snip <name>" with the snippet text
_SNIP_LINE = re.compile(r"^[ \t]*#\s*snip\s+([A-Za-z0-9_.\-/]+)[ \t]*$", re.MULTILINE)

def _expand_snips(cell):
    """Replace every ``# snip <name>`` directive with the named snippet's body.

    Args:
        cell: The cell source, either as one string or as a list of lines
            (each with its line ending) — the form IPython 7+ passes to
            functions registered in ``input_transformers_post``.

    Returns:
        The expanded source in the same form as the input (string in, string
        out; list of lines in, list of lines out). Unknown snippet names are
        replaced with a ``# [snip: '<name>' not found]`` comment.
    """
    def repl(m):
        name = m.group(1)
        body = STORE.get(name)
        if body is None:
            # Leave a visible marker if missing
            return f"# [snip: '{name}' not found]"
        # The pattern is anchored at the start of the directive line, so the
        # match's own leading blanks/tabs are the indentation to preserve.
        directive = m.group(0)
        ind = directive[:len(directive) - len(directive.lstrip(" \t"))]
        return textwrap.indent(body.rstrip("\n"), ind)

    if isinstance(cell, str):
        return _SNIP_LINE.sub(repl, cell)
    # List-of-lines protocol: expand on the joined text, then split again.
    expanded = _SNIP_LINE.sub(repl, "".join(cell))
    return expanded.splitlines(keepends=True)

# --- Tab completion for "# snip " or "%snip " prefixes
def _snip_completer(self, event):
    """Suggest snippet names for a partially typed ``# snip`` / ``%snip`` line.

    Reads ``event.line`` (default ``""``) and, if present,
    ``event.cursor_position`` (default: end of line). Returns the matching
    names from ``STORE``, or an empty list when the text before the cursor
    does not end in a snip directive.
    """
    text = getattr(event, "line", "")
    cursor = getattr(event, "cursor_position", len(text))
    before_cursor = text[:cursor]
    # Group 2 is the partial snippet name typed so far (may be empty).
    hit = re.search(r"(#\s*snip|%snip)\s+([A-Za-z0-9_.\-/]*)$", before_cursor)
    if hit:
        return STORE.names(hit.group(2))
    return []

@magics_class
class SnipMagics(Magics):
    """Line magics for listing and printing stored snippets."""

    @line_magic("snip")
    def snip(self, line):
        """Usage: %snip          -> prints every snippet name, one per line
                  %snip <name>   -> prints the snippet body

        In the classic Notebook UI the text of the selected cell is also
        replaced with the snippet body; other front ends ignore that step.
        """
        import base64  # only needed for the classic-Notebook insertion below

        name = line.strip()
        if not name:
            print("\n".join(STORE.names()))
            return
        body = STORE.get(name)
        if body is None:
            print(f"[snip: '{name}' not found]")
            return
        # Print for copy/paste
        print(body)
        # Optional: try to replace current cell in classic Notebook UI
        try:
            # atob() decodes base64, so the payload must be base64 (not hex);
            # the decoded bytes are then read as UTF-8 so non-ASCII snippet
            # text survives the round trip.
            payload = base64.b64encode(body.encode("utf-8")).decode("ascii")
            # NOTE(review): get_selected_cell() returns whichever cell is
            # selected when the script runs — confirm that is the cell that
            # issued %snip in your front end.
            js = Javascript("""
                if (typeof Jupyter !== 'undefined') {
                  var cell = Jupyter.notebook.get_selected_cell();
                  var bytes = Uint8Array.from(atob('%s'), function (c) { return c.charCodeAt(0); });
                  cell.set_text(new TextDecoder('utf-8').decode(bytes));
                }
            """ % payload)
            display(js)
        except Exception:
            # Insertion is best-effort; the body has already been printed.
            pass

def load_ipython_extension(ip):
    """Wire the extension into the IPython shell ``ip`` (run by ``%load_ext snipline``).

    Registers, in order: the ``# snip`` input transformer, the snippet-name
    completer hook, and the ``%snip`` line magic.
    """
    # Expand "# snip ..." lines before execution
    ip.input_transformers_post.append(_expand_snips)
    # Add completer (works in Jupyter and VS Code's Jupyter)
    try:
        # re_key='.*' asks for the hook to be consulted for every input line.
        ip.set_hook('complete_command', _snip_completer, re_key='.*')
    except Exception:
        # Fallback: older IPython versions
        # NOTE(review): the matcher inserted here always returns [], so on
        # this path no snippet names are actually suggested.
        completer = getattr(ip, 'Completer', None)
        if completer and hasattr(completer, 'matchers'):
            completer.matchers.insert(0, lambda self, text: [])
    ip.register_magics(SnipMagics)

def unload_ipython_extension(ip):
    """Remove the ``# snip`` input transformer from the shell ``ip``.

    IPython calls this for ``%unload_ext`` and before re-loading on
    ``%reload_ext``; without the removal every reload would leave the old
    transformer registered and append another one. The completer hook and
    the ``%snip`` magic are left in place (best-effort cleanup).
    """
    post = getattr(ip, "input_transformers_post", None)
    if post is not None and _expand_snips in post:
        post.remove(_expand_snips)
```

---

### 3) Use it in Jupyter/VS Code’s Jupyter

In a cell:

```python
%load_ext snipline   # run once per session (or auto-load, see below)
```

Now you can:

* Type: `# snip pandas.null_audit` then press **Tab** after `# snip ` to see snippet **name suggestions**.
* Run the cell: that line is **replaced** with the snippet’s code before execution.
* Or: `%snip pandas.null_audit` to print/insert the snippet body.

**Example:**

```python
# snip pandas.null_audit
```

…expands to:

```python
na = df.isna().sum().sort_values(ascending=False)
print(na[na>0].head(10))
```

---

### 4) Auto-load every session (optional)

Create (or edit) `~/.ipython/profile_default/ipython_config.py`:

```python
c = get_config()
c.InteractiveShellApp.extensions = ['snipline']
```

---

### Notes & limits

* **Where suggestions work:** in **Jupyter contexts** (classic, Lab, or VS Code’s Jupyter) because completion comes from the kernel. In a plain `.py` file editor view, completion is handled by the editor’s language server, so you won’t get the `# snip` suggestions there. For `.py` editing, pair this with **editor snippets** (VS Code/Windsurf) or a text expander (Espanso).
* **Variables/placeholders:** You can embed `${1:placeholder}` in snippets for editor tabstops (useful when you paste into VS Code). The transformer leaves those as-is; your editor’s snippet engine will handle them if present.
* **Versioning:** Keep `~/.snippets/` under Git so you can sync across machines. The loader auto-reloads when YAML files change.

If you want, tell me your top 10 snippet names and I‚Äôll generate the YAML ready to drop into `~/.snippets/snippets.yaml`.


----



----


Heck yes—let’s make **snipples** your cross-workspace snippet engine. Below are two files:

1. **`snipples.py`** – an IPython/Jupyter extension so you can type `# snipples <name>` in a cell and it expands before execution (with tab-complete for names).
2. **`~/.snipples/snippets.yaml`** – a chunky library covering the Telco Churn project end-to-end (ingest → EDA → cleaning → features → ML → explainability → dbt/SQL → ops).

---

## 1) `snipples.py` (drop anywhere on your PYTHONPATH)

```python
# snipples.py
from __future__ import annotations
import os, re, glob, yaml, textwrap
from pathlib import Path
from typing import Dict, Optional
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import Javascript, display

SNIPS_DIR = Path(os.environ.get("SNIPPLES_DIR", "~/.snipples")).expanduser()

class SnipStore:
    """In-memory index of the snippets stored as YAML files in one directory.

    Every file matching ``*.y*ml`` is expected to contain a top-level mapping
    of snippet name to snippet body. Files are read in sorted path order and
    merged into ``cache``, so for duplicate names the later path wins. The
    directory is created on (re)load if it does not exist.
    """

    def __init__(self, directory: Path):
        self.dir = Path(directory)
        self.cache: Dict[str, str] = {}      # snippet name -> snippet body
        self._mtimes: Dict[str, float] = {}  # YAML path -> mtime seen at last reload
        self.reload()

    def _scan(self) -> Dict[str, float]:
        """Return ``{path: mtime}`` for the YAML files currently on disk."""
        found: Dict[str, float] = {}
        for yml in sorted(self.dir.glob("*.y*ml")):
            try:
                found[str(yml)] = yml.stat().st_mtime
            except OSError:
                # The file vanished between glob() and stat(); treat as absent.
                continue
        return found

    def reload(self):
        """Discard the cache and re-read every YAML file in the directory."""
        self.cache.clear()
        self._mtimes.clear()
        self.dir.mkdir(parents=True, exist_ok=True)
        for yml, mtime in self._scan().items():
            with open(yml, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            # A file whose top level is not a mapping defines no snippets;
            # it is still tracked so it does not trigger endless reloads.
            if isinstance(data, dict):
                for k, v in data.items():
                    self.cache[str(k)] = str(v)
            self._mtimes[yml] = mtime

    def maybe_reload(self):
        """Reload if any YAML file was added, removed or modified since the last load."""
        # Comparing full snapshots (not just the known files) is what lets
        # newly created and deleted files be noticed.
        if self._scan() != self._mtimes:
            self.reload()

    def names(self, prefix: str = ""):
        """Return the sorted snippet names that start with ``prefix``."""
        self.maybe_reload()
        return sorted([k for k in self.cache if k.startswith(prefix)])

    def get(self, name: str) -> Optional[str]:
        """Return the body of snippet ``name``, or ``None`` if it is unknown."""
        self.maybe_reload()
        return self.cache.get(name)

STORE = SnipStore(SNIPS_DIR)

_SNIPLINE = re.compile(r"^[ \t]*#\s*snipples\s+([A-Za-z0-9_.\-/]+)[ \t]*$", re.MULTILINE)

def _expand_snips(cell):
    """Replace every ``# snipples <name>`` directive with the named snippet's body.

    Args:
        cell: The cell source, either as one string or as a list of lines
            (each with its line ending) — the form IPython 7+ passes to
            functions registered in ``input_transformers_post``.

    Returns:
        The expanded source in the same form as the input. Unknown snippet
        names are replaced with a ``# [snipples: '<name>' not found]`` comment.
    """
    def repl(m):
        name = m.group(1)
        body = STORE.get(name)
        if body is None:
            return f"# [snipples: '{name}' not found]"
        # Preserve indentation level of the directive line: the pattern is
        # anchored at the start of that line, so the match's own leading
        # blanks/tabs are the indentation (the text before the match belongs
        # to the previous line and must not be used).
        directive = m.group(0)
        indent = directive[:len(directive) - len(directive.lstrip(" \t"))]
        return textwrap.indent(body.rstrip("\n"), indent)

    if isinstance(cell, str):
        return _SNIPLINE.sub(repl, cell)
    # List-of-lines protocol: expand on the joined text, then split again.
    expanded = _SNIPLINE.sub(repl, "".join(cell))
    return expanded.splitlines(keepends=True)

def _snipples_completer(self, event):
    """Suggest snippet names for a partially typed ``# snipples`` / ``%snipples`` line.

    Reads ``event.line`` (default ``""``) and, if present,
    ``event.cursor_position`` (default: end of line). Returns the matching
    names from ``STORE``, or an empty list when the text before the cursor
    does not end in a snipples directive.
    """
    text = getattr(event, "line", "")
    cursor = getattr(event, "cursor_position", len(text))
    # Group 2 is the partial snippet name typed so far (may be empty).
    hit = re.search(r"(#\s*snipples|%snipples)\s+([A-Za-z0-9_.\-/]*)$", text[:cursor])
    if hit:
        return STORE.names(hit.group(2))
    return []

@magics_class
class SnipplesMagics(Magics):
    """Line magics for listing and printing stored snippets."""

    @line_magic("snipples")
    def snipples(self, line):
        """%snipples          -> prints every snippet name, one per line
        %snipples <name>   -> prints the snippet body

        In the classic Notebook UI the text of the selected cell is also
        replaced with the snippet body; other front ends ignore that step.
        """
        import base64  # only needed for the classic-Notebook insertion below

        name = line.strip()
        if not name:
            print("\n".join(STORE.names()))
            return
        body = STORE.get(name)
        if body is None:
            print(f"[snipples: '{name}' not found]")
            return
        print(body)
        try:
            # atob() decodes base64, so the payload must be base64 (not hex);
            # the decoded bytes are then read as UTF-8 so non-ASCII snippet
            # text survives the round trip.
            payload = base64.b64encode(body.encode("utf-8")).decode("ascii")
            # NOTE(review): get_selected_cell() returns whichever cell is
            # selected when the script runs — confirm that is the cell that
            # issued %snipples in your front end.
            js = Javascript("""
                if (typeof Jupyter !== 'undefined') {
                  var cell = Jupyter.notebook.get_selected_cell();
                  var bytes = Uint8Array.from(atob('%s'), function (c) { return c.charCodeAt(0); });
                  cell.set_text(new TextDecoder('utf-8').decode(bytes));
                }
            """ % payload)
            display(js)
        except Exception:
            # Insertion is best-effort; the body has already been printed.
            pass

def load_ipython_extension(ip):
    """Wire the extension into the IPython shell ``ip`` (run by ``%load_ext snipples``).

    Registers, in order: the ``# snipples`` input transformer, the
    snippet-name completer hook, and the ``%snipples`` line magic.
    """
    ip.input_transformers_post.append(_expand_snips)
    try:
        # re_key='.*' asks for the hook to be consulted for every input line.
        ip.set_hook('complete_command', _snipples_completer, re_key='.*')
    except Exception:
        # Completion is optional: if the hook cannot be installed the
        # extension still loads, just without name suggestions.
        pass
    ip.register_magics(SnipplesMagics)

def unload_ipython_extension(ip):
    pass
```

**Use:** in a Jupyter cell run once per session:

```python
%load_ext snipples
# then type:  # snipples telco.qa.null_audit   (Tab to see suggestions)
```

To autoload every time, add to `~/.ipython/profile_default/ipython_config.py`:

```python
c = get_config()
c.InteractiveShellApp.extensions = ['snipples']
```

---

## 2) `~/.snipples/snippets.yaml` (Telco Churn end-to-end)

Create the folder and file:

```bash
mkdir -p ~/.snipples
# paste the following into ~/.snipples/snippets.yaml
```

```yaml
# =========================
# Project scaffolding / setup
# =========================
proj.readme.telco: |
  # Telco Churn — Analytics & ML
  ## Stack
  - Python 3.11, pandas, scikit-learn, xgboost, catboost, imbalanced-learn, shap
  - BigQuery + dbt (analytics)
  - Great Expectations or Pandera (ingest DQ)
  - MLflow (experiment tracking)

  ## Structure
  .
  ├─ data/{raw,interim,processed}
  ├─ notebooks/
  ├─ src/telco/...
  ├─ models/ (dbt)
  └─ reports/

  ## Targets
  - Clean EDA + feature marts
  - Baseline + tree models
  - Explainability (SHAP)
  - Reproducible pipelines + tests

env.venv.setup: |
  python3 -m venv .venv
  source .venv/bin/activate
  python -m pip install --upgrade pip
  pip install pandas numpy scikit-learn imbalanced-learn xgboost catboost shap mlflow matplotlib seaborn pandera great-expectations python-dotenv pandas-gbq google-cloud-bigquery pyarrow

nb.header.imports: |
  import os, sys, math, json, textwrap, warnings
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
  from pathlib import Path
  warnings.filterwarnings("ignore")
  pd.set_option("display.max_columns", 100)
  plt.rcParams["figure.figsize"] = (10,5)

# =========================
# Ingest / IO
# =========================
ingest.csv.read_telco: |
  dtype_map = {
      "customerID": "string",
      "gender": "string",
      "SeniorCitizen": "Int64",
      "Partner": "string",
      "Dependents": "string",
      "tenure": "Int64",
      "PhoneService": "string",
      "MultipleLines": "string",
      "InternetService": "string",
      "OnlineSecurity": "string",
      "OnlineBackup": "string",
      "DeviceProtection": "string",
      "TechSupport": "string",
      "StreamingTV": "string",
      "StreamingMovies": "string",
      "Contract": "string",
      "PaperlessBilling": "string",
      "PaymentMethod": "string",
      "MonthlyCharges": "float64",
      "TotalCharges": "string",   # coercion later
      "Churn": "string"
  }
  df = pd.read_csv("data/raw/Telco-Customer-Churn.csv", dtype=dtype_map)
  print(df.shape)

ingest.bigquery.read_table: |
  # pip install pandas-gbq google-cloud-bigquery
  from pandas_gbq import read_gbq
  df = read_gbq("""
      SELECT * FROM `PROJECT.DATASET.telco_customers`
  """, project_id=os.environ.get("GCP_PROJECT"))

save.outputs.standard_paths: |
  Path("data/interim").mkdir(parents=True, exist_ok=True)
  Path("data/processed").mkdir(parents=True, exist_ok=True)
  df.to_csv("data/interim/telco_cleaned.csv", index=False)

# =========================
# Data Quality (ingest)
# =========================
qa.null_audit: |
  na = df.isna().sum().sort_values(ascending=False)
  print("Missing by column:")
  print(na[na>0])

qa.dup_pkey_check: |
  dups = df["customerID"].value_counts()
  print("Dup primary keys:", (dups > 1).sum())

qa.pandera.schema_telco: |
  import pandera as pa
  from pandera import Column, Check
  TelcoSchema = pa.DataFrameSchema({
      "customerID": Column(str, nullable=False),
      "tenure": Column(int, Check.in_range(0, 84), nullable=False),
      "MonthlyCharges": Column(float, Check.in_range(0, 200), nullable=False),
      "TotalCharges": Column(object, nullable=True),  # coerced later
      "Churn": Column(str, Check.isin(["Yes","No"]), nullable=False),
  })
  TelcoSchema.validate(df, lazy=True)

qa.business_rule_totalcharges_tenure: |
  bad = df[df["tenure"].fillna(0) > 0].copy()
  bad = bad[np.abs(pd.to_numeric(bad["TotalCharges"], errors="coerce") - bad["tenure"] * bad["MonthlyCharges"]) > 10]
  print(f"Rows outside $10 tolerance: {len(bad)}")

# =========================
# Cleaning / Imputation
# =========================
clean.totalcharges_fix: |
  df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].astype(str).str.strip(), errors="coerce")
  mask0 = df["TotalCharges"].isna() & (df["tenure"].fillna(0) == 0)
  df.loc[mask0, "TotalCharges"] = 0.0
  mask_other = df["TotalCharges"].isna() & df["tenure"].notna() & df["MonthlyCharges"].notna()
  df.loc[mask_other, "TotalCharges"] = df.loc[mask_other, "tenure"] * df.loc[mask_other, "MonthlyCharges"]
  df["TotalCharges"] = df["TotalCharges"].astype("float64")

clean.category_normalize: |
  yesno = ["Partner","Dependents","PhoneService","PaperlessBilling","Churn"]
  for c in yesno:
      df[c] = df[c].str.strip().str.title()
  df["MultipleLines"]   = df["MultipleLines"].str.strip().str.replace("No phone service","No Phone Service", regex=False)
  internet_cols = ["OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
  for c in internet_cols:
      df[c] = df[c].str.strip().str.replace("No internet service","No Internet Service", regex=False)

# =========================
# EDA
# =========================
eda.quick_overview: |
  print(df.shape)
  display(df.head())
  print(df.describe(include="all").T)

eda.target_balance: |
  ax = (df["Churn"].value_counts(normalize=True)*100).plot.bar()
  ax.set_title("Churn class balance (%)"); plt.show()

eda.corr_heatmap_numeric: |
  num = df.select_dtypes(include=["number"])
  sns.heatmap(num.corr(numeric_only=True), annot=False, linewidths=.5)
  plt.title("Numeric correlations"); plt.show()

eda.churn_rate_by_col_template: |
  col = "Contract"  # <- change me
  rate = (df.groupby(col)["Churn"].apply(lambda s: (s=="Yes").mean()).sort_values()*100)
  print(rate.round(2))

# =========================
# Feature Engineering
# =========================
feat.tenure_buckets: |
  bins = [-1, 0, 6, 12, 24, 48, 84, 999]
  labels = ["0","1-6","7-12","13-24","25-48","49-84","85+"]
  df["tenure_bucket"] = pd.cut(df["tenure"], bins=bins, labels=labels)

feat.boolean_target: |
  df["y"] = (df["Churn"].str.upper() == "YES").astype(int)

feat.split_train_test: |
  from sklearn.model_selection import train_test_split
  target = "y"
  y = df[target]
  feature_drop = ["customerID","Churn","y"]
  X = df.drop(columns=feature_drop, errors="ignore")
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# =========================
# Modeling – shared utilities
# =========================
model.columns_splitter: |
  num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
  cat_cols = X_train.select_dtypes(exclude=["number"]).columns.tolist()
  from sklearn.preprocessing import OneHotEncoder, StandardScaler
  from sklearn.compose import ColumnTransformer
  pre = ColumnTransformer(
      transformers=[
          ("num", StandardScaler(with_mean=False), num_cols),
          ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
      ],
      remainder="drop"
  )

model.metrics_helpers: |
  from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, confusion_matrix
  def print_metrics(y_true, prob, thr=0.5):
      y_pred = (prob >= thr).astype(int)
      print(f"ROC AUC: {roc_auc_score(y_true, prob):.4f}")
      print(f"PR AUC : {average_precision_score(y_true, prob):.4f}")
      print(f"F1     : {f1_score(y_true, y_pred):.4f}")
      print(confusion_matrix(y_true, y_pred))
      print(classification_report(y_true, y_pred, digits=3))

# =========================
# Baseline model (LogReg)
# =========================
model.baseline_logreg: |
  from sklearn.linear_model import LogisticRegression
  from sklearn.pipeline import Pipeline
  clf = Pipeline(steps=[
      ("pre", pre),
      ("lr", LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None))
  ])
  clf.fit(X_train, y_train)
  prob = clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

# =========================
# Tree models (XGBoost / CatBoost)
# =========================
model.xgboost_cv: |
  import xgboost as xgb
  from sklearn.model_selection import StratifiedKFold
  from sklearn.pipeline import Pipeline
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  clf = Pipeline(steps=[
      ("pre", pre),
      ("xgb", xgb.XGBClassifier(
          n_estimators=600, max_depth=6, learning_rate=0.05,
          subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
          tree_method="hist", reg_lambda=1.0, n_jobs=-1
      ))
  ])
  aucs = []
  for tr, va in skf.split(X_train, y_train):
      clf.fit(X_train.iloc[tr], y_train.iloc[tr])
      p = clf.predict_proba(X_train.iloc[va])[:,1]
      aucs.append(roc_auc_score(y_train.iloc[va], p))
  print("CV ROC AUC:", np.mean(aucs).round(4), "+/-", np.std(aucs).round(4))
  clf.fit(X_train, y_train)
  prob = clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

model.catboost_simple: |
  from catboost import CatBoostClassifier
  from sklearn.model_selection import train_test_split
  # CatBoost consumes raw categories directly, so no one-hot encoding here.
  Xc = df.drop(columns=["customerID","Churn","y"], errors="ignore").copy()
  # Treat every non-numeric column as categorical. Checking dtype == "object"
  # misses them when the data was loaded with pandas "string" dtypes (see
  # ingest.csv.read_telco) or holds a "category" column such as tenure_bucket.
  cat_cols = Xc.select_dtypes(exclude=["number"]).columns.tolist()
  # CatBoost needs categorical values as plain str without NaN / pd.NA.
  for c in cat_cols:
      Xc[c] = Xc[c].astype("object").fillna("missing").astype(str)
  Xtr, Xte, ytr, yte = train_test_split(Xc, df["y"], test_size=0.2, stratify=df["y"], random_state=42)
  cat = CatBoostClassifier(
      depth=6, iterations=1500, learning_rate=0.03, loss_function="Logloss",
      eval_metric="AUC", verbose=200, random_seed=42, auto_class_weights="Balanced"
  )
  # NOTE: the test split doubles as eval_set for best-iteration selection, so
  # the metrics below are optimistic; carve out a validation split for real use.
  cat.fit(Xtr, ytr, cat_features=cat_cols, eval_set=(Xte, yte), use_best_model=True)
  prob = cat.predict_proba(Xte)[:,1]
  print_metrics(yte, prob)

# =========================
# Imbalance strategies (optional)
# =========================
imb.smote_pipeline_xgb: |
  from imblearn.pipeline import Pipeline as ImbPipeline
  from imblearn.over_sampling import SMOTE
  import xgboost as xgb
  imb_clf = ImbPipeline(steps=[
      ("pre", pre),
      ("smote", SMOTE(random_state=42)),
      ("xgb", xgb.XGBClassifier(
          n_estimators=500, max_depth=5, learning_rate=0.05,
          subsample=0.9, colsample_bytree=0.9, eval_metric="logloss", n_jobs=-1
      ))
  ])
  imb_clf.fit(X_train, y_train)
  prob = imb_clf.predict_proba(X_test)[:,1]
  print_metrics(y_test, prob)

# =========================
# Threshold tuning & calibration
# =========================
eval.threshold_opt_pr: |
  from sklearn.metrics import precision_recall_curve
  pr, rc, thr = precision_recall_curve(y_test, prob)
  f = 2*pr*rc/(pr+rc+1e-9)
  best = np.nanargmax(f)
  print("Best F1 threshold:", thr[best].round(4), "F1:", f[best].round(4))

eval.calibration_plot: |
  from sklearn.calibration import calibration_curve
  prob_true, prob_pred = calibration_curve(y_test, prob, n_bins=10)
  plt.plot(prob_pred, prob_true, marker="o"); plt.plot([0,1],[0,1],"--")
  plt.title("Calibration"); plt.xlabel("Predicted"); plt.ylabel("Observed"); plt.show()

# =========================
# SHAP explainability
# =========================
exp.shap_tree: |
  import shap
  shap.initjs()
  # Works with tree-based models like XGBoost/CatBoost
  booster = clf.named_steps["xgb"] if "xgb" in dict(clf.steps) else None
  if booster is None:
      raise RuntimeError("This snippet expects a fitted Pipeline with step 'xgb'.")
  Xs = clf.named_steps["pre"].transform(X_test)
  explainer = shap.TreeExplainer(booster)
  shap_values = explainer.shap_values(Xs)
  shap.summary_plot(shap_values, Xs)

exp.shap_linear: |
  import shap
  linear = clf.named_steps.get("lr", None)
  if linear is None:
      raise RuntimeError("This snippet expects a fitted Pipeline with step 'lr'.")
  Xs = clf.named_steps["pre"].transform(X_test)
  # `feature_dependence="independent"` is the legacy spelling; current SHAP
  # names the argument `feature_perturbation` and the mode "interventional".
  explainer = shap.LinearExplainer(linear, Xs, feature_perturbation="interventional")
  shap_values = explainer.shap_values(Xs)
  shap.summary_plot(shap_values, Xs)

# =========================
# MLflow tracking (optional)
# =========================
mlflow.start_run_and_log: |
  import mlflow, mlflow.sklearn
  mlflow.set_experiment("telco-churn")
  with mlflow.start_run(run_name="xgb_baseline"):
      mlflow.log_params({"n_estimators":600,"max_depth":6,"lr":0.05})
      mlflow.log_metric("roc_auc", roc_auc_score(y_test, prob))
      mlflow.sklearn.log_model(clf, "model")

# =========================
# SQL (analysis & sanity)
# =========================
sql.churn_rate_by_segment: |
  SELECT Contract,
         ROUND(100 * AVG(CASE WHEN Churn='Yes' THEN 1 ELSE 0 END), 2) AS churn_pct,
         COUNT(*) AS n
  FROM analytics.telco_customers
  GROUP BY Contract
  ORDER BY churn_pct DESC;

sql.monthly_retention: |
  -- Requires start_date and end_date fields for each subscription
  -- The month spine must hold ONE row per month: stepping by day and then
  -- truncating yields ~30 duplicate rows per month, which multiplies every
  -- count after the CROSS JOIN.
  WITH months AS (
    SELECT DATE_TRUNC(d, MONTH) AS month
    FROM UNNEST(GENERATE_DATE_ARRAY('2017-01-01','2018-12-31', INTERVAL 1 MONTH)) AS d
  ),
  active AS (
    SELECT m.month,
           COUNTIF(t.start_date <= m.month AND (t.end_date IS NULL OR t.end_date >= m.month)) AS active_customers
    FROM months m CROSS JOIN analytics.telco_subscriptions t
    GROUP BY m.month
  )
  SELECT * FROM active ORDER BY month;

# =========================
# dbt scaffolds
# =========================
# NOTE: no trailing semicolon — dbt wraps the model's SELECT in its own
# CREATE TABLE ... AS (...) statement, so a ';' inside it is a syntax error.
dbt.model_scaffold: |
  {{ config(materialized='table') }}
  WITH src AS (
    SELECT * FROM {{ ref('stg_telco_customers') }}
  )
  SELECT * FROM src

dbt.schema_tests_min: |
  version: 2
  models:
    - name: stg_telco_customers
      columns:
        - name: customerID
          tests: [not_null, unique]
        - name: tenure
          tests:
            - dbt_expectations.expect_column_values_to_be_between:
                min_value: 0
                max_value: 84
        - name: Churn
          tests:
            - accepted_values:
                values: ['Yes','No']
      tests:
        - dbt_utils.expression_is_true:
            expression: "case when tenure = 0 then coalesce(TotalCharges,0) = 0 else true end"
        - dbt_utils.expression_is_true:
            expression: "abs(coalesce(TotalCharges,0) - coalesce(tenure,0)*coalesce(MonthlyCharges,0)) <= 10"
            where: "tenure > 0"

dbt.freshness_recency: |
  models:
    - name: stg_telco_customers
      tests:
        - dbt_utils.recency:
            field: ingested_at
            datepart: hour
            interval: 24

# =========================
# Reporting / export
# =========================
report.top_drivers_summary: |
  # After SHAP: simple text summary
  import numpy as np
  def top_features(shap_vals, k=10):
      imp = np.abs(shap_vals).mean(axis=0)
      idx = np.argsort(imp)[::-1][:k]
      return list(zip(idx, imp[idx]))
  print("Top features by mean |SHAP| (indices):")
  print(top_features(shap_values, 10))

export.predictions_csv: |
  out = X_test.copy()
  out["prob_churn"] = prob
  out["y_true"] = y_test.values
  out.to_csv("reports/predictions.csv", index=False)
  print("Saved reports/predictions.csv")
```

---

### Quick start flow (what to run in a fresh notebook)

```python
%load_ext snipples

# snipples nb.header.imports
# snipples ingest.csv.read_telco
# snipples qa.null_audit
# snipples clean.totalcharges_fix
# snipples clean.category_normalize
# snipples feat.boolean_target
# snipples feat.tenure_buckets
# snipples feat.split_train_test
# snipples model.columns_splitter
# snipples model.metrics_helpers
# snipples model.baseline_logreg       # => baseline
# snipples model.xgboost_cv            # => stronger model
# snipples eval.threshold_opt_pr
# snipples exp.shap_tree               # if using the XGB pipeline
# snipples export.predictions_csv
```

# Generate **workspace snippets** (VS Code/Windsurf JSON) 

    Here are two drop-in artifacts for Windsurf/VS Code:

---

# 1) Workspace snippets

Save as **`.vscode/snippets.code-snippets`** in your repo.

```json
{
  "Snipples: expand line": {
    "prefix": "snipples",
    "description": "Insert a # snipples directive (works in Jupyter cells via snipples.py)",
    "body": ["# snipples ${1:namespace.snippet_name}"]
  },

  "Imports: data analyst notebook header": {
    "prefix": "py-imports-notebook",
    "scope": "python",
    "description": "Standard imports for analysis notebooks",
    "body": [
      "import os, sys, math, json, textwrap, warnings",
      "import numpy as np",
      "import pandas as pd",
      "import matplotlib.pyplot as plt",
      "import seaborn as sns",
      "from pathlib import Path",
      "warnings.filterwarnings('ignore')",
      "pd.set_option('display.max_columns', 100)",
      "plt.rcParams['figure.figsize'] = (10,5)"
    ]
  },

  "IO: read Telco CSV with dtypes": {
    "prefix": "py-read-telco",
    "scope": "python",
    "description": "Read IBM Telco Churn CSV with explicit dtypes",
    "body": [
      "dtype_map = {",
      "  'customerID':'string','gender':'string','SeniorCitizen':'Int64','Partner':'string','Dependents':'string',",
      "  'tenure':'Int64','PhoneService':'string','MultipleLines':'string','InternetService':'string','OnlineSecurity':'string',",
      "  'OnlineBackup':'string','DeviceProtection':'string','TechSupport':'string','StreamingTV':'string','StreamingMovies':'string',",
      "  'Contract':'string','PaperlessBilling':'string','PaymentMethod':'string','MonthlyCharges':'float64','TotalCharges':'string','Churn':'string'",
      "}",
      "df = pd.read_csv('${1:data/raw/Telco-Customer-Churn.csv}', dtype=dtype_map)",
      "print(df.shape)"
    ]
  },

  "Clean: TotalCharges fix (tenure==0 => 0 else tenure*MonthlyCharges)": {
    "prefix": "py-clean-totalcharges",
    "scope": "python",
    "description": "Coerce TotalCharges to numeric and impute correctly",
    "body": [
      "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].astype(str).str.strip(), errors='coerce')",
      "mask0 = df['TotalCharges'].isna() & (df['tenure'].fillna(0) == 0)",
      "df.loc[mask0, 'TotalCharges'] = 0.0",
      "mask_other = df['TotalCharges'].isna() & df['tenure'].notna() & df['MonthlyCharges'].notna()",
      "df.loc[mask_other, 'TotalCharges'] = df.loc[mask_other, 'tenure'] * df.loc[mask_other, 'MonthlyCharges']",
      "df['TotalCharges'] = df['TotalCharges'].astype('float64')"
    ]
  },

  "EDA: churn class balance plot": {
    "prefix": "py-eda-balance",
    "scope": "python",
    "description": "Bar chart of churn class balance",
    "body": [
      "ax = (df['Churn'].value_counts(normalize=True)*100).plot.bar()",
      "ax.set_title('Churn class balance (%)'); plt.show()"
    ]
  },

  "Feature: target + tenure buckets": {
    "prefix": "py-feat-target-buckets",
    "scope": "python",
    "description": "Create y and tenure_bucket features",
    "body": [
      "df['y'] = (df['Churn'].str.upper() == 'YES').astype(int)",
      "bins = [-1,0,6,12,24,48,84,999]",
      "labels = ['0','1-6','7-12','13-24','25-48','49-84','85+']",
      "df['tenure_bucket'] = pd.cut(df['tenure'], bins=bins, labels=labels)"
    ]
  },

  "Split: train/test": {
    "prefix": "py-split",
    "scope": "python",
    "description": "Train/test split with drop of ID/label columns",
    "body": [
      "from sklearn.model_selection import train_test_split",
      "target = 'y'",
      "y = df[target]",
      "X = df.drop(columns=['customerID','Churn','y'], errors='ignore')",
      "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)"
    ]
  },

  "Preprocess: ColumnTransformer (num scale + OHE cat)": {
    "prefix": "py-pre-coltx",
    "scope": "python",
    "description": "StandardScaler for numeric and OneHotEncoder for categories",
    "body": [
      "num_cols = X_train.select_dtypes(include=['number']).columns.tolist()",
      "cat_cols = X_train.select_dtypes(exclude=['number']).columns.tolist()",
      "from sklearn.preprocessing import OneHotEncoder, StandardScaler",
      "from sklearn.compose import ColumnTransformer",
      "pre = ColumnTransformer([",
      "  ('num', StandardScaler(with_mean=False), num_cols),",
      "  ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), cat_cols)",
      "])"
    ]
  },

  "Model: Logistic Regression baseline": {
    "prefix": "py-model-logreg",
    "scope": "python",
    "description": "Baseline classifier pipeline + quick metrics",
    "body": [
      "from sklearn.pipeline import Pipeline",
      "from sklearn.linear_model import LogisticRegression",
      "from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report, confusion_matrix",
      "def print_metrics(y_true, prob, thr=0.5):",
      "  y_pred = (prob >= thr).astype(int)",
      "  print(f'ROC AUC: {roc_auc_score(y_true, prob):.4f}')",
      "  print(f'PR AUC : {average_precision_score(y_true, prob):.4f}')",
      "  print(f'F1     : {f1_score(y_true, y_pred):.4f}')",
      "  print(confusion_matrix(y_true, y_pred))",
      "  print(classification_report(y_true, y_pred, digits=3))",
      "clf = Pipeline([('pre', pre), ('lr', LogisticRegression(max_iter=200, class_weight='balanced'))])",
      "clf.fit(X_train, y_train)",
      "prob = clf.predict_proba(X_test)[:,1]",
      "print_metrics(y_test, prob)"
    ]
  },

  "Model: XGBoost pipeline (hist)": {
    "prefix": "py-model-xgb",
    "scope": "python",
    "description": "XGBoost classifier with ColumnTransformer preprocessing",
    "body": [
      "import xgboost as xgb",
      "from sklearn.pipeline import Pipeline",
      "from sklearn.metrics import roc_auc_score",
      "clf = Pipeline([",
      "  ('pre', pre),",
      "  ('xgb', xgb.XGBClassifier(n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, eval_metric='logloss', tree_method='hist', n_jobs=-1))",
      "])",
      "clf.fit(X_train, y_train)",
      "prob = clf.predict_proba(X_test)[:,1]",
      "print_metrics(y_test, prob)"
    ]
  },

  "Explain: SHAP (tree models)": {
    "prefix": "py-shap-tree",
    "scope": "python",
    "description": "SHAP summary for tree-based model within a Pipeline",
    "body": [
      "import shap",
      "shap.initjs()",
      "booster = clf.named_steps.get('xgb')",
      "Xs = clf.named_steps['pre'].transform(X_test)",
      "explainer = shap.TreeExplainer(booster)",
      "shap_values = explainer.shap_values(Xs)",
      "shap.summary_plot(shap_values, Xs)"
    ]
  },

  "Quality: Pandera mini schema": {
    "prefix": "py-pandera-telco",
    "scope": "python",
    "description": "Quick Pandera schema to validate key columns",
    "body": [
      "import pandera as pa",
      "from pandera import Column, Check",
      "TelcoSchema = pa.DataFrameSchema({",
      "  'customerID': Column(str, nullable=False),",
      "  'tenure': Column(int, Check.in_range(0,84), nullable=False),",
      "  'MonthlyCharges': Column(float, Check.in_range(0,200), nullable=False),",
      "  'TotalCharges': Column(float, nullable=False),",
      "  'Churn': Column(str, Check.isin(['Yes','No']), nullable=False)",
      "})",
      "TelcoSchema.validate(df, lazy=True)"
    ]
  },

  "dbt: model scaffold": {
    "prefix": "dbt-model",
    "scope": "sql",
    "description": "dbt table model scaffold",
    "body": [
      "{{ config(materialized='table') }}",
      "WITH src AS (",
      "  SELECT * FROM {{ ref('${1:stg_telco_customers}') }}",
      ")",
      "SELECT * FROM src"
    ]
  },

  "SQL: churn by segment": {
    "prefix": "sql-churn-seg",
    "scope": "sql",
    "description": "Segment churn rates by Contract",
    "body": [
      "SELECT Contract,",
      "       ROUND(100 * AVG(CASE WHEN Churn='Yes' THEN 1 ELSE 0 END), 2) AS churn_pct,",
      "       COUNT(*) AS n",
      "FROM ${1:analytics.telco_customers}",
      "GROUP BY Contract",
      "ORDER BY churn_pct DESC;"
    ]
  }
}
```

---
># Tiny **Makefile** to lint/test/run your dbt & notebooks?

# 2) Makefile

Save as **`Makefile`** at the repo root. It’s opinionated but tidy for AE/ML + dbt.

```make
# -------- Config --------
PY          ?= python3
VENV        ?= .venv
ACTIVATE    = . $(VENV)/bin/activate
REQS        ?= requirements.txt

DBT_TARGET  ?= dev
DBT_PROJ    ?= .           # path to dbt project (has dbt_project.yml)
RAW_CSV     ?= data/raw/Telco-Customer-Churn.csv

# -------- Helpers --------
.PHONY: help
help:
	@echo "Common targets:"
	@echo "  make venv           # create venv and install requirements"
	@echo "  make install        # install/upgrade packages in venv"
	@echo "  make lint           # ruff lint + format check"
	@echo "  make fmt            # ruff format"
	@echo "  make test           # run pytest"
	@echo "  make nb-run         # run notebooks with papermill (paramizable)"
	@echo "  make dbt-deps/run/test/freshness  # dbt workflow"
	@echo "  make qa-quick       # quick CSV sanity checks (row count, nulls, dup IDs)"
	@echo "  make clean          # remove build artifacts"

# -------- Environment --------
$(VENV):
	$(PY) -m venv $(VENV)
	$(ACTIVATE) && python -m pip install --upgrade pip

.PHONY: venv
venv: $(VENV) install

.PHONY: install
install:
	@if [ -f "$(REQS)" ]; then \
		$(ACTIVATE) && pip install -r $(REQS); \
	else \
		echo "No requirements.txt found; installing a sane default set..."; \
		$(ACTIVATE) && pip install pandas numpy scikit-learn imbalanced-learn xgboost catboost shap mlflow matplotlib seaborn pandera great-expectations jupyter papermill ruff pytest dbt-bigquery pandas-gbq google-cloud-bigquery pyarrow; \
	fi

# -------- Lint & Test --------
.PHONY: lint
lint:
	$(ACTIVATE) && ruff check .

.PHONY: fmt
fmt:
	$(ACTIVATE) && ruff format .

.PHONY: test
test:
	$(ACTIVATE) && pytest -q

# -------- Notebooks (papermill) --------
# Usage: make nb-run NB=notebooks/01_eda.ipynb OUT=reports/01_eda.out.ipynb
NB ?= notebooks/01_eda.ipynb
OUT ?= reports/01_eda.out.ipynb
.PHONY: nb-run
nb-run:
	$(ACTIVATE) && papermill $(NB) $(OUT)

# -------- Quick CSV QA (no dbt) --------
# The Python source lives in a make variable and reaches `python -c` through
# the environment. Each recipe line runs in its own shell, so a multi-line
# heredoc inside a recipe cannot work (and its un-tabbed lines stop make with
# "missing separator"). The CSV path is passed via the RAW_CSV env variable.
define QA_QUICK_PY
import os
import pandas as pd
path = os.environ["RAW_CSV"]
df = pd.read_csv(path)
print("Shape:", df.shape)
print("Nulls (top 10):")
print(df.isna().sum().sort_values(ascending=False).head(10))
if "customerID" in df.columns:
    dup = (df["customerID"].value_counts() > 1).sum()
    print("Duplicate customerID:", dup)
endef
export QA_QUICK_PY

.PHONY: qa-quick
qa-quick:
	@echo "QA: $(RAW_CSV)"
	$(ACTIVATE) && RAW_CSV="$(RAW_CSV)" python -c "$$QA_QUICK_PY"

# -------- dbt workflow --------
.PHONY: dbt-deps
dbt-deps:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt deps

.PHONY: dbt-run
dbt-run:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt run --target $(DBT_TARGET)

.PHONY: dbt-test
dbt-test:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt test --target $(DBT_TARGET)

.PHONY: dbt-freshness
dbt-freshness:
	$(ACTIVATE) && cd $(DBT_PROJ) && dbt source freshness --target $(DBT_TARGET)

# -------- Clean --------
# Recipes run under /bin/sh, where `**` is not a recursive glob, so nested
# __pycache__ directories are removed with find instead.
.PHONY: clean
clean:
	rm -rf $(VENV) .pytest_cache .ruff_cache
	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
	find . -name "*.ipynb_checkpoints" -type d -prune -exec rm -rf {} +
```

---

## How to wire this up fast

1. **Put your extension + snippets in place**

```bash
mkdir -p ~/.snipples
# (Paste your large ~/.snipples/snippets.yaml we created earlier)
# Put snipples.py somewhere on PYTHONPATH, or `pip install -e .` if you package it.
```

In notebooks:

```python
%load_ext snipples
# then type:  # snipples model.xgboost_cv   (Tab to browse)
```

2. **Add the workspace snippets**

* Create `.vscode/snippets.code-snippets` and paste the JSON above.

3. **Makefile**

```bash
make venv
make qa-quick
make dbt-deps dbt-run dbt-test
```

---

># **generate a tiny `requirements.txt` and a `tasks.json` (Windsurf/VS Code tasks that wrap `make` targets so you can ⌘⇧B to run “DBT: test”)**



># CLAUDE SUGGESTION: CODE LIBRARY

Great question! Code organization and storage strategy is crucial for professional development and portfolio presentation. Here are the most appropriate approaches for different aspects of your projects:

## 🗂️ **Repository Structure & Organization**

### **Primary Code Storage: GitHub**
```
your-username/
├── telco-churn-analysis-level-0/
│   ├── README.md
│   ├── requirements.txt
│   ├── src/
│   │   ├── __init__.py
│   │   ├── data_loader.py
│   │   ├── analysis.py
│   │   └── utils.py
│   ├── notebooks/
│   │   └── exploratory_analysis.ipynb
│   ├── data/
│   │   └── telco.csv
│   ├── docs/
│   │   ├── methodology.md
│   │   └── findings.md
│   ├── tests/
│   │   └── test_analysis.py
│   └── results/
│       ├── figures/
│       └── reports/
```

### **Multi-Level Portfolio Structure**
```
data-analytics-journey/
├── README.md (Portfolio overview)
├── level-00-bare-bones/
├── level-01-structured-eda/
├── level-02-data-cleaning/
├── ...
├── level-10-enterprise/
├── docs/
│   ├── learning-progression.md
│   ├── technologies-mastered.md
│   └── business-impact.md
└── portfolio-assets/
    ├── presentations/
    ├── publications/
    └── certifications/
```

## 💻 **Code Workflow Storage Options**

### **1. GitHub (Primary - Essential)**
**Best for:**
- Version control and collaboration
- Portfolio presentation to employers
- Open source contributions
- CI/CD integration

**Structure:**
```
├── .github/
│   ├── workflows/
│   │   └── ci.yml
│   └── ISSUE_TEMPLATE/
├── src/
├── tests/
├── docs/
└── examples/
```

### **2. Jupyter Notebooks (Development)**
**Best for:**
- Exploratory analysis documentation
- Step-by-step methodology explanation
- Interactive presentations
- Educational content

**Organization:**
```
notebooks/
├── 01-data-exploration.ipynb
├── 02-data-cleaning.ipynb
├── 03-feature-engineering.ipynb
├── 04-modeling.ipynb
└── 05-results-analysis.ipynb
```

### **3. Professional Platforms**

#### **Kaggle Notebooks**
```python
# Kaggle-optimized structure
"""
TELCO CUSTOMER CHURN - LEVEL 0 ANALYSIS
=======================================
Author: Your Name
Created: Date
Updated: Date

Table of Contents:
1. Introduction & Business Problem
2. Data Loading & Initial Exploration  
3. Data Quality Assessment
4. Key Insights Discovery
5. Business Recommendations
6. Technical Implementation
7. Next Steps & Learning Journey
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')

# Load data
df = pd.read_csv('../input/telco-customer-churn/telco.csv')
```

#### **Google Colab (Shareable)**
```python
# Mount Google Drive for data persistence
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
!pip install -q pandas numpy matplotlib seaborn scikit-learn

# Professional notebook structure with clear sections
```

## 🏗️ **Advanced Code Organization (Level 5+)**

### **Modular Python Package Structure**
```
telco_churn_analysis/
├── setup.py
├── pyproject.toml
├── telco_churn/
│   ├── __init__.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── loader.py
│   │   ├── cleaner.py
│   │   └── validator.py
│   ├── features/
│   │   ├── __init__.py
│   │   ├── engineering.py
│   │   └── selection.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── training.py
│   │   └── evaluation.py
│   ├── visualization/
│   │   ├── __init__.py
│   │   └── plots.py
│   └── utils/
│       ├── __init__.py
│       └── helpers.py
├── tests/
├── docs/
├── examples/
└── data/
```

### **Configuration Management**
```python
# config.py
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Config:
    """Central project settings: data locations, split parameters, output folders.

    All values are plain dataclass fields with defaults, so a caller can
    override any of them per run, e.g. ``Config(DATA_DIR=Path("/tmp/data"))``.
    """

    # Data paths
    DATA_DIR: Path = Path("data")
    RAW_DATA_FILE: str = "telco.csv"
    PROCESSED_DATA_FILE: str = "telco_processed.csv"
    
    # Model parameters
    TEST_SIZE: float = 0.2    # presumably the held-out fraction for a train/test split
    RANDOM_STATE: int = 42    # presumably the RNG seed used for reproducibility
    
    # Output paths
    RESULTS_DIR: Path = Path("results")
    FIGURES_DIR: Path = Path("results/figures")
    REPORTS_DIR: Path = Path("results/reports")

    @property
    def data_path(self) -> Path:
        """Full path of the raw data file (``DATA_DIR / RAW_DATA_FILE``).

        Consumers such as ``ChurnAnalyzer.load_data`` read ``config.data_path``;
        deriving it here keeps the directory and file name as the single
        source of truth. Being a property (not an annotated attribute), it
        does not become a dataclass field or an ``__init__`` argument.
        """
        return Path(self.DATA_DIR) / self.RAW_DATA_FILE
```

## 📊 **Documentation & Workflow Storage**

### **README.md Template**
````markdown
# Level X: Telco Customer Churn Analysis

## Quick Start
```bash
pip install -r requirements.txt
python src/analysis.py
```

## Project Structure
- `src/`: Core analysis code
- `notebooks/`: Jupyter exploration
- `data/`: Dataset files  
- `results/`: Output artifacts
- `tests/`: Unit tests
- `docs/`: Documentation

## Key Findings
- Finding 1: Contract type drives churn
- Finding 2: Payment method indicates engagement  
- Finding 3: Premium services show paradox

## Business Impact
- $2.4M annual revenue at risk identified
- 3 strategic initiatives recommended
- Clear ROI projections provided

## Technologies Used
- Python, Pandas, NumPy
- Matplotlib, Seaborn
- Scikit-learn (for advanced levels)

## Next Steps
See Level X+1 for progression...
````

### **Code Documentation Standards**
```python
def analyze_churn_by_contract(df: pd.DataFrame) -> pd.Series:
    """
    Compute the churn rate of every contract type.

    Rows are grouped by the 'Contract' column; within each group the rate is
    the share of rows whose 'Churn' value equals the string 'Yes'.

    Args:
        df (pd.DataFrame): Customer dataset with 'Contract' and 'Churn' columns

    Returns:
        pd.Series: Churn rate per contract type, highest rate first

    Example:
        >>> churn_rates = analyze_churn_by_contract(df)
        >>> print(churn_rates)
        Month-to-month    0.427
        One year          0.113
        Two year          0.028

    Business Impact:
        In the example above, month-to-month customers churn at roughly 15x
        the rate of two-year contract customers, which makes contract type
        the most actionable lever for retention strategy.
    """
    def _share_churned(churn_flags):
        # Mean of a boolean mask == fraction of rows that are 'Yes'
        return (churn_flags == 'Yes').mean()

    churn_by_contract = df.groupby('Contract')['Churn']
    rates = churn_by_contract.apply(_share_churned)
    return rates.sort_values(ascending=False)
```

## 🔧 **Workflow Management Tools**

### **For Individual Projects (Levels 0-3)**
- **Primary**: GitHub repository per level
- **Development**: Jupyter notebooks + VS Code
- **Documentation**: Markdown files + README
- **Sharing**: Kaggle notebooks for community

### **For Advanced Projects (Levels 4-7)**
- **Code**: Modular Python packages
- **Environment**: Poetry/Conda for dependency management
- **Testing**: pytest framework
- **Documentation**: Sphinx for API docs
- **Workflow**: GitHub Actions for CI/CD

### **For Enterprise Projects (Levels 8-10)**
- **Infrastructure**: Infrastructure as Code (Terraform)
- **Containerization**: Docker + Kubernetes
- **Orchestration**: Apache Airflow
- **Monitoring**: Logging and metrics collection
- **Deployment**: Blue-green deployment strategies

## 💡 **Best Practices by Level**

### **Level 0-2: Foundation**
```python
# Simple, clear, well-commented code
import sys

import pandas as pd

# Load data with error handling
try:
    df = pd.read_csv('data/telco.csv')
except FileNotFoundError:
    print("❌ Data file not found")
    # sys.exit is the supported way to stop a script; the bare exit() helper
    # is only installed by the `site` module for interactive sessions.
    sys.exit(1)
else:
    print(f"✅ Data loaded: {df.shape}")

# Clear business-focused analysis
# 'Churn' holds the strings "Yes"/"No", so turn it into a boolean flag before
# averaging; calling .mean() directly on the text column fails.
contract_churn = df['Churn'].eq('Yes').groupby(df['Contract']).mean()
print("📊 Churn by Contract Type:")
print(contract_churn.sort_values(ascending=False))
```

### **Level 5-7: Professional**
```python
# Type hints, docstrings, error handling
from pathlib import Path
from typing import TYPE_CHECKING, Tuple, Dict, Any, Optional
import logging

import pandas as pd

if TYPE_CHECKING:
    # Imported for type checkers only, so this module does not need config.py
    # to be importable at runtime.
    from config import Config

logger = logging.getLogger(__name__)

class ChurnAnalyzer:
    """Professional customer churn analysis class.

    Attributes:
        config: Settings object providing ``DATA_DIR`` and ``RAW_DATA_FILE``.
        df: The loaded dataset, or ``None`` until ``load_data`` succeeds.
    """
    
    def __init__(self, config: "Config"):
        self.config = config
        self.df: Optional[pd.DataFrame] = None
        
    def load_data(self) -> None:
        """Read the raw customer CSV into ``self.df``.

        The file location is ``config.DATA_DIR / config.RAW_DATA_FILE``
        (the Config dataclass defines no ``data_path`` attribute).

        Raises:
            Exception: Whatever ``pd.read_csv`` raised (e.g. FileNotFoundError);
                the error is logged and then re-raised unchanged.
        """
        data_path = Path(self.config.DATA_DIR) / self.config.RAW_DATA_FILE
        try:
            self.df = pd.read_csv(data_path)
        except Exception as e:
            # Boundary handler: record the failure, then let the caller decide.
            logger.error("Failed to load data: %s", e)
            raise
        # %-style arguments defer string formatting until the record is emitted
        logger.info("Data loaded successfully: %s", self.df.shape)
```

### **Level 8-10: Enterprise**
```python
# Production-ready with monitoring
from dataclasses import dataclass
from abc import ABC, abstractmethod
from typing import Dict, Tuple  # used by the predict() signature below

import structlog

logger = structlog.get_logger()

@dataclass
class PredictionMetrics:
    """Metrics returned alongside a prediction."""
    accuracy: float
    precision: float
    recall: float
    latency_ms: float

class ChurnPredictor(ABC):
    """Abstract base class for churn prediction models."""
    
    @abstractmethod
    async def predict(self, customer_data: Dict) -> Tuple[float, PredictionMetrics]:
        """Predict churn probability with performance metrics.

        Args:
            customer_data: Mapping describing one customer
                (key schema is not fixed here; TODO confirm with implementers).

        Returns:
            A ``(churn_probability, metrics)`` tuple.
        """
```

## 📈 **Storage Strategy Recommendations**

### **For Learning Journey Documentation**
1. **GitHub**: Primary portfolio repository
2. **Kaggle**: Public notebooks for community engagement  
3. **Personal Blog**: Medium/LinkedIn articles
4. **Cloud Storage**: AWS S3/Google Cloud for large datasets
5. **Local Development**: VS Code + Jupyter for daily work

### **For Professional Presentation**
1. **Clean GitHub repos** with professional README files
2. **Comprehensive documentation** explaining methodology
3. **Business-focused presentations** with clear ROI
4. **Code quality standards** with proper testing
5. **Progression documentation** showing skill development

The key is to maintain **consistency across platforms** while optimizing each for its specific audience and purpose. Your Level 0 should be simple and accessible, while your Level 10 should demonstrate enterprise-scale thinking and implementation.

---

Here’s a **professional, production-grade** way to do data quality (DQ) that hiring managers expect to see. It’s tool-agnostic but I’ll show dbt / SQL / Python snippets you can drop in.

# 1) Write a data contract (single source of truth)

For each dataset/table define: owner, schema (types, nullability), keys, ranges, enums, referential links, freshness SLA, row-level filters, and severity rules (P0=fail pipeline, P1=alert only).

```yaml
# data_contract.yml (excerpt)
table: analytics.telco_customers
owner: data@company.com
freshness_sla_minutes: 60
columns:
  customerID: {type: STRING, unique: true, nullable: false}
  tenure:     {type: INT64, nullable: false, range: [0, 120]}
  MonthlyCharges: {type: NUMERIC, nullable: false, range: [0, 500]}
  TotalCharges:   {type: NUMERIC, nullable: false}
rules:
  - name: fk_contract_type
    description: contractType must exist in dim_contracts
    severity: P0
```

# 2) Test across the full pipeline (Bronze → Silver → Gold)

* **Ingest/Bronze:** schema, required columns, type coercion, row count sanity, duplicate primary keys.
* **Transform/Silver:** referential integrity, accepted values, business rules (e.g., `TotalCharges ≈ tenure*MonthlyCharges` tolerance), distribution drift checks.
* **Publish/Gold:** KPI reconciliation vs previous day, freshness, completeness of key segments.

# 3) Standard test catalog (cover the dimensions)

* **Completeness:** `NOT NULL`, required % filled.
* **Validity:** types, ranges, regexes, enumerations.
* **Uniqueness:** primary keys, composite keys.
* **Consistency/Integrity:** FKs, cross-column rules.
* **Timeliness/Freshness:** max timestamp vs now.
* **Accuracy/Reconciliation:** totals vs source-of-record.
* **Drift/Anomaly:** z-score/IQR/seasonal anomaly on row\_count, null\_rate, mean/median.

# 4) Implement with your stack

### dbt (fast, declarative tests)

```yaml
# models/telco/schema.yml
version: 2
models:
  - name: stg_telco_customers
    tests:
      - dbt_utils.unique_combination_of_columns:
          combination_of_columns: [customerID]
      - dbt_utils.not_null_proportion:
          column_name: tenure
          at_least: 0.999
    columns:
      - name: contractType
        tests:
          - accepted_values:
              values: ['Month-to-month','One year','Two year']
      - name: plan_id
        tests:
          - relationships:
              to: ref('dim_contracts')
              field: plan_id
```

### SQL spot checks (works anywhere)

```sql
-- P0: duplicate primary keys
-- Returns one row per customerID that occurs more than once (empty result = pass).
-- The aggregate is repeated in HAVING because many engines do not allow a
-- SELECT alias there; GROUP BY names the column instead of using an ordinal.
SELECT customerID, COUNT(*) AS c
FROM analytics.telco_customers
GROUP BY customerID
HAVING COUNT(*) > 1;

-- Freshness (expect < 60 min)
SELECT TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), MAX(updated_at), MINUTE) AS minutes_late
FROM analytics.telco_customers;

-- Business rule tolerance
SELECT COUNT(*) AS bad_rows
FROM analytics.telco_customers
WHERE tenure > 0
  AND ABS(TotalCharges - tenure*MonthlyCharges) > 10;  -- $10 tolerance
```

### Python contracts (schema in code; great for ingestion)

**Pandera** (lightweight, free):

```python
import pandera as pa
from pandera import Column, Check

TelcoSchema = pa.DataFrameSchema({
  "customerID": Column(str, nullable=False, unique=True),
  "tenure": Column(int, Check.in_range(0, 120), nullable=False),
  "MonthlyCharges": Column(float, Check.in_range(0, 500), nullable=False),
  "TotalCharges": Column(float, nullable=False),
})
TelcoSchema.validate(df, lazy=True)
```

**Great Expectations** (rich profiling + stores):

```python
validator.expect_column_values_to_not_be_null("customerID")
validator.expect_column_values_to_be_unique("customerID")
validator.expect_column_values_to_be_between("tenure", 0, 120)
validator.expect_column_values_to_be_in_set("contractType",
    ["Month-to-month","One year","Two year"])
```

# 5) Gate the pipeline & alert

* **Fail fast on P0** (stop the job, quarantine bad rows to an error table).
* **Alert on P1/P2** (Slack/Email with row samples and a run link).
* **Auto-ticket** for repeated breaches. Keep **run\_history** of test metrics.

# 6) Monitor, not just test

Track time-series of: row\_count, null\_rate per column, distinct\_count, mean/median, min/max. Use simple anomaly rules (EWMA or seasonal z-scores) to catch silent schema/content drift. Put them on a small dashboard.

# 7) CI/CD integration (professional touch)

* **Pre-merge:** run unit SQL, dbt tests, and sample validations (on a 1–5% subset).
* **Post-merge:** full dataset tests in the orchestrator (Airflow/Prefect/dbt Cloud).
* **Data diffs** on key tables (e.g., elementary-dbt or data-diff) to show what changed.

# 8) Document & iterate

Every failed test should have: owner, impact, remediation, and a decision (tighten, relax, or deprecate). Review thresholds quarterly with stakeholders.

---

### Minimal “day-one” setup for your Telco project (BigQuery + dbt)

1. Write `data_contract.yml`.
2. Add dbt tests (unique/not\_null/accepted\_values/relationships) in `schema.yml`.
3. Add 3 SQL checks (freshness, dup keys, business rule tolerance) to a nightly job.
4. Add Pandera (or GE) on ingest to catch bad CSVs early.
5. Wire Slack alerts + P0 gating.

If you want, I can generate the **contract + dbt tests** tailored to your current Telco column names so you can paste them in and run today.


Awesome—here’s a ready-to-run **data contract** and **dbt tests** for the classic IBM Telco Churn columns. I’ll assume your final model is named `stg_telco_customers` (rename if yours is different).

---

### `packages.yml`

```yaml
packages:
  - package: dbt-labs/dbt_utils
    version: [">=1.1.1", "<2.0.0"]
  - package: calogica/dbt_expectations
    version: [">=0.10.3", "<1.0.0"]
```

Run once:

```bash
dbt deps
```

---

### `data_contracts/telco_customers_contract.yml`  *(living doc; human + CI reference)*

```yaml
table: stg_telco_customers
owner: you@example.com
freshness_sla_minutes: 1440  # dataset is static; adjust if you refresh
primary_key: customerID

columns:
  customerID:      {type: STRING, nullable: false, unique: true}
  gender:          {type: STRING, nullable: false, enum: ["Male","Female"]}
  SeniorCitizen:   {type: INT64,  nullable: false, enum: [0,1]}
  Partner:         {type: STRING, nullable: false, enum: ["Yes","No"]}
  Dependents:      {type: STRING, nullable: false, enum: ["Yes","No"]}
  tenure:          {type: INT64,  nullable: false, range: [0, 84]}
  PhoneService:    {type: STRING, nullable: false, enum: ["Yes","No"]}
  MultipleLines:   {type: STRING, nullable: false, enum: ["Yes","No","No phone service"]}
  InternetService: {type: STRING, nullable: false, enum: ["DSL","Fiber optic","No"]}
  OnlineSecurity:  {type: STRING, nullable: false, enum: ["Yes","No","No internet service"]}
  OnlineBackup:    {type: STRING, nullable: false, enum: ["Yes","No","No internet service"]}
  DeviceProtection:{type: STRING, nullable: false, enum: ["Yes","No","No internet service"]}
  TechSupport:     {type: STRING, nullable: false, enum: ["Yes","No","No internet service"]}
  StreamingTV:     {type: STRING, nullable: false, enum: ["Yes","No","No internet service"]}
  StreamingMovies: {type: STRING, nullable: false, enum: ["Yes","No","No internet service"]}
  Contract:        {type: STRING, nullable: false, enum: ["Month-to-month","One year","Two year"]}
  PaperlessBilling:{type: STRING, nullable: false, enum: ["Yes","No"]}
  PaymentMethod:   {type: STRING, nullable: false, enum: ["Electronic check","Mailed check","Bank transfer (automatic)","Credit card (automatic)"]}
  MonthlyCharges:  {type: FLOAT64, nullable: false, range: [0, 200]}
  TotalCharges:    {type: FLOAT64, nullable: false, min: 0}
  Churn:           {type: STRING, nullable: false, enum: ["Yes","No"]}

rules:
  - name: total_vs_tenure_rate_tolerance
    description: |-
      For tenure > 0, TotalCharges should be close to tenure * MonthlyCharges.
      Allow $10 tolerance for pro-rations/fees.
    severity: P1
  - name: zero_total_when_tenure_zero
    description: For tenure == 0, TotalCharges must be 0.
    severity: P0
```

---

### `models/telco/schema.yml`  *(dbt tests for `stg_telco_customers`)*

```yaml
version: 2

models:
  - name: stg_telco_customers
    config:
      # Optional: if your warehouse supports dbt contracts, enforce types at build time.
      # BigQuery/Snowflake work well. Remove if you don't want hard enforcement.
      contract:
        enforced: true
    columns:
      - name: customerID
        data_type: string
        tests:
          - not_null
          - unique

      - name: gender
        data_type: string
        tests:
          - not_null
          - accepted_values:
              values: ['Male','Female']

      - name: SeniorCitizen
        data_type: int64
        tests:
          - not_null
          - accepted_values:
              values: [0,1]

      - name: Partner
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No']}

      - name: Dependents
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No']}

      - name: tenure
        data_type: int64
        tests:
          - not_null
          - dbt_expectations.expect_column_values_to_be_between:
              min_value: 0
              max_value: 84
              strictly: false

      - name: PhoneService
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No']}

      - name: MultipleLines
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No phone service']}

      - name: InternetService
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['DSL','Fiber optic','No']}

      - name: OnlineSecurity
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No internet service']}

      - name: OnlineBackup
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No internet service']}

      - name: DeviceProtection
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No internet service']}

      - name: TechSupport
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No internet service']}

      - name: StreamingTV
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No internet service']}

      - name: StreamingMovies
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No','No internet service']}

      - name: Contract
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Month-to-month','One year','Two year']}

      - name: PaperlessBilling
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No']}

      - name: PaymentMethod
        data_type: string
        tests:
          - not_null
          - accepted_values:
              values: ['Electronic check','Mailed check','Bank transfer (automatic)','Credit card (automatic)']

      - name: MonthlyCharges
        data_type: float64
        tests:
          - not_null
          - dbt_expectations.expect_column_values_to_be_between:
              min_value: 0
              max_value: 200
              strictly: false

      - name: TotalCharges
        data_type: float64
        tests:
          - not_null
          - dbt_expectations.expect_column_values_to_be_between:
              min_value: 0
              max_value: 10000   # generous upper bound
              strictly: false

      - name: Churn
        data_type: string
        tests:
          - not_null
          - accepted_values: {values: ['Yes','No']}

    tests:
      # P0: tenure==0 => TotalCharges must be 0
      # (no severity override: a failure is an error)
      - dbt_utils.expression_is_true:
          expression: "case when tenure = 0 then coalesce(TotalCharges,0) = 0 else true end"

      # P1: business-rule tolerance for tenure>0
      # P1 means "alert only" in the data contract, so failures are reported as
      # warnings instead of errors. `where` limits the test to tenure > 0 rows;
      # both settings are test configs and are grouped under `config`.
      - dbt_utils.expression_is_true:
          expression: "abs(coalesce(TotalCharges,0) - coalesce(tenure,0) * coalesce(MonthlyCharges,0)) <= 10"
          config:
            where: "tenure > 0"
            severity: warn
```

---

### How to run

```bash
# From your dbt project root
dbt deps
dbt test -s stg_telco_customers
```

If your model name isn’t `stg_telco_customers`, just replace it in both files. Want me to generate the same for your **Gold** table (post-imputation/feature-engineered version) or add a couple of **freshness and row-count anomaly** checks?

---

># **freshness and row-count anomaly checks**

Here are drop-in **freshness** and **row-count anomaly** checks you can add to your Telco dbt project. I’ll show three layers: (A) model-level freshness, (B) source freshness, and (C) row-count anomaly (static bounds + statistical anomalies).


### 0) Add the package (one time)

```yaml
# packages.yml
packages:
  - package: dbt-labs/dbt_utils
    version: [">=1.1.1", "<2.0.0"]
  - package: calogica/dbt_expectations
    version: [">=0.10.3", "<1.0.0"]
  - package: elementary-data/elementary
    version: [">=0.14.0", "<1.0.0"]
```

Run:

```bash
dbt deps
```

---

### 1) Model-level freshness (dbt\_utils)

Assumes your staged model `stg_telco_customers` has a timestamp column (e.g., `ingested_at`). If you don’t have one, add it in your staging SQL (e.g., `CURRENT_TIMESTAMP()` at load).

```yaml
# models/telco/schema.yml (append under the same model block)
models:
  - name: stg_telco_customers
    tests:
      # Data should be no older than 24 hours
      - dbt_utils.recency:
          field: ingested_at
          datepart: hour
          interval: 24
```

This asserts the **max(ingested\_at)** is within the last 24 hours. ([GitHub][1], [Elementary Data][2])

---

### 2) Source freshness (built-in dbt “source freshness”)

If you ingest from a raw/source table, define it with a `loaded_at_field` and SLAs:

```yaml
# models/telco/sources.yml
version: 2
sources:
  - name: telco_raw
    database: your_db
    schema: your_schema
    tables:
      - name: telco_customers_raw
        loaded_at_field: _ingested_at        # or your load timestamp col
        freshness:
          warn_after: {count: 1, period: day}
          error_after: {count: 2, period: day}
```

Run it with:

```bash
dbt source freshness -s source:telco_raw
```

This is the standard way to gate stale sources in CI. ([Datafold][3])

---

### 3) Row-count anomaly checks

#### 3a) Simple guardrail (static bounds)

Because the classic Telco CSV is \~7,043 rows, put a conservative envelope around it (adjust if you’re appending/refreshing):

```yaml
# models/telco/schema.yml
models:
  - name: stg_telco_customers
    tests:
      - dbt_expectations.expect_table_row_count_to_be_between:
          min_value: 6500
          max_value: 8000
```

Great for catching truncation/duplication in a single run. ([Elementary Data][4], [Airbyte][5])

#### 3b) Statistical anomalies over time (Elementary)

Elementary learns typical row counts and flags spikes/drops using historical windows.

```yaml
# models/telco/schema.yml
models:
  - name: stg_telco_customers
    tests:
      - elementary.volume_anomalies:
          timestamp_column: ingested_at      # if omitted, counts total rows per run
          time_bucket:                       # daily buckets
            period: day
            count: 1
          training_period:                   # learn from last 28 days
            period: day
            count: 28
          detection_period:                  # check last 2 completed days
            period: day
            count: 2
          anomaly_sensitivity: 3             # ~3 standard deviations
          anomaly_direction: both            # flag drops and spikes
          fail_on_zero: true                 # fail if volume suddenly hits zero
```

Elementary stores historical metrics in your warehouse and compares recent buckets to expected ranges—so you catch subtle changes, not just hard breaches. ([Elementary Data][6])

> Optional: You can also add **freshness anomalies** the same way to detect delayed updates over time:

```yaml
- elementary.freshness_anomalies:
    timestamp_column: ingested_at
    time_bucket: { period: day, count: 1 }
    training_period: { period: day, count: 28 }
    detection_period:{ period: day, count: 2 }
```

([Elementary Data][7], [Elementary Data][8])

---

### How to run (summary)

```bash
dbt deps
dbt test -s stg_telco_customers           # runs schema tests + Elementary tests
dbt source freshness -s source:telco_raw  # runs source freshness
```

If you want, I can also add **null-rate/enum drift** anomaly checks (Elementary’s column/dimension anomalies) for fields like `InternetService`, `Contract`, and `Churn`. ([Elementary Data][9])

[1]: https://github.com/dbt-labs/dbt-utils?utm_source=chatgpt.com "dbt-labs/dbt-utils: Utility functions for dbt projects."
[2]: https://www.elementary-data.com/dbt-tests/recency?utm_source=chatgpt.com "dbt-utils.recency | Elementary dbt test hub"
[3]: https://www.datafold.com/blog/dbt-source-freshness?utm_source=chatgpt.com "How to use dbt source freshness tests to detect stale data"
[4]: https://www.elementary-data.com/dbt-tests/expect-table-row-count-to-be-between?utm_source=chatgpt.com "dbt test: expect_table_row_count_to_be_between"
[5]: https://airbyte.com/blog/testing-with-dbt-expectations?utm_source=chatgpt.com "Testing Data Pipelines with dbt-expectations: A Beginner's ..."
[6]: https://docs.elementary-data.com/data-tests/anomaly-detection-tests/volume-anomalies?utm_source=chatgpt.com "volume_anomalies - Elementary Data"
[7]: https://www.elementary-data.com/dbt-tests/freshness-anomalies?utm_source=chatgpt.com "dbt test: freshness_anomalies"
[8]: https://docs.elementary-data.com/data-tests/anomaly-detection-tests/freshness-anomalies?utm_source=chatgpt.com "freshness_anomalies - Elementary Data"
[9]: https://docs.elementary-data.com/data-tests/introduction?utm_source=chatgpt.com "Elementary Data Tests"
