In [1]:
# Cell 3 — Imports & settings
from __future__ import annotations


from datetime import datetime
from pathlib import Path
from typing import Optional, Sequence


import numpy as np
import pandas as pd
from pandas.api import types as ptypes
import matplotlib.pyplot as plt


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Optional override; set to your target column name if auto-guess fails
TARGET_OVERRIDE: Optional[str] = None
TEST_SIZE: float = 0.2
# Use CFG from earlier bootstrap if available; else default
RANDOM_STATE: int = (CFG.get("random_state", 42) if isinstance(CFG, dict) else 42)

NameError: name 'CFG' is not defined

In [None]:
def _latest_csv(folder: Path) -> Path:
    files = sorted(folder.glob("*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not files:
        raise FileNotFoundError(f"No CSV files found in {folder}")
    return files[0]




def _guess_target(df: pd.DataFrame, override: Optional[str]) -> str:
    if override and override in df.columns:
        return override
    common_names: Sequence[str] = (
        "target","label","class","y","outcome","status",
        "default","churn","is_positive","mental_health_status",
    )
    for name in common_names:
        if name in df.columns:
            return name
    # heuristic: pick a low-cardinality non-numeric column
    for c in df.columns:
        if not ptypes.is_numeric_dtype(df[c]) and df[c].nunique(dropna=True) <= 20:
            return c
    raise ValueError(
        "Could not infer target column. Set TARGET_OVERRIDE to a valid column name."
    )

In [None]:
# Cell 5 — Resolve paths and select processed CSV
# Requires: load_cfg, get_paths, ensure_dir already imported in prior bootstrap cells
CFG = CFG if 'CFG' in globals() else load_cfg()
P = P if 'P' in globals() else get_paths(CFG)


processed_dir: Path = P["processed_dir"]
reports_dir: Path = P["reports_dir"]
ensure_dir(reports_dir)


proc_csv = _latest_csv(processed_dir)
print(f"Loading processed dataset: {proc_csv}")




# Cell 6 — Load data & prepare matrices
df = pd.read_csv(proc_csv)


target_col = _guess_target(df, TARGET_OVERRIDE)
print(f"Using target column: {target_col}")


X = df.drop(columns=[target_col])
y = df[target_col]


num_cols = [c for c in X.columns if ptypes.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

Index(['age', 'gender', 'country', 'education_level', 'employment_status',
       'annual_income_usd', 'marital_status', 'children_count',
       'smokes_per_day', 'drinks_per_week', 'age_started_smoking',
       'age_started_drinking', 'attempts_to_quit_smoking',
       'attempts_to_quit_drinking', 'has_health_issues',
       'mental_health_status', 'exercise_frequency', 'diet_quality',
       'sleep_hours', 'bmi', 'social_support', 'therapy_history',
       'salary_percentile', 'age_group', 'adequet_sleep', 'family_status'],
      dtype='object')

In [None]:
# Cell 7 — Build pipeline & train/test split
numeric_pre = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)


categorical_pre = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)


pre = ColumnTransformer(
    transformers=[
        ("num", numeric_pre, num_cols),
        ("cat", categorical_pre, cat_cols),
    ]
)


clf = Pipeline(
    steps=[
        ("pre", pre),
        ("model", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
    ]
)


X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE,
stratify=y if y.nunique() > 1 else None,
)


clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Cell 8 — Save classification report & confusion matrix to reports/
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
report_txt_path = Path(reports_dir) / f"classification_report_{run_id}.txt"
report_csv_path = Path(reports_dir) / f"classification_report_{run_id}.csv"
cm_png_path = Path(reports_dir) / f"confusion_matrix_{run_id}.png"
cm_csv_path = Path(reports_dir) / f"confusion_matrix_{run_id}.csv"


# Classification report (txt + csv)
rep_dict = classification_report(y_test, y_pred, output_dict=True)
with open(report_txt_path, "w", encoding="utf-8") as f:
    f.write(classification_report(y_test, y_pred))


pd.DataFrame(rep_dict).transpose().to_csv(report_csv_path)


# Confusion matrix (png + csv)
labels = sorted(pd.unique(pd.concat([y_test, pd.Series(y_pred, index=y_test.index)])))
cm = confusion_matrix(y_test, y_pred, labels=labels)


pd.DataFrame(
    cm,
    index=[f"true_{l}" for l in labels],
    columns=[f"pred_{l}" for l in labels],
).to_csv(cm_csv_path)


fig, ax = plt.subplots(figsize=(6, 5))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot(ax=ax, colorbar=False)
ax.set_title(f"Confusion Matrix — {Path(proc_csv).name}")
fig.tight_layout()
fig.savefig(cm_png_path, dpi=150)
plt.close(fig)


print("Saved:")
for p in (report_txt_path, report_csv_path, cm_png_path, cm_csv_path):
    print(" -", p)