In [None]:
%load_ext autoreload
%autoreload 2


from pathlib import Path
import json, time
import pandas as pd
import numpy as np


from addiction_ds.io import load_cfg, save_model, load_model, get_paths


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

NameError: name 'CFG' is not defined

In [None]:
# Experiment configuration: load it once, then unpack the settings the later cells use.
CFG_PATH = "configs/experiment.yaml"
cfg = load_cfg(CFG_PATH)

# Target column (defaults to "is_smoker") and the two feature lists (both required keys).
label = cfg.get("label", "is_smoker")
num_feats, cat_feats = cfg["features"]["numeric"], cfg["features"]["categorical"]

# Input CSV locations; each falls back to the data/processed default when not configured.
paths = cfg.get("paths", {})
TRAIN_CSV, VAL_CSV = (
    Path(paths.get(key, fallback))
    for key, fallback in (
        ("train_csv", "data/processed/train.csv"),
        ("val_csv", "data/processed/val.csv"),
    )
)

# Seed, preprocessing options and model spec, each with a default when the key is absent.
RANDOM_STATE = int(cfg.get("random_state", 42))
PREP = cfg.get("preprocessing", {})
MODEL_CFG = cfg.get(
    "model",
    {"name": "logistic_regression", "params": {"max_iter": 1000}},
)

In [None]:
# Resolve the output directories through the project helper. If the helper (or the
# lookup on its result) fails, fall back to local "models"/"reports" folders, but say
# so, otherwise artifacts silently land somewhere the config did not ask for.
try:
    P = get_paths(cfg)
    # `or` also covers keys that are present but null/empty in the config.
    MODELS_DIR = P.get("models_dir") or "models"
    REPORTS_DIR = P.get("reports_dir") or "reports"
except Exception as exc:
    print(
        f"get_paths failed ({type(exc).__name__}: {exc}); "
        "falling back to default 'models' and 'reports' directories"
    )
    MODELS_DIR = "models"
    REPORTS_DIR = "reports"

# Make sure both directories exist before anything is written into them.
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)
Path(REPORTS_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Registry of estimator classes that the config's model name may select.
ALLOWED = dict(
    logistic_regression=LogisticRegression,
    random_forest=RandomForestClassifier,
    gradient_boosting=GradientBoostingClassifier,
    svc=SVC,
    sgd_classifier=SGDClassifier,
)

# Echo the resolved output locations.
print(f"models_dir: {MODELS_DIR}")
print(f"reports_dir: {REPORTS_DIR}")

In [None]:
# Fail fast when either input file is absent.
for csv_path in (TRAIN_CSV, VAL_CSV):
    assert csv_path.exists(), f"Missing {csv_path}."

train_df, val_df = pd.read_csv(TRAIN_CSV), pd.read_csv(VAL_CSV)
print(train_df.shape, val_df.shape)

# Every configured feature and the label must be present in both splits.
feature_cols = num_feats + cat_feats
for col in [*feature_cols, label]:
    for frame_name, frame in (("train_df", train_df), ("val_df", val_df)):
        assert col in frame.columns, f"'{col}' not in {frame_name}"

# Feature matrices (numeric columns first, then categorical) and target vectors.
X_train, y_train = train_df[feature_cols], train_df[label]
X_val, y_val = val_df[feature_cols], val_df[label]

Index(['age', 'gender', 'country', 'education_level', 'employment_status',
       'annual_income_usd', 'marital_status', 'children_count',
       'smokes_per_day', 'drinks_per_week', 'age_started_smoking',
       'age_started_drinking', 'attempts_to_quit_smoking',
       'attempts_to_quit_drinking', 'has_health_issues',
       'mental_health_status', 'exercise_frequency', 'diet_quality',
       'sleep_hours', 'bmi', 'social_support', 'therapy_history',
       'salary_percentile', 'age_group', 'adequet_sleep', 'family_status'],
      dtype='object')

In [None]:
# Numeric branch: impute (strategy from PREP, default "median"), then optionally standardise.
numeric_strategy = PREP.get("impute_numeric", "median")
num_steps = [("impute", SimpleImputer(strategy=numeric_strategy))]
if PREP.get("scale_numeric", True):
    num_steps += [("scale", StandardScaler())]
num_pipe = Pipeline(num_steps)

# Categorical branch: impute (default "most_frequent"), then dense one-hot encoding that
# ignores categories unseen at fit time.
categorical_strategy = PREP.get("impute_categorical", "most_frequent")
cat_pipe = Pipeline(
    [
        ("impute", SimpleImputer(strategy=categorical_strategy)),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its branch; columns not listed are dropped.
pre = ColumnTransformer(
    transformers=[("num", num_pipe, num_feats), ("cat", cat_pipe, cat_feats)],
    remainder="drop",
    verbose_feature_names_out=False,
)
pre

In [None]:
# Pick the estimator class named in the config and instantiate it with the configured
# parameters. Values set in the config always win over the defaults injected here.
model_name = MODEL_CFG.get("name", "logistic_regression")
# `or {}` keeps a null `params:` entry in the config from breaking dict().
params = dict(MODEL_CFG.get("params") or {})

# Fail with an actionable message instead of a bare KeyError on a typo in the config.
if model_name not in ALLOWED:
    raise ValueError(
        f"Unknown model '{model_name}'. Allowed models: {sorted(ALLOWED)}"
    )
Est = ALLOWED[model_name]

if model_name == "svc":
    # Ask SVC for probability estimates unless the config says otherwise.
    params = {"probability": True, **params}

# Inject random_state if the estimator supports it (probed on a default-constructed instance).
try:
    if "random_state" in Est().get_params():
        params = {"random_state": RANDOM_STATE, **params}
except Exception as exc:
    # Best effort: carry on without a seed, but do not hide that the probe failed.
    print(
        f"Could not inspect {Est.__name__} for random_state "
        f"({type(exc).__name__}: {exc}); not injecting a seed"
    )

clf = Est(**params)
clf

In [None]:
# Chain preprocessing and the estimator into one pipeline and fit it on the training split.
pipe = Pipeline(steps=[("pre", pre), ("model", clf)])
pipe = pipe.fit(X_train, y_train)  # fit returns the fitted pipeline itself
pipe

In [None]:
# Score the validation split.
# why: consistent AUC even if estimator lacks predict_proba
# `scores` feeds the AUC; `preds` are hard 0/1 labels for the threshold-based metrics.
# The positive class is assumed to be label 1 (second column / positive margin).
if hasattr(pipe.named_steps["model"], "predict_proba"):
    scores = pipe.predict_proba(X_val)[:, 1]
    preds = (scores >= 0.5).astype(int)
elif hasattr(pipe.named_steps["model"], "decision_function"):
    s = np.asarray(pipe.decision_function(X_val), dtype=float)
    # Threshold the RAW margins at 0, the classifier's own decision boundary.
    # Thresholding the min-max scaled values at 0.5 would instead cut at the midpoint
    # of the observed margin range, which is not where the model separates the classes.
    preds = (s > 0).astype(int)
    # Rescale to [0, 1] for reporting only; AUC is rank-based, so it is unaffected.
    scores = (s - s.min()) / (s.max() - s.min() + 1e-12)
else:
    raise RuntimeError("Estimator provides neither predict_proba nor decision_function.")


auc = float(roc_auc_score(y_val, scores))
acc = float(accuracy_score(y_val, preds))
prec = float(precision_score(y_val, preds, zero_division=0))
rec = float(recall_score(y_val, preds, zero_division=0))
f1 = float(f1_score(y_val, preds, zero_division=0))
cm = confusion_matrix(y_val, preds)


# Everything needed to identify and compare this run later.
metrics = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "model": model_name,
    "params": params,
    "scores": {"auc": auc, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1},
}
print(metrics)
print(classification_report(y_val, preds, zero_division=0))

In [None]:
# Persist the fitted pipeline twice: under a timestamped name and under "latest".
stamp = time.strftime("%Y%m%d-%H%M%S")
versioned_name = "_".join((stamp, model_name))

path_versioned = save_model(
    pipe,
    cfg,
    name=versioned_name,
    framework="sklearn",
)
path_latest = save_model(
    pipe,
    cfg,
    name="latest",
    framework="sklearn",
)

print("Saved versioned →", path_versioned)
print("Updated latest →", path_latest)

In [None]:
# Output files: everything goes under REPORTS_DIR, keyed by the run's timestamp.
reports_dir = Path(REPORTS_DIR)
metrics_json = reports_dir / f"metrics_{stamp}.json"
metrics_latest = reports_dir / "metrics_latest.json"
metrics_csv = reports_dir / f"metrics_{stamp}.csv"
clf_txt = reports_dir / f"classification_report_{stamp}.txt"
cm_csv = reports_dir / f"confusion_matrix_{stamp}.csv"

# JSON: the same payload goes to the timestamped file and to the "latest" file.
metrics_payload = json.dumps(metrics, indent=2)
metrics_json.write_text(metrics_payload)
metrics_latest.write_text(metrics_payload)

# CSV (flat): one row of scores plus the model name and timestamp.
scores_row = pd.DataFrame([metrics["scores"]])
scores_row = scores_row.assign(model=metrics["model"], timestamp=metrics["timestamp"])
scores_row.to_csv(metrics_csv, index=False)

# Classification report as plain text.
report_text = classification_report(y_val, preds, zero_division=0)
clf_txt.write_text(report_text)

# Confusion matrix with labelled rows (true) and columns (predicted).
cm_frame = pd.DataFrame(
    cm,
    index=["true_0", "true_1"],
    columns=["pred_0", "pred_1"],
)
cm_frame.to_csv(cm_csv)

print("Wrote:")
for p in (metrics_json, metrics_latest, metrics_csv, clf_txt, cm_csv):
    print(f" - {p}")