In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# The pinned version is the one this notebook's recorded run resolved to.
%pip install -q xgboost==3.2.0

Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.29.3-py3-none-manylinux_2_18_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl (131.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.7/131.7 MB[0m [31m175.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.29.3-py3-none-manylinux_2_18_x86_64.whl (289.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.8/289.8 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.29.3 xgboost-3.2.0


In [6]:
# train_models_universal.py

#
# What it does (assignment-aligned):
# 1) Loads Adult Income CSV
# 2) Cleans '?' -> NaN, trims strings
# 3) Trains 6 models on SAME dataset (with preprocessing inside pipeline)
# 4) Computes required metrics and saves metrics_comparison.csv
# 5) Saves 6 trained pipelines into ./model (or /content/model in Colab)
# 6) If running in Colab: zips and downloads the model folder automatically

import os
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
)

import joblib

# XGBoost (required model)
from xgboost import XGBClassifier

# Silence every warning category for the rest of the session so the training
# output stays compact. NOTE(review): this also hides solver convergence and
# deprecation warnings raised by the libraries used below.
warnings.simplefilter("ignore", category=Warning)


# -----------------------------
# Environment detection
# -----------------------------
def running_in_colab() -> bool:
    """Report whether the ``google.colab`` package can be imported.

    Returns
    -------
    bool
        True when the import succeeds, False when it raises any ``Exception``.
    """
    try:
        import google.colab  # noqa: F401  (imported only to probe availability)
    except Exception:
        return False
    else:
        return True


# -----------------------------
# Helpers
# -----------------------------
def normalize_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with text columns trimmed and '?' turned into NaN.

    Only object/string-typed columns are touched. Every non-missing value in
    such a column is converted to ``str`` and stripped of surrounding
    whitespace; a value that is exactly '?' after stripping becomes NaN.

    Values that are already missing stay missing. A plain ``astype(str)``
    would turn NaN/None into the literal text 'nan'/'None', which a later
    imputation step would then treat as a real category.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        Cleaned copy with the same columns and index.
    """
    out = df.copy()
    for col in out.columns:
        series = out[col]
        # Cover both classic object columns and pandas' dedicated string dtypes.
        is_text = pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)
        if not is_text:
            continue

        # Remember the real gaps before stringifying, so they can be restored.
        was_missing = series.isna()
        stripped = series.astype(str).str.strip()
        out[col] = stripped.mask(was_missing | (stripped == "?"), np.nan)
    return out


def detect_target_column(df: pd.DataFrame) -> str:
    """Pick the label column of ``df``.

    The first name from a fixed, ordered list of common label names that is
    present in ``df.columns`` wins; when none is present, the last column of
    the frame is returned.
    """
    known_names = ("income", "Income", "salary", "Salary", "class", "Class", "target", "Target")
    matches = [name for name in known_names if name in df.columns]
    if matches:
        return matches[0]
    return df.columns[-1]


def binarize_target(y: pd.Series) -> pd.Series:
    """Encode a two-class target as integers 0/1.

    Labels are compared after converting to ``str``, stripping whitespace and
    removing '.' characters, so '<=50K.' and '<=50K' are the same label.
    Recognised labels are '<=50K'/'0'/'False' -> 0 and '>50K'/'1'/'True' -> 1.
    If any label is not recognised but exactly two distinct labels exist, they
    are sorted and the first becomes 0, the second 1.

    Parameters
    ----------
    y : pd.Series
        Raw target values.

    Returns
    -------
    pd.Series
        Integer series of 0/1 with the same index as ``y``.

    Raises
    ------
    ValueError
        If ``y`` contains missing values, or if the labels are neither
        recognised nor exactly two distinct values.
    """
    # Reject gaps up front: once stringified, a missing target would look like
    # an ordinary label and could be silently encoded as one of the classes.
    missing = y.isna()
    if missing.any():
        raise ValueError(
            f"Target column contains {int(missing.sum())} missing value(s); "
            "drop or fill them before training."
        )

    labels = y.astype(str).str.strip().str.replace(".", "", regex=False)
    known = {"<=50K": 0, ">50K": 1, "0": 0, "1": 1, "False": 0, "True": 1}
    encoded = labels.map(known)

    if encoded.isna().any():
        # Fallback for unfamiliar but still binary label sets.
        distinct = sorted(labels.unique().tolist())
        if len(distinct) != 2:
            raise ValueError(f"Unrecognized target labels: {distinct}")
        encoded = labels.map({distinct[0]: 0, distinct[1]: 1})

    return encoded.astype(int)


def make_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer for the columns of ``X``.

    Numeric columns get median imputation followed by standard scaling.
    All remaining columns get most-frequent imputation followed by one-hot
    encoding with dense output (so GaussianNB does not fail on sparse input)
    that ignores categories unseen at fit time.
    Columns in neither group are dropped.
    """
    numeric_columns, categorical_columns = [], []
    for column in X.columns:
        if pd.api.types.is_numeric_dtype(X[column]):
            numeric_columns.append(column)
        else:
            categorical_columns.append(column)

    # The dense-output keyword is `sparse_output` from sklearn 1.2 on and
    # `sparse` before that; try the new spelling first.
    encoder_options = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **encoder_options)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **encoder_options)

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", encoder),
    ]

    branches = [
        ("num", Pipeline(steps=numeric_steps), numeric_columns),
        ("cat", Pipeline(steps=categorical_steps), categorical_columns),
    ]
    return ColumnTransformer(transformers=branches, remainder="drop")


def auc_safe(y_true: np.ndarray, y_proba: np.ndarray) -> float:
    """ROC AUC of ``y_proba`` against ``y_true``, or NaN when it is undefined.

    When ``y_true`` holds fewer than two distinct values, NaN is returned
    without calling ``roc_auc_score``.
    """
    distinct_labels = np.unique(y_true).size
    if distinct_labels >= 2:
        return float(roc_auc_score(y_true, y_proba))
    return float("nan")


def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_proba: np.ndarray) -> dict:
    """Collect the six evaluation metrics for one model as plain floats.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth labels.
    y_pred : np.ndarray
        Predicted labels.
    y_proba : np.ndarray
        Scores passed to ``auc_safe`` together with ``y_true``.

    Returns
    -------
    dict
        Keys 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC'.
        Precision, recall and F1 are computed with ``zero_division=0``.
    """
    scores = {
        "Accuracy": float(accuracy_score(y_true, y_pred)),
        "AUC": auc_safe(y_true, y_proba),
    }

    # These three share the same call signature and zero-division policy.
    label_based = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for key, scorer in label_based:
        scores[key] = float(scorer(y_true, y_pred, zero_division=0))

    scores["MCC"] = float(matthews_corrcoef(y_true, y_pred))
    return scores


# ------------------
# Training + saving 
# ------------------
def _build_models(random_state: int) -> dict:
    """Return the estimators to compare, keyed by the name used for output files."""
    return {
        "Logistic_Regression": LogisticRegression(max_iter=2000),
        "Decision_Tree": DecisionTreeClassifier(random_state=random_state),
        "KNN": KNeighborsClassifier(n_neighbors=7),
        "Naive_Bayes": GaussianNB(),
        "Random_Forest": RandomForestClassifier(n_estimators=30, random_state=random_state, n_jobs=-1),
        "XGBoost": XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            random_state=random_state,
            eval_metric="logloss",
            n_jobs=-1,
        ),
    }


def _positive_class_scores(pipe: Pipeline, X: pd.DataFrame, y_pred: np.ndarray) -> np.ndarray:
    """Return one score per row of ``X`` for class 1, for use in the AUC computation."""
    estimator = pipe.named_steps["model"]
    if hasattr(estimator, "predict_proba"):
        return pipe.predict_proba(X)[:, 1]
    if hasattr(estimator, "decision_function"):
        raw = pipe.decision_function(X)
        # Min-max rescale to [0, 1); the epsilon avoids division by zero when all scores are equal.
        return (raw - raw.min()) / (raw.max() - raw.min() + 1e-9)
    # Last resort: use the hard predictions themselves as scores.
    return y_pred.astype(float)


def train_and_save(
    csv_path: str,
    out_dir: str,
    test_size: float = 0.2,
    random_state: int = 42,
) -> str:
    """Train every model on one stratified split, evaluate, and persist the results.

    Steps: read the CSV, clean it with ``normalize_strings``, pick and binarize
    the target column, split into train/test (stratified on the target), then
    for each model fit a preprocessing + estimator pipeline on the training
    part, score it on the test part and dump the fitted pipeline to
    ``<out_dir>/<model name>.joblib``. The metrics of all models are written,
    sorted by AUC descending, to ``<out_dir>/metrics_comparison.csv`` and
    printed.

    Parameters
    ----------
    csv_path : str
        Path of the dataset CSV.
    out_dir : str
        Directory for the pipeline files and the metrics table; created if missing.
    test_size : float
        Fraction of rows held out for evaluation.
    random_state : int
        Seed for the split and for the estimators that accept one.

    Returns
    -------
    str
        ``out_dir``, unchanged.
    """
    os.makedirs(out_dir, exist_ok=True)

    df = normalize_strings(pd.read_csv(csv_path))

    target_col = detect_target_column(df)
    y = binarize_target(df[target_col])
    X = df.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    models = _build_models(random_state)

    rows = []
    for name, model in models.items():
        # Pipeline.fit fits its steps in place, so each pipeline gets its own
        # preprocessor; sharing one instance would make all pipelines alias
        # the same fitted transformer state.
        pipe = Pipeline(steps=[("prep", make_preprocessor(X_train)), ("model", model)])
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_test)
        y_proba = _positive_class_scores(pipe, X_test, y_pred)

        m = compute_metrics(y_test.values, y_pred, y_proba)
        m["Model"] = name
        rows.append(m)

        joblib.dump(pipe, os.path.join(out_dir, f"{name}.joblib"))

    metrics_df = pd.DataFrame(rows)[["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]]
    metrics_df = metrics_df.sort_values(by="AUC", ascending=False)
    metrics_path = os.path.join(out_dir, "metrics_comparison.csv")
    metrics_df.to_csv(metrics_path, index=False)

    print("Target column:", target_col)
    # Report the real number of pipelines rather than a hard-coded count.
    print(f"Saved {len(models)} model pipeline files to:", out_dir)
    print("Saved metrics table to:", metrics_path)
    print("\n=== Metrics Comparison (sorted by AUC) ===")
    print(metrics_df.to_string(index=False))

    return out_dir


# -----------------------------
# Colab convenience: zip & download (only runs in Colab)
# -----------------------------
def zip_and_download_if_colab(folder_path: str, zip_name: str):
    """Zip ``folder_path`` and trigger a browser download, but only inside Colab.

    Outside Colab (as reported by ``running_in_colab``) this is a no-op.

    Parameters
    ----------
    folder_path : str
        Directory whose contents are archived.
    zip_name : str
        Desired archive path. A trailing '.zip' is optional; the archive is
        always written to a path ending in '.zip', and that same path is the
        one removed beforehand, printed and downloaded.
    """
    if not running_in_colab():
        return

    import shutil
    from google.colab import files

    # Strip only a trailing ".zip": str.replace would also remove ".zip"
    # occurring elsewhere in the path (e.g. inside a directory name).
    suffix = ".zip"
    archive_base = zip_name[: -len(suffix)] if zip_name.endswith(suffix) else zip_name
    # make_archive appends the suffix itself, so this is the file it creates.
    archive_path = archive_base + suffix

    if os.path.exists(archive_path):
        os.remove(archive_path)

    shutil.make_archive(archive_base, "zip", folder_path)
    print("Zipped:", archive_path)
    files.download(archive_path)


# -----------------------------
# Universal entry point
# -----------------------------
def main(csv_path: str = None):
    """Run the full train/evaluate/save flow with environment-specific defaults.

    Parameters
    ----------
    csv_path : str, optional
        Dataset location. When omitted (or otherwise falsy), '/content/adult.csv'
        is used in Colab and 'adult.csv' elsewhere.

    Raises
    ------
    FileNotFoundError
        If the resolved dataset path does not exist.
    """
    # Dataset, output folder and archive name all depend on the environment.
    if running_in_colab():
        default_csv, default_out, default_zip = "/content/adult.csv", "/content/model", "/content/model.zip"
    else:
        default_csv, default_out, default_zip = "adult.csv", "model", "model.zip"

    if not csv_path:
        csv_path = default_csv

    if not os.path.exists(csv_path):
        hint = (
            f"Could not find dataset at: {csv_path}\n"
            f"- In Colab, upload adult.csv to /content/\n"
            f"- Locally, keep adult.csv in the same folder as this script/notebook\n"
        )
        raise FileNotFoundError(hint)

    saved_dir = train_and_save(csv_path=csv_path, out_dir=default_out)

    # The helper itself does nothing outside Colab.
    zip_and_download_if_colab(folder_path=saved_dir, zip_name=default_zip)


# Entry point. Executing this cell in Jupyter also satisfies the guard, so the
# default run below starts immediately; call `main("path/to/adult.csv")`
# to train on a CSV at a different location.
if __name__ == "__main__":
    main()

Target column: income
Saved 6 model pipeline files to: model
Saved metrics table to: model/metrics_comparison.csv

=== Metrics Comparison (sorted by AUC) ===
              Model  Accuracy      AUC  Precision   Recall       F1      MCC
            XGBoost  0.868724 0.922962   0.772762 0.644133 0.702609 0.623495
Logistic_Regression  0.853063 0.902445   0.739233 0.602041 0.663620 0.575799
      Random_Forest  0.848303 0.892875   0.726917 0.592474 0.652846 0.561847
                KNN  0.837402 0.866877   0.684822 0.601403 0.640407 0.537816
      Decision_Tree  0.815292 0.752058   0.613284 0.630102 0.621579 0.499519
        Naive_Bayes  0.536619 0.733448   0.335973 0.947066 0.495992 0.323665
