# ML Assignment 2 — Train & Evaluate 6 Classification Models

**Dataset:** UCI Breast Cancer Wisconsin (Diagnostic)  
Run the cells in order, from top to bottom — later cells rely on variables defined in earlier ones. Every step produces visible output so you can verify that it worked.


## 1. Imports


In [None]:
from __future__ import annotations
from dataclasses import asdict, dataclass
from pathlib import Path
import json

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

print("All imports successful ✅")


## 2. Configuration & Paths


In [None]:
# Single seed reused by the split and by every estimator that accepts one
RANDOM_STATE = 42

# Locate the project root. The kernel may have been started either in the
# project folder or inside its model/ sub-folder; in the second case there is
# no model/ below the cwd but there is one next to it, so step up one level.
PROJECT_ROOT = Path.cwd()
if (PROJECT_ROOT.parent / "model").exists() and not (PROJECT_ROOT / "model").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

# Output folders (created on first run, left untouched afterwards)
MODEL_DIR, DATA_DIR = PROJECT_ROOT / "model", PROJECT_ROOT / "data"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)

print(
    f"PROJECT_ROOT : {PROJECT_ROOT}",
    f"MODEL_DIR    : {MODEL_DIR}",
    f"DATA_DIR     : {DATA_DIR}",
    sep="\n",
)


## 3. Load Dataset


In [None]:
# Load the dataset as pandas objects (feature frame + target series)
dataset = load_breast_cancer(as_frame=True)
X, y = dataset.data, dataset.target

print(f"Total instances : {len(X)}")
print(f"Total features  : {len(X.columns)}")
print(f"Target classes  : {dataset.target_names.tolist()}")
print("\nClass distribution:")
# Show the counts under readable class names instead of the 0/1 codes
print(y.value_counts().rename(index={0: 'malignant', 1: 'benign'}))


## 4a. Preview First 10 Rows


In [None]:
# First ten rows of the feature table (rendered as the cell output)
X.iloc[:10]


## 4b. Statistical Summary


In [None]:
# Per-feature descriptive statistics, rounded to three decimals for readability
X.describe().round(decimals=3)


## 4c. Check Missing Values


In [None]:
# Number of null cells in each feature column
missing = X.isna().sum()
print("Missing values per feature:")
if missing.any():
    # Only list the columns that actually contain nulls
    print(missing[missing > 0])
else:
    print("None — dataset is clean ✅")


## 4d. Target Distribution Plot


In [None]:
fig, ax = plt.subplots(figsize=(5, 3))
# value_counts() orders its rows by descending frequency, not by class label,
# so the bar order is not guaranteed to be [0, 1]. Sorting by index pins
# bar 0 to class 0 and bar 1 to class 1, which is what the fixed tick labels
# and colours below assume.
y.value_counts().sort_index().plot.bar(ax=ax, color=["#e74c3c", "#2ecc71"])
ax.set_xticklabels(["Malignant (0)", "Benign (1)"], rotation=0)
ax.set_ylabel("Count")
ax.set_title("Target Class Distribution")
plt.tight_layout()
plt.show()


## 4e. Feature Correlation Heatmap (top 15 features)


In [None]:
# Rank features by absolute correlation with the target; keep the 15 strongest
top_features = (
    X.corrwith(y)
    .abs()
    .sort_values(ascending=False)
    .index[:15]
    .tolist()
)

# Pairwise correlation between those features, annotated with the values
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    X[top_features].corr(),
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
)
ax.set_title("Correlation Heatmap — Top 15 Features (by target correlation)")
fig.tight_layout()
plt.show()


## 5. Train / Test Split


In [None]:
# Hold out 20% of the rows for testing; stratify so that both parts keep the
# class proportions of the full dataset, and seed the shuffle for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

feature_names = list(X_train.columns)
print(f"Training set : {len(X_train)} samples")
print(f"Test set     : {len(X_test)} samples")
print(f"Features     : {len(feature_names)}")


## 6. Helper Functions


In [None]:
def build_preprocessor(feature_names, use_scaler=False):
    """Build the preprocessing stage shared by all model pipelines.

    Args:
        feature_names: Columns to route through the numeric sub-pipeline.
            Columns not listed are dropped.
        use_scaler: When true, a ``StandardScaler`` is applied after the
            median imputer.

    Returns:
        An unfitted ``ColumnTransformer`` with a single ``"num"`` transformer.
    """
    scaler_steps = [("scaler", StandardScaler())] if use_scaler else []
    numeric_pipeline = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), *scaler_steps]
    )
    return ColumnTransformer(
        transformers=[("num", numeric_pipeline, feature_names)],
        remainder="drop",
    )

@dataclass
class ModelResult:
    """Evaluation scores of one model, as filled in by ``compute_metrics``."""
    # Display name of the model the scores belong to
    model_name: str
    # Result of accuracy_score on the predicted labels
    accuracy: float
    # Result of roc_auc_score; computed from probability scores, not hard labels
    auc: float
    # Result of precision_score on the predicted labels
    precision: float
    # Result of recall_score on the predicted labels
    recall: float
    # Result of f1_score on the predicted labels
    f1: float
    # Result of matthews_corrcoef on the predicted labels
    mcc: float

def compute_metrics(y_true, y_pred, y_prob, model_name):
    """Score one model's predictions and bundle the results.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels; used for every metric except AUC.
        y_prob: Scores handed to ``roc_auc_score``.
        model_name: Name stored alongside the scores.

    Returns:
        A ``ModelResult`` holding accuracy, AUC, precision, recall, F1 and MCC.
    """
    # Every label-based metric shares the (y_true, y_pred) call signature
    label_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
        ("mcc", matthews_corrcoef),
    )
    label_scores = {
        field: scorer(y_true, y_pred) for field, scorer in label_scorers
    }
    return ModelResult(
        model_name=model_name,
        auc=roc_auc_score(y_true, y_prob),
        **label_scores,
    )

def train_and_evaluate(name, pipeline):
    """Fit a pipeline on the training split and report its test-set performance.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for evaluation. Prints the metric values and a
    classification report, then draws the confusion matrix and the ROC curve
    side by side.

    Args:
        name: Label used in the printed output, plot titles and the result.
        pipeline: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns:
        ``(result_dict, pipeline)``: the metrics as a plain dict and the fitted
        pipeline (the same object that was passed in).
    """
    class_labels = ["malignant", "benign"]

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    # Column 1 of predict_proba is used as the score for AUC / ROC
    scores = pipeline.predict_proba(X_test)[:, 1]

    result = compute_metrics(y_test, predictions, scores, name)
    result_dict = asdict(result)

    # Metric listing under a banner carrying the model name
    banner = "=" * 55
    print(f"\n{banner}")
    print(f"  {name}")
    print(banner)
    for metric, value in result_dict.items():
        if metric == "model_name":
            continue
        print(f"  {metric:>10s} : {value:.4f}")

    # Per-class precision / recall / F1
    print("\n  Classification Report:")
    print(classification_report(y_test, predictions, target_names=class_labels))

    # Left panel: confusion matrix; right panel: ROC curve
    fig, (cm_ax, roc_ax) = plt.subplots(1, 2, figsize=(10, 3.5))

    cm = confusion_matrix(y_test, predictions)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=class_labels)
    cm_display.plot(ax=cm_ax, cmap="Blues", colorbar=False)
    cm_ax.set_title(f"{name} — Confusion Matrix")

    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_ax.plot(fpr, tpr, lw=2, label=f"AUC = {result.auc:.4f}")
    # Diagonal reference line
    roc_ax.plot([0, 1], [0, 1], "k--", lw=1)
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title(f"{name} — ROC Curve")
    roc_ax.legend(loc="lower right")

    plt.tight_layout()
    plt.show()

    return result_dict, pipeline

# Accumulators filled by the model cells below:
#   all_results - one metrics dict per model
#   all_models  - fitted pipeline keyed by model name
all_results, all_models = [], {}

print("Helpers defined ✅")


## 7a. Model 1 — Logistic Regression


In [None]:
# Logistic regression on imputed and standardised features
lr_pipe = Pipeline(
    steps=[
        ("preprocess", build_preprocessor(feature_names, use_scaler=True)),
        (
            "classifier",
            LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
        ),
    ]
)
res, model = train_and_evaluate("Logistic Regression", lr_pipe)
# Record the fitted pipeline and its scores for the comparison cells
all_models["Logistic Regression"] = model
all_results.append(res)


## 7b. Model 2 — Decision Tree


In [None]:
# Decision tree on imputed features; no scaling step is added
dt_pipe = Pipeline(
    steps=[
        ("preprocess", build_preprocessor(feature_names, use_scaler=False)),
        (
            "classifier",
            DecisionTreeClassifier(random_state=RANDOM_STATE),
        ),
    ]
)
res, model = train_and_evaluate("Decision Tree", dt_pipe)
# Record the fitted pipeline and its scores for the comparison cells
all_models["Decision Tree"] = model
all_results.append(res)


## 7c. Model 3 — K-Nearest Neighbors (kNN)


In [None]:
# k-nearest neighbours (k = 7) on imputed and standardised features
knn_pipe = Pipeline(
    steps=[
        ("preprocess", build_preprocessor(feature_names, use_scaler=True)),
        (
            "classifier",
            KNeighborsClassifier(n_neighbors=7),
        ),
    ]
)
res, model = train_and_evaluate("kNN", knn_pipe)
# Record the fitted pipeline and its scores for the comparison cells
all_models["kNN"] = model
all_results.append(res)


## 7d. Model 4 — Naive Bayes (Gaussian)


In [None]:
# Gaussian naive Bayes on imputed features; no scaling step is added
nb_pipe = Pipeline(
    steps=[
        ("preprocess", build_preprocessor(feature_names, use_scaler=False)),
        (
            "classifier",
            GaussianNB(),
        ),
    ]
)
res, model = train_and_evaluate("Naive Bayes", nb_pipe)
# Record the fitted pipeline and its scores for the comparison cells
all_models["Naive Bayes"] = model
all_results.append(res)


## 7e. Model 5 — Random Forest (Ensemble)


In [None]:
# Random forest of 400 trees on imputed features, using all available cores
rf_pipe = Pipeline(
    steps=[
        ("preprocess", build_preprocessor(feature_names, use_scaler=False)),
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=400,
                random_state=RANDOM_STATE,
                n_jobs=-1,
            ),
        ),
    ]
)
res, model = train_and_evaluate("Random Forest (Ensemble)", rf_pipe)
# Record the fitted pipeline and its scores for the comparison cells
all_models["Random Forest (Ensemble)"] = model
all_results.append(res)


## 7f. Model 6 — XGBoost (Ensemble)


In [None]:
# Gradient-boosted trees (XGBoost) on imputed features
xgb_pipe = Pipeline(
    steps=[
        ("preprocess", build_preprocessor(feature_names, use_scaler=False)),
        (
            "classifier",
            XGBClassifier(
                # Ensemble size, tree depth and shrinkage
                n_estimators=350,
                max_depth=4,
                learning_rate=0.05,
                # Row / column sub-sampling
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=RANDOM_STATE,
                objective="binary:logistic",
                eval_metric="logloss",
                n_jobs=-1,
            ),
        ),
    ]
)
res, model = train_and_evaluate("XGBoost (Ensemble)", xgb_pipe)
# Record the fitted pipeline and its scores for the comparison cells
all_models["XGBoost (Ensemble)"] = model
all_results.append(res)


## 8. Comparison Table — All 6 Models


In [None]:
# One row per model, best accuracy first
metrics_df = (
    pd.DataFrame(all_results)
    .sort_values("accuracy", ascending=False)
    .reset_index(drop=True)
)

# Styled view: four decimals everywhere, darker green = higher value per column
metrics_df.style.format(
    dict.fromkeys(["accuracy", "auc", "precision", "recall", "f1", "mcc"], "{:.4f}")
).background_gradient(
    cmap="Greens",
    subset=["accuracy", "auc", "precision", "recall", "f1", "mcc"],
)


## 9. Visual Comparison — Bar Charts


In [None]:
metric_cols = ["accuracy", "auc", "precision", "recall", "f1", "mcc"]
plot_df = metrics_df.set_index("model_name")[metric_cols]

# One panel per metric, laid out on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(16, 8))
colors = sns.color_palette("viridis", len(plot_df))

for ax, col in zip(axes.ravel(), metric_cols):
    # Bars are ordered by score, so the palette runs from worst to best model
    scores = plot_df[col].sort_values()
    scores.plot.barh(ax=ax, color=colors)
    ax.set_title(col.upper(), fontsize=12, fontweight="bold")
    # Zoom in on 0.8-1.0 by default, but widen the axis whenever a model
    # scores lower: a fixed lower limit of 0.8 would clip that bar (and
    # possibly the mean line) out of view.
    ax.set_xlim(min(0.8, scores.min() - 0.02), 1.0)
    ax.axvline(x=scores.mean(), color="red", linestyle="--", lw=1, label="mean")
    ax.legend(fontsize=8)

plt.suptitle("Model Comparison — Evaluation Metrics", fontsize=14, fontweight="bold", y=1.01)
plt.tight_layout()
plt.show()


## 10. ROC Curves — All Models Overlaid


In [None]:
fig, ax = plt.subplots(figsize=(7, 5))

# One ROC curve per stored model, scored on the test split
for name, pipe in all_models.items():
    y_prob = pipe.predict_proba(X_test)[:, 1]
    auc_val = roc_auc_score(y_test, y_prob)
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    ax.plot(fpr, tpr, lw=2, label=f"{name} (AUC={auc_val:.4f})")

# Diagonal reference line
ax.plot([0, 1], [0, 1], "k--", lw=1)
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curves — All Models",
)
ax.legend(fontsize=8, loc="lower right")
fig.tight_layout()
plt.show()


## 11. Save Models (.pkl files), Metrics CSV & Confusion Matrices


In [None]:
# Persist every fitted pipeline. The file name is the model name lower-cased,
# with spaces turned into underscores and parentheses removed.
for name, pipe in all_models.items():
    fname = name.lower().translate(str.maketrans(" ", "_", "()")) + ".pkl"
    joblib.dump(pipe, MODEL_DIR / fname)
    print(f"  Saved: {fname}")

# Comparison table as CSV
metrics_df.to_csv(MODEL_DIR / "model_metrics.csv", index=False)
print("  Saved: model_metrics.csv")

# Test-set confusion matrix of every model, as nested lists keyed by model name
cm_dict = {}
for name, pipe in all_models.items():
    y_pred = pipe.predict(X_test)
    cm_dict[name] = confusion_matrix(y_test, y_pred).tolist()

with (MODEL_DIR / "confusion_matrices.json").open("w") as f:
    f.write(json.dumps(cm_dict, indent=2))
print("  Saved: confusion_matrices.json")


## 12. Save Train/Test CSVs & Metadata


In [None]:
# Test split: features plus a trailing "target" column
test_df = X_test.assign(target=y_test.values)
test_df.to_csv(DATA_DIR / "test_data.csv", index=False)
print(f"  Saved: {DATA_DIR / 'test_data.csv'}")

# Training split, same layout
train_df = X_train.assign(target=y_train.values)
train_df.to_csv(DATA_DIR / "train_data.csv", index=False)
print(f"  Saved: {DATA_DIR / 'train_data.csv'}")

# Dataset description, stored next to the models
metadata = dict(
    dataset_name="UCI Breast Cancer Wisconsin (Diagnostic)",
    instances=int(X.shape[0]),
    features=int(X.shape[1]),
    target_names=dataset.target_names.tolist(),
    feature_names=feature_names,
    random_state=RANDOM_STATE,
)
with (MODEL_DIR / "dataset_metadata.json").open("w") as f:
    f.write(json.dumps(metadata, indent=2))
print("  Saved: dataset_metadata.json")


## 13. Final Verification


In [None]:
print("Saved artifacts:\n")

# Files in model/, skipping sub-directories and names starting with "__"
for p in sorted(MODEL_DIR.glob("*")):
    if p.name.startswith("__") or not p.is_file():
        continue
    print(f"  model/{p.name:40s}  {p.stat().st_size/1024:>8.1f} KB")

# Files in data/, skipping sub-directories
for p in filter(Path.is_file, sorted(DATA_DIR.glob("*"))):
    print(f"  data/{p.name:41s}  {p.stat().st_size/1024:>8.1f} KB")

print("\n✅ All done! Models trained, evaluated, and saved.")
