# Model Training, Selection & PKL Export (Classification)
This notebook trains multiple classification models, evaluates them rigorously, selects the best model, and exports it as a **.pkl** artifact.

✅ Input: `data/cleaned_students.csv` (preferred) OR `data/featured_students.csv`
✅ Output:
- `artifacts/best_model.pkl`
- `artifacts/label_mapping.json`
- `artifacts/model_report.csv`

Competition-ready features:
- Pipelines to prevent leakage
- Stratified split
- Cross-validation
- Hyperparameter tuning
- Confusion matrix + classification report
- Feature importance (where applicable)


## 0) Setup

In [None]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix,
    ConfusionMatrixDisplay
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

# Single seed reused as `random_state` by the train/test split, the CV splitter,
# the seeded estimators and the randomized search further down in this notebook.
RANDOM_STATE = 42
# Also seed NumPy's global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_STATE)

print("‚úÖ Imports done")

## 1) Load Data (cleaned preferred)

In [None]:
CLEAN_PATH = Path("data/cleaned_students.csv")
FEATURED_PATH = Path("data/featured_students.csv")

# Probe the candidates in priority order: the cleaned dataset is preferred and
# the featured dataset is only consulted when the cleaned one is missing.
_source_path = next(
    (candidate for candidate in (CLEAN_PATH, FEATURED_PATH) if candidate.exists()),
    None,
)
if _source_path is None:
    raise FileNotFoundError(
        "‚ùå Could not find data/cleaned_students.csv or data/featured_students.csv\n"
        "‚û°Ô∏è Run previous notebooks first."
    )

df = pd.read_csv(_source_path)
# Remember which file was used; later cells branch on it.
DATA_SOURCE = str(_source_path)

print("‚úÖ Loaded:", DATA_SOURCE)
print("Shape:", df.shape)
df.head()

## 2) Define Target & Features

In [None]:
TARGET_COL = "exam_score_class"

# Fail fast with an actionable message when the label column is absent.
if TARGET_COL not in df.columns:
    raise KeyError(
        f"‚ùå Target column '{TARGET_COL}' not found.\n"
        f"‚û°Ô∏è Ensure data_cleaning.ipynb created '{TARGET_COL}'."
    )

# Identifier column is removed from the features when it exists.
DROP_COLS = ["student_id"] if "student_id" in df.columns else []

# Pre-encoded detection: either the featured file was loaded, or the frame is
# wide (> 50 columns) and at least one column name contains an underscore.
column_count = df.shape[1]
many_cols = column_count > 50 and any("_" in col for col in df.columns)
loaded_featured_file = str(DATA_SOURCE).endswith("featured_students.csv")
IS_PREENCODED = loaded_featured_file or many_cols

print("DROP_COLS:", DROP_COLS)
print("IS_PREENCODED:", IS_PREENCODED)

# Separate the label from the feature matrix; both are independent copies of `df`.
y = df[TARGET_COL].copy()
X = df.drop(columns=[*DROP_COLS, TARGET_COL], errors="ignore").copy()

print("X shape:", X.shape, "| y shape:", y.shape)

## 3) Preprocessing Pipeline (if not pre-encoded)
If `featured_students.csv` is used, we skip encoders and train directly.

In [None]:
def build_preprocessor(X: pd.DataFrame):
    """Build the ColumnTransformer used ahead of every model, plus column metadata.

    Columns are routed by dtype:

    * numeric (int64/float64/int32/float32) -> ``StandardScaler``
    * known ordinal text columns            -> ``OrdinalEncoder`` with a fixed order
    * remaining object/category columns     -> ``OneHotEncoder``

    Columns of any other dtype are not listed in a transformer and are therefore
    dropped by ``remainder="drop"``.

    Args:
        X: Feature frame (target and ID columns already removed).

    Returns:
        ``(preprocessor, meta)`` where ``preprocessor`` is an unfitted
        ``ColumnTransformer`` and ``meta`` maps ``"numeric_features"``,
        ``"ordinal_features"`` and ``"nominal_features"`` to lists of column names.
    """
    numeric_features = X.select_dtypes(
        include=["int64", "float64", "int32", "float32"]
    ).columns.tolist()
    categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # Explicit low-to-high ordering for each ordinal column.
    ordinal_categories_map = {
        "sleep_quality": ["poor", "average", "good"],
        "facility_rating": ["low", "medium", "high"],
        "exam_difficulty": ["easy", "moderate", "hard"],
    }

    # Only treat a known ordinal column as ordinal when it is still categorical.
    # If it was already numerically encoded upstream it is picked up as a numeric
    # feature; listing it here as well would send the same column to two
    # transformers, one of them an OrdinalEncoder expecting string categories.
    categorical_lookup = set(categorical_features)
    ordinal_features = [c for c in ordinal_categories_map if c in categorical_lookup]
    nominal_features = [c for c in categorical_features if c not in ordinal_features]
    ordinal_categories = [ordinal_categories_map[c] for c in ordinal_features]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=[("scaler", StandardScaler())]), numeric_features),
            (
                "ord",
                # Unseen ordinal levels are encoded as -1 instead of raising.
                OrdinalEncoder(
                    categories=ordinal_categories,
                    handle_unknown="use_encoded_value",
                    unknown_value=-1,
                ),
                ordinal_features,
            ),
            # Unseen nominal levels become an all-zero one-hot row.
            ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_features),
        ],
        remainder="drop",
    )

    meta = {
        "numeric_features": numeric_features,
        "ordinal_features": ordinal_features,
        "nominal_features": nominal_features,
    }
    return preprocessor, meta

# Pre-encoded data is fed to the models as-is, so no preprocessor is created
# and the column metadata stays empty.
if IS_PREENCODED:
    preprocessor, meta = None, {}
    print("‚úÖ Data appears pre-encoded; skipping preprocessing encoders")
else:
    preprocessor, meta = build_preprocessor(X)
    print("‚úÖ Preprocessor built")
    print(meta)

## 4) Train/Test Split (Stratified)

In [None]:
# Hold out 20% of the rows; stratifying on y keeps the class proportions of the
# full dataset in both partitions.
_split_options = {"test_size": 0.2, "random_state": RANDOM_STATE, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Train class distribution:\n", y_train.value_counts(normalize=True).round(3))

## 5) Define Models (Baseline + Strong Models)

In [None]:
# Candidate estimators keyed by the short name that the leaderboard and the
# `param_grids` search spaces use to refer to them.
models = {
    # Linear baseline; iteration cap raised to 2000 and classes re-weighted.
    "LogReg": LogisticRegression(max_iter=2000, class_weight="balanced"),
    # Seeded tree ensemble with class re-weighting.
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced"),
    # Seeded boosting ensemble (no class_weight is passed here).
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # SVC with default kernel; probability=True so probability estimates are available.
    "SVC_RBF": SVC(probability=True, class_weight="balanced", random_state=RANDOM_STATE),
    # k-nearest neighbours with library defaults.
    "KNN": KNeighborsClassifier()
}
models

## 6) Cross-Validation Benchmark (Quick Leaderboard)
We use **F1-macro** as primary metric (robust for class imbalance).

In [None]:
# Shared 5-fold stratified splitter; also reused by the tuning step.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


def _benchmark(model_name, estimator):
    """Cross-validate one candidate on the training split and summarise macro-F1."""
    # Raw data gets the preprocessor as a pipeline step in front of the model;
    # pre-encoded data is scored with the bare estimator.
    candidate = (
        estimator
        if IS_PREENCODED
        else Pipeline(steps=[("prep", preprocessor), ("model", estimator)])
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring="f1_macro")
    return {
        "model": model_name,
        "cv_f1_macro_mean": fold_scores.mean(),
        "cv_f1_macro_std": fold_scores.std(),
    }


leaderboard = [_benchmark(name, model) for name, model in models.items()]

# Best mean macro-F1 first.
leaderboard_df = pd.DataFrame(leaderboard).sort_values("cv_f1_macro_mean", ascending=False)
display(leaderboard_df)

## 7) Hyperparameter Tuning (Top 2 Models)
We tune the top performers from the quick leaderboard.

In [None]:
# The two best models from the CV leaderboard are tuned.
top_models = leaderboard_df["model"].head(2).tolist()
print("Top models for tuning:", top_models)

# Single source of truth for the search spaces. Keys carry the "model__" prefix
# so they address the "model" step of the preprocessing pipeline; the prefix is
# stripped below when a bare estimator is tuned on pre-encoded data.
param_grids = {
    "LogReg": {
        "model__C": np.logspace(-3, 2, 20),
        "model__penalty": ["l2"]
    },
    "RandomForest": {
        "model__n_estimators": [200, 400, 600],
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4]
    },
    "GradientBoosting": {
        "model__n_estimators": [100, 200, 300],
        "model__learning_rate": [0.01, 0.05, 0.1],
        "model__max_depth": [2, 3, 4]
    },
    "SVC_RBF": {
        "model__C": np.logspace(-2, 2, 10),
        "model__gamma": ["scale", "auto"]
    },
    "KNN": {
        "model__n_neighbors": list(range(3, 30, 2)),
        "model__weights": ["uniform", "distance"]
    }
}

MODEL_STEP_PREFIX = "model__"
# Upper bound on sampled candidates per model.
MAX_SEARCH_ITERATIONS = 25


def _bare_param_space(space):
    """Return `space` with the pipeline step prefix removed from every key."""
    return {
        (key[len(MODEL_STEP_PREFIX):] if key.startswith(MODEL_STEP_PREFIX) else key): values
        for key, values in space.items()
    }


best_estimators = {}

for name in top_models:
    base_model = models[name]

    if IS_PREENCODED:
        # Tune the estimator directly (no preprocessor); every model, including
        # GradientBoosting, now gets the same space as in the pipeline branch.
        pipe = base_model
        search_space = _bare_param_space(param_grids[name])
    else:
        pipe = Pipeline(steps=[("prep", preprocessor), ("model", base_model)])
        search_space = param_grids[name]

    # All spaces are finite lists/arrays, so the number of distinct candidates is
    # the product of their lengths. Never request more iterations than that:
    # it avoids the "space smaller than n_iter" warning on small grids.
    grid_size = int(np.prod([len(values) for values in search_space.values()]))

    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=search_space,
        n_iter=min(MAX_SEARCH_ITERATIONS, grid_size),
        cv=cv,
        scoring="f1_macro",
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    best_estimators[name] = search.best_estimator_
    print(f"‚úÖ Best {name}:", search.best_params_, "| best CV f1_macro:", search.best_score_)

best_estimators

## 8) Evaluate Best Models on Test Set

In [None]:
def _test_metrics(model_name, estimator):
    """Score one tuned estimator on the held-out test split."""
    predictions = estimator.predict(X_test)
    return {
        "model": model_name,
        "test_accuracy": accuracy_score(y_test, predictions),
        "test_f1_macro": f1_score(y_test, predictions, average="macro"),
        "test_f1_weighted": f1_score(y_test, predictions, average="weighted"),
    }


results = [_test_metrics(name, est) for name, est in best_estimators.items()]

# Highest test macro-F1 first.
results_df = pd.DataFrame(results).sort_values("test_f1_macro", ascending=False)
display(results_df)

## 9) Best Model Selection + Detailed Report

In [None]:
# First row of `results_df` is the winner.
# NOTE(review): `results_df` is ranked by test-set macro-F1, so the winner is
# chosen on the same data it is reported on; the figures below are likely optimistic.
best_model_name = results_df["model"].iloc[0]
best_model = best_estimators[best_model_name]

print("üèÜ Best model:", best_model_name)

y_pred = best_model.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Use every class present in the full target so the matrix axes stay stable.
class_labels = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(values_format="d")
plt.title(f"Confusion Matrix - {best_model_name}")
plt.show()

## 10) Save Artifacts (PKL + Mapping + Report)

In [None]:
from joblib import dump

# All artifacts are written below ./artifacts (created on demand).
ART_DIR = Path("artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH = ART_DIR / "best_model.pkl"
LABEL_PATH = ART_DIR / "label_mapping.json"
REPORT_PATH = ART_DIR / "model_report.csv"

# Save model
dump(best_model, MODEL_PATH)
print("‚úÖ Saved model:", MODEL_PATH)

# Save label mapping
# .tolist() converts NumPy scalars (e.g. int64 class ids) to native Python
# values; json.dump raises TypeError on NumPy scalar types otherwise.
label_mapping = {"classes": sorted(y.unique().tolist())}
with open(LABEL_PATH, "w", encoding="utf-8") as f:
    json.dump(label_mapping, f, indent=2)
print("‚úÖ Saved label mapping:", LABEL_PATH)

# Save model report
results_df.to_csv(REPORT_PATH, index=False)
print("‚úÖ Saved report:", REPORT_PATH)

## 11) (Optional) Feature Importance
Works best for tree-based models. For pipelines, we try to extract feature names if possible.

In [None]:
def try_get_feature_names(prep, X_sample):
    """Best-effort list of output feature names for the fitted preprocessor.

    Names are assembled from the notebook-level ``meta`` dict in the order
    numeric, ordinal, then the one-hot names reported by the fitted ``"nom"``
    transformer of ``prep``. ``X_sample`` is accepted but not used.

    Returns:
        The list of names, or ``None`` (after printing the reason) if anything
        goes wrong while collecting them.
    """
    try:
        # Numeric and ordinal columns keep their original names.
        collected = list(meta.get("numeric_features", []))
        collected.extend(meta.get("ordinal_features", []))

        nominal_cols = meta.get("nominal_features", [])
        if len(nominal_cols) == 0:
            return collected

        # One-hot columns expand into one name per category.
        encoder = prep.named_transformers_["nom"]
        collected.extend(encoder.get_feature_names_out(nominal_cols))
        return collected
    except Exception as e:
        print("Could not extract feature names:", e)
        return None

# Unwrap the fitted estimator: bare model for pre-encoded data, otherwise the
# "model" step of the pipeline.
model_part = best_model if IS_PREENCODED else best_model.named_steps["model"]

# Any estimator exposing `feature_importances_` qualifies (RandomForest and
# GradientBoosting among this notebook's candidates), not only RandomForest.
if hasattr(model_part, "feature_importances_"):
    importances = model_part.feature_importances_
    if IS_PREENCODED:
        feat_names = X.columns.tolist()
    else:
        prep_part = best_model.named_steps["prep"]
        feat_names = try_get_feature_names(prep_part, X_train)

    # Only plot when every importance can be paired with a name.
    if feat_names is not None and len(feat_names) == len(importances):
        imp_df = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False).head(25)
        display(imp_df)

        plt.figure(figsize=(10, 6))
        # Reverse so the most important feature ends up at the top of the chart.
        plt.barh(imp_df["feature"][::-1], imp_df["importance"][::-1])
        plt.title(f"Top 25 Feature Importances ({best_model_name})")
        plt.xlabel("Importance")
        plt.tight_layout()
        plt.show()
    else:
        print("‚ö†Ô∏è Feature names mismatch; skipping plot.")
else:
    print(f"{best_model_name} does not expose feature_importances_; skipping feature importance.")

## Next Step
Next file: **app.py** (Streamlit competition app)

App will:
- Load `artifacts/best_model.pkl`
- Provide single + batch prediction
- Show probabilities + insights
- Offer downloads and a polished UI
