Here we test our dataset on several powerful ML models, such as RandomForest, SVM, and gradient-boosting methods.

In [32]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

import pandas as pd
import numpy as np  
from pathlib import Path
import os
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import joblib


In [2]:
# Data files live under "<parent of the current working directory>/data".
RAW_DATA_PATH = Path.cwd().parent.joinpath("data", "raw", "train_titanic.csv")
PROCESSED_DATA_PATH = Path.cwd().parent.joinpath("data", "processed", "titanic_processed.csv")

In [3]:
# Load the processed Titanic table and preview its first rows.
# NOTE(review): the rendered preview is headed by "Unnamed: 0" -- confirm whether
# that is only the displayed row index or a real column saved into the CSV; if it
# is a real column it will end up among the model features.
df = pd.read_csv(filepath_or_buffer=PROCESSED_DATA_PATH)
df.head(n=5)

Unnamed: 0,Sex,AgeGroup_median,Fare_log,Pclass,FamilySize_cluster,Title_transformed,Has_Cabin_Number,Survived
0,male,Adult,2.110213,3,MidSizeFamily,Mr,0,0
1,female,Adult,4.280593,1,MidSizeFamily,Mrs,1,1
2,female,Adult,2.188856,3,Alone,Miss,0,1
3,female,Adult,3.990834,1,MidSizeFamily,Mrs,1,1
4,male,Adult,2.202765,3,Alone,Mr,0,0


To choose which model to use, we rely mostly on the ROC-AUC metric. ROC-AUC measures how well the model can distinguish between the target classes. We first look for the best ROC-AUC using the default hyperparameters.

In [None]:
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    cross_val_predict
)
from sklearn.metrics import roc_auc_score

def make_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer matched to the column dtypes of ``X``.

    Numeric columns are median-imputed and standardized. Object, category and
    bool columns are imputed with the constant ``"missing"`` and one-hot
    encoded (categories unseen during fit are ignored at transform time).
    Columns of any other dtype are dropped.

    Args:
        X: Feature frame; only its dtypes and column names are inspected.

    Returns:
        A ColumnTransformer that always produces a dense array.
    """
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    # "number" matches every numeric dtype (int8, uint8, float16, ...), not just
    # the four widths that used to be listed; columns outside that list were
    # silently discarded by remainder="drop". Timedeltas are excluded explicitly
    # so they keep being dropped rather than reaching the scaler.
    num_cols = X.select_dtypes(include="number", exclude="timedelta").columns.tolist()

    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True))
    ])

    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols)
        ],
        remainder="drop",
        # The one-hot block is sparse; with the default threshold the stacked
        # output switches to a sparse matrix once it is sparse enough, which
        # estimators that need dense input (e.g. GaussianNB) reject.
        # A threshold of 0 always returns a dense array.
        sparse_threshold=0
    )
    return preprocessor


def make_pipeline(model, X: pd.DataFrame) -> Pipeline:
    """Chain the preprocessor built for ``X`` with ``model`` in one Pipeline.

    The steps are named ``"preprocess"`` and ``"model"``; the returned
    pipeline is not fitted.
    """
    steps = [
        ("preprocess", make_preprocessor(X)),
        ("model", model),
    ]
    return Pipeline(steps=steps)


def _oof_positive_scores(pipe, X, y, cv):
    """Return out-of-fold scores for ``pipe`` that can be fed to ROC-AUC.

    The positive-class column of ``predict_proba`` is preferred. Only when the
    estimator does not offer that method is ``decision_function`` used
    instead; any other failure (e.g. an error while fitting) propagates.
    """
    try:
        proba = cross_val_predict(
            pipe, X, y, cv=cv, method="predict_proba", n_jobs=-1
        )
    except (AttributeError, NotImplementedError):
        # The estimator has no usable predict_proba.
        return cross_val_predict(
            pipe, X, y, cv=cv, method="decision_function", n_jobs=-1
        )
    return proba[:, 1]


def evaluate_models_cv(
    X: pd.DataFrame,
    y: pd.Series,
    models: dict,
    n_splits: int = 5,
    seed: int = 42,
    return_oof: bool = True
) -> dict:
    """Compare models by stratified k-fold ROC-AUC.

    Each model is wrapped in the shared preprocessing pipeline and scored with
    the same shuffled StratifiedKFold splitter.

    Args:
        X: Feature frame.
        y: Binary target aligned with ``X``.
        models: Mapping of display name -> unfitted estimator.
        n_splits: Number of CV folds.
        seed: Random state of the fold splitter.
        return_oof: If True, additionally compute one ROC-AUC over all
            out-of-fold scores (this runs a second cross-validation pass).

    Returns:
        Mapping of model name -> dict with keys ``roc_auc_mean``,
        ``roc_auc_std``, ``fold_scores``, ``oof_roc_auc`` (None when
        ``return_oof`` is False), ``fit_time_mean`` and ``score_time_mean``.
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    results = {}
    for name, model in models.items():
        pipe = make_pipeline(model, X)

        # cross_validate gives per-fold scores + timing
        cv_out = cross_validate(
            pipe, X, y,
            cv=cv,
            scoring="roc_auc",
            return_train_score=False,
            n_jobs=-1
        )
        fold_scores = cv_out["test_score"]

        oof_auc = None
        if return_oof:
            oof_auc = float(roc_auc_score(y, _oof_positive_scores(pipe, X, y, cv)))

        results[name] = {
            "roc_auc_mean": float(np.mean(fold_scores)),
            "roc_auc_std": float(np.std(fold_scores)),
            "fold_scores": fold_scores,
            "oof_roc_auc": oof_auc,
            "fit_time_mean": float(np.mean(cv_out["fit_time"])),
            "score_time_mean": float(np.mean(cv_out["score_time"]))
        }

    return results


def print_cv_results(results: dict) -> None:
    """Print a summary table and the per-fold scores of a CV comparison.

    Args:
        results: Mapping of model name -> metrics dict containing at least
            ``roc_auc_mean``, ``roc_auc_std``, ``oof_roc_auc``,
            ``fit_time_mean`` and ``fold_scores``.

    The table is sorted by ``roc_auc_mean`` in descending order; per-fold
    scores follow in the mapping's own order, rounded to 4 decimals. An empty
    mapping prints a short notice instead of failing on the sort.
    """
    if not results:
        # An empty frame has no "roc_auc_mean" column to sort by.
        print("No CV results to display.")
        return

    summary = pd.DataFrame(
        [
            {
                "model": name,
                "roc_auc_mean": r["roc_auc_mean"],
                "roc_auc_std": r["roc_auc_std"],
                "oof_roc_auc": r["oof_roc_auc"],
                "fit_time_mean": r["fit_time_mean"],
            }
            for name, r in results.items()
        ]
    ).sort_values("roc_auc_mean", ascending=False)
    print(summary.to_string(index=False))

    print("\nPer-fold scores:")
    for name, r in results.items():
        print(f"\n{name}: {np.round(r['fold_scores'], 4)}")




In [35]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Split the label column off from the feature columns.
target = "Survived"
y = df[target]
X = df.drop(columns=[target])

# Candidate classifiers, all compared under the same CV protocol.
# Insertion order is the order of the per-fold printout.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    GaussianNB=GaussianNB(),
    SVC_RBF=SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42),
    LGBMClassifier=LGBMClassifier(random_state=42, verbose=-1),
    XGBClassifier=XGBClassifier(random_state=42, eval_metric="auc"),
)

results = evaluate_models_cv(
    X,
    y,
    models=models,
    n_splits=5,
    seed=42,
    return_oof=True,
)
print_cv_results(results)

           model  roc_auc_mean  roc_auc_std  oof_roc_auc  fit_time_mean
  LGBMClassifier      0.880566     0.014194     0.879164       2.787186
GradientBoosting      0.878124     0.009720     0.877297       0.077810
   XGBClassifier      0.877670     0.018398     0.875614       0.278194
         SVC_RBF      0.868453     0.014430     0.870791       0.013178
    RandomForest      0.861516     0.018192     0.861308       0.096936
      GaussianNB      0.858991     0.017547     0.857226       0.010587

Per-fold scores:

RandomForest: [0.8899 0.8719 0.8364 0.858  0.8515]

GradientBoosting: [0.8908 0.887  0.8652 0.8775 0.8701]

GaussianNB: [0.8831 0.8663 0.837  0.8406 0.868 ]

SVC_RBF: [0.8964 0.867  0.8559 0.8632 0.8598]

LGBMClassifier: [0.8879 0.9006 0.8578 0.8751 0.8815]

XGBClassifier: [0.8847 0.8947 0.842  0.8805 0.8865]


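From these results we can see that LGBMClassifier achieved the highest mean ROC-AUC (0.8806), but it has a noticeably larger roc_auc_std than GradientBoosting (0.0142 vs. 0.0097) and took by far the longest to train (mean fit time of about 2.79 s per fold vs. 0.08 s). Overall, the best trade-off is offered by GradientBoosting: nearly the same mean ROC-AUC (0.8781), the lowest variation across folds, and a fast fit. What needs to be done now is a sanity check: shuffle the target so that it no longer matches the rows of df, and verify that the score drops to chance level.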

In [None]:
def sanity_check_shuffle_y(
    X: pd.DataFrame,
    y: pd.Series,
    model,
    seed: int = 42
) -> float:
    """
    Cross-validate ``model`` against a randomly permuted copy of ``y``.

    With the link between features and target destroyed, the mean ROC-AUC
    over the 5 stratified folds should land near 0.50; a clearly higher value
    points to leakage or a bug. ``seed`` drives both the permutation and the
    fold split. Returns the mean test ROC-AUC as a float.
    """
    # Keep the original index so the permuted labels stay aligned with X.
    permuted = np.random.default_rng(seed).permutation(y.values)
    y_shuffled = pd.Series(permuted, index=y.index)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv_out = cross_validate(
        make_pipeline(model, X),
        X,
        y_shuffled,
        cv=folds,
        scoring="roc_auc",
        n_jobs=-1,
    )
    return float(np.mean(cv_out["test_score"]))

# Optional leakage sanity check on the model picked from the CV comparison:
leak_auc = sanity_check_shuffle_y(
    X=X,
    y=y,
    model=models["GradientBoosting"],
    seed=42,
)
print("Shuffle-y sanity AUC (should be ~0.50):", leak_auc)

Shuffle-y sanity AUC (should be ~0.50): 0.46315892684085275


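Why should it be 0.5? It follows from basic probability. In this section of code we randomly permute the target, so it is no longer connected to the features X. When that happens the model cannot rank positive examples above negative ones any better than chance, so ROC-AUC (the probability that a randomly chosen positive example gets a higher score than a randomly chosen negative one) should be around 0.5. Small deviations from 0.5, like the 0.46 seen here, are expected from a single random shuffle; a value clearly above 0.5 would point to leakage or a bug.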

In [None]:
# Final training + ONE test evaluation pattern:
def train_final_and_eval_once(df: pd.DataFrame, target: str, best_model, seed: int = 42):
    """Fit the chosen model on a train split and score it once on a held-out split.

    Args:
        df: Full table containing the features and the ``target`` column.
        target: Name of the binary label column.
        best_model: Estimator to train. It is cloned first, so the object
            passed in is left unfitted and unchanged.
        seed: Random state of the stratified 80/20 train/test split.

    Returns:
        ``(test_auc, pipe)``: ROC-AUC on the held-out 20% and the fitted
        preprocessing + model pipeline.

    NOTE(review): the split is made here from the full ``df``. If model
    selection was run on that same full ``df``, the held-out rows were already
    seen during selection, so this score may be optimistic -- confirm the
    intended protocol.
    """
    # Local import so the cell works without touching the notebook's import cell.
    from sklearn.base import clone

    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )

    # Fit a fresh copy: fitting best_model directly would mutate the caller's
    # estimator (e.g. the shared entry in a models dict).
    pipe = make_pipeline(clone(best_model), X_train)
    pipe.fit(X_train, y_train)

    # Get scores for AUC; fall back only when predict_proba is unavailable.
    try:
        test_scores = pipe.predict_proba(X_test)[:, 1]
    except (AttributeError, NotImplementedError):
        test_scores = pipe.decision_function(X_test)

    test_auc = roc_auc_score(y_test, test_scores)
    return test_auc, pipe

# Train the selected model and report its single held-out score.
test_auc, final_pipe = train_final_and_eval_once(
    df=df,
    target="Survived",
    best_model=models["GradientBoosting"],
)
print("FINAL TEST ROC AUC:", test_auc)

FINAL TEST ROC AUC: 0.8375494071146246
