here we test out dataset on different poweful ml models such as RandomForest, SVM, Boostings

In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

import pandas as pd
import numpy as np  
from pathlib import Path
import os
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import joblib


In [2]:
RAW_DATA_PATH = Path(os.getcwd()).parent / "data" / "raw" / "train_titanic.csv"
PROCESSED_DATA_PATH = Path(os.getcwd()).parent / "data" / "processed" / "titanic_processed.csv"

In [3]:
df= pd.read_csv(PROCESSED_DATA_PATH)
df.head()

Unnamed: 0,Sex,AgeGroup_median,Fare_log,Pclass,FamilySize_cluster,Title_transformed,Has_Cabin_Number,Survived
0,male,Adult,2.110213,3,MidSizeFamily,Mr,0,0
1,female,Adult,4.280593,1,MidSizeFamily,Mrs,1,1
2,female,Adult,2.188856,3,Alone,Miss,0,1
3,female,Adult,3.990834,1,MidSizeFamily,Mrs,1,1
4,male,Adult,2.202765,3,Alone,Mr,0,0


to choose which model to use, it is mostly decided by roc-auc metric. Roc-Auc gives the extent the model can distinguish target classes. we try to look for the best roc-auc with the default hyperparameters

In [4]:
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    cross_val_predict
)
from sklearn.metrics import roc_auc_score

def make_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns.tolist()

    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True))
    ])

    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols)
        ],
        remainder="drop"
    )
    return preprocessor


def make_pipeline(model, X: pd.DataFrame) -> Pipeline:
    preprocessor = make_preprocessor(X)
    return Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])


def evaluate_models_cv(
    X: pd.DataFrame,
    y: pd.Series,
    models: dict,
    n_splits: int = 5,
    seed: int = 42,
    return_oof: bool = True
) -> dict:
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    results = {}
    for name, model in models.items():
        pipe = make_pipeline(model, X)

        # cross_validate gives per-fold scores + timing
        cv_out = cross_validate(
            pipe, X, y,
            cv=cv,
            scoring="roc_auc",
            return_train_score=False,
            n_jobs=-1
        )
        fold_scores = cv_out["test_score"]
        mean_auc = float(np.mean(fold_scores))
        std_auc = float(np.std(fold_scores))

        oof_auc = None
        if return_oof:
            # For AUC we need scores (probabilities or decision function)
            # cross_val_predict supports method='predict_proba' or 'decision_function'.
            # We try predict_proba first; if not available, fall back to decision_function.
            try:
                oof_scores = cross_val_predict(
                    pipe, X, y, cv=cv, method="predict_proba", n_jobs=-1
                )[:, 1]
            except Exception:
                oof_scores = cross_val_predict(
                    pipe, X, y, cv=cv, method="decision_function", n_jobs=-1
                )
            oof_auc = float(roc_auc_score(y, oof_scores))

        results[name] = {
            "roc_auc_mean": mean_auc,
            "roc_auc_std": std_auc,
            "fold_scores": fold_scores,
            "oof_roc_auc": oof_auc,
            "fit_time_mean": float(np.mean(cv_out["fit_time"])),
            "score_time_mean": float(np.mean(cv_out["score_time"]))
        }

    return results


def print_cv_results(results: dict):
    rows = []
    for name, r in results.items():
        rows.append({
            "model": name,
            "roc_auc_mean": r["roc_auc_mean"],
            "roc_auc_std": r["roc_auc_std"],
            "oof_roc_auc": r["oof_roc_auc"],
            "fit_time_mean": r["fit_time_mean"]
        })
    summary = pd.DataFrame(rows).sort_values("roc_auc_mean", ascending=False)
    print(summary.to_string(index=False))

    print("\nPer-fold scores:")
    for name, r in results.items():
        print(f"\n{name}: {np.round(r['fold_scores'], 4)}")




In [5]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

target = "Survived"
X = df.drop(columns=[target])
y = df[target]

models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "GaussianNB": GaussianNB(),
    "SVC_RBF": SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42),
    "LGBMClassifier": LGBMClassifier(random_state=42, verbose=-1),
    "XGBClassifier": XGBClassifier(random_state=42, eval_metric="auc")
}

results = evaluate_models_cv(X, y, models=models, n_splits=5, seed=42, return_oof=True)
print_cv_results(results)

           model  roc_auc_mean  roc_auc_std  oof_roc_auc  fit_time_mean
  LGBMClassifier      0.880566     0.014194     0.879164       4.555291
GradientBoosting      0.878124     0.009720     0.877297       0.210541
   XGBClassifier      0.877670     0.018398     0.875614       0.336835
         SVC_RBF      0.868453     0.014430     0.870791       0.035320
    RandomForest      0.861516     0.018192     0.861308       0.146548
      GaussianNB      0.858991     0.017547     0.857226       0.026294

Per-fold scores:

RandomForest: [0.8899 0.8719 0.8364 0.858  0.8515]

GradientBoosting: [0.8908 0.887  0.8652 0.8775 0.8701]

GaussianNB: [0.8831 0.8663 0.837  0.8406 0.868 ]

SVC_RBF: [0.8964 0.867  0.8559 0.8632 0.8598]

LGBMClassifier: [0.8879 0.9006 0.8578 0.8751 0.8815]

XGBClassifier: [0.8847 0.8947 0.842  0.8805 0.8865]


from these results we can see that LGBMClassifier made the best, but problem it had is quite bigger roc_auc_std and it had trained more time others. here the best result was seen by the GradientBoosting. but the problem is when i start using the optuna the models such as Xgboost or lgbm will give better result and they mostly perform faster and better result with bigger datasets. So my decision falls into Xgboost(much less time than lgbm and similar accuracy with the top 2 models). What needs to be done now is to check if the results are stable when the df is shuffled with the target.

In [6]:
def sanity_check_shuffle_y(
    X: pd.DataFrame,
    y: pd.Series,
    model,
    seed: int = 42
) -> float:
    """
    Shuffle target; AUC should drop to ~0.50. If not, suspect leakage/bug.
    """
    rng = np.random.default_rng(seed)
    y_shuffled = pd.Series(rng.permutation(y.values), index=y.index)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    pipe = make_pipeline(model, X)

    scores = cross_validate(pipe, X, y_shuffled, cv=cv, scoring="roc_auc", n_jobs=-1)["test_score"]
    return float(np.mean(scores))

# Optional leakage sanity check on your best model:
leak_auc = sanity_check_shuffle_y(X, y, model=models["XGBClassifier"], seed=42)
print("Shuffle-y sanity AUC (should be ~0.50):", leak_auc)

Shuffle-y sanity AUC (should be ~0.50): 0.46913498532095865


Why it should be 0.5? Actually it is quite simple with the basic prob. in this section of code, we change the target randomly therefore it no longer connected with the X_train. when it happens our value should be arounf 0.5 which means that it is 50 percent prob. to guess the target correctly.

In [7]:
# Final training + ONE test evaluation pattern:
def train_final_and_eval_once(df: pd.DataFrame, target: str, best_model, seed: int = 42):
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )

    pipe = make_pipeline(best_model, X_train)
    pipe.fit(X_train, y_train)

    # Get scores for AUC
    try:
        test_scores = pipe.predict_proba(X_test)[:, 1]
    except Exception:
        test_scores = pipe.decision_function(X_test)

    test_auc = roc_auc_score(y_test, test_scores)
    return test_auc, pipe

test_auc, final_pipe = train_final_and_eval_once(df, "Survived", best_model=models["XGBClassifier"])
print("FINAL TEST ROC AUC:", test_auc)

FINAL TEST ROC AUC: 0.813965744400527


In [8]:
def make_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns.tolist()

    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True))
    ])

    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    return ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ],
        remainder="drop"
    )


def make_pipeline(model, X: pd.DataFrame) -> Pipeline:
    return Pipeline(steps=[
        ("preprocess", make_preprocessor(X)),
        ("model", model)
    ])


In [9]:
# ---------- Optuna objective ----------
def make_objective(X: pd.DataFrame, y: pd.Series, n_splits=5, seed=42):
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    def objective(trial: optuna.Trial) -> float:
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 20.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "gamma": trial.suggest_float("gamma", 0.0, 10.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 50.0, log=True),

            "eval_metric": "logloss",
            "tree_method": "hist",   
            "random_state": seed,
            "n_jobs": -1,
        }

        model = XGBClassifier(**params)

        pipe = make_pipeline(model, X)

        oof_scores = cross_val_predict(pipe, X, y, cv=cv, method="predict_proba", n_jobs=-1)[:, 1]
        auc = roc_auc_score(y, oof_scores)

        return auc

    return objective


In [10]:
# ---------- Run Optuna ----------
def tune_xgb_optuna(X, y, n_trials=50, seed=42):
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(make_objective(X, y, n_splits=5, seed=seed), n_trials=n_trials)

    print("Best AUC:", study.best_value)
    print("Best params:", study.best_params)
    return study

X = df.drop(columns=[target])
y = df[target]

study = tune_xgb_optuna(X, y, n_trials=50, seed=42)

[I 2026-01-24 00:27:02,398] A new study created in memory with name: no-name-3a058527-a8ec-41aa-b3ed-4d37f42f460f
[I 2026-01-24 00:27:02,631] Trial 0 finished with value: 0.8792248532685693 and parameters: {'n_estimators': 500, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_child_weight': 0.9466503798478175, 'subsample': 0.5780093202212182, 'colsample_bytree': 0.5779972601681014, 'gamma': 0.5808361216819946, 'reg_alpha': 0.6245760287469893, 'reg_lambda': 0.006763888939818983}. Best is trial 0 with value: 0.8792248532685693.
[I 2026-01-24 00:27:02,795] Trial 1 finished with value: 0.8678485071208684 and parameters: {'n_estimators': 767, 'learning_rate': 0.010725209743171996, 'max_depth': 10, 'min_child_weight': 5.596520861285641, 'subsample': 0.6061695553391381, 'colsample_bytree': 0.5909124836035503, 'gamma': 1.8340450985343382, 'reg_alpha': 5.472429642032198e-06, 'reg_lambda': 0.0012291273711520685}. Best is trial 0 with value: 0.8792248532685693.
[I 2026-01-24 00:27:02,886

Best AUC: 0.8920365576966095
Best params: {'n_estimators': 243, 'learning_rate': 0.03666482838050812, 'max_depth': 5, 'min_child_weight': 0.02562133314695501, 'subsample': 0.6593062469411188, 'colsample_bytree': 0.7473769928469448, 'gamma': 0.8656742942809635, 'reg_alpha': 1.630330926122681e-05, 'reg_lambda': 3.8117031449276377e-08}


without tuning the auc was about 0.87, it increases by 0.02

In [11]:
# ---------- Train final model and evaluate on test ----------
def train_final_once(df: pd.DataFrame, target: str, best_params: dict, seed: int = 42):
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )

    final_params = dict(best_params)
    final_params.update({
        "eval_metric": "logloss",
        "tree_method": "hist",
        "random_state": seed,
        "n_jobs": -1
    })

    model = XGBClassifier(**final_params)
    pipe = make_pipeline(model, X_train)
    pipe.fit(X_train, y_train)

    test_scores = pipe.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_scores)

    print("FINAL TEST AUC:", test_auc)
    return pipe, test_auc

final_pipe, test_auc = train_final_once(df, target, study.best_params, seed=42)

FINAL TEST AUC: 0.8432147562582346


In [12]:
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, confusion_matrix,
    precision_score, recall_score, f1_score, classification_report
)
seed = 42
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

best_model = XGBClassifier(
    **study.best_params,
    eval_metric="logloss",
    tree_method="hist",
    random_state=seed,
    n_jobs=-1
)

pipe = make_pipeline(best_model, X)  # uses your make_pipeline + preprocessing

oof_proba = cross_val_predict(pipe, X, y, cv=cv, method="predict_proba", n_jobs=-1)[:, 1]

print("OOF ROC AUC:", roc_auc_score(y, oof_proba))
print("OOF PR AUC (Average Precision):", average_precision_score(y, oof_proba))

OOF ROC AUC: 0.8920365576966095
OOF PR AUC (Average Precision): 0.8667218157217855


In [13]:
prec, rec, thresh = precision_recall_curve(y, oof_proba)

f1 = 2 * (prec[1:] * rec[1:]) / (prec[1:] + rec[1:] + 1e-12)

best_idx = np.argmax(f1)
best_threshold_f1 = thresh[best_idx]

print("Best threshold (max F1):", best_threshold_f1)
print("Precision:", prec[best_idx + 1], "Recall:", rec[best_idx + 1], "F1:", f1[best_idx])


Best threshold (max F1): 0.3768992
Precision: 0.7915492957746478 Recall: 0.8216374269005848 F1: 0.8063127690095432


In [14]:
threshold = best_threshold_f1  

pred = (oof_proba >= threshold).astype(int)

print("Confusion matrix:\n", confusion_matrix(y, pred))
print("Precision:", precision_score(y, pred))
print("Recall:", recall_score(y, pred))
print("F1:", f1_score(y, pred))
print("\nClassification report:\n", classification_report(y, pred))


Confusion matrix:
 [[474  75]
 [ 61 281]]
Precision: 0.7893258426966292
Recall: 0.8216374269005848
F1: 0.8051575931232091

Classification report:
               precision    recall  f1-score   support

           0       0.89      0.86      0.87       549
           1       0.79      0.82      0.81       342

    accuracy                           0.85       891
   macro avg       0.84      0.84      0.84       891
weighted avg       0.85      0.85      0.85       891



In [15]:
Path(Path(os.getcwd()).parent / "artifacts" / "model_data").mkdir(parents=True, exist_ok=True)

In [16]:
import joblib, json
MODEL_PATH = Path(os.getcwd()).parent / "artifacts" / "model_data" / "xgb_pipeline.joblib"
joblib.dump(final_pipe, MODEL_PATH)
DATA_PATH=Path(os.getcwd()).parent / "artifacts" / "model_data" / "threshold.json"
with open(DATA_PATH, "w") as f:
    json.dump({"threshold_f1": float(threshold)}, f)

In [24]:
best_params=study.best_params
out = Path(os.getcwd()).parent / "artifacts" / "model_data" / "best_params.json"
out.parent.mkdir(parents=True, exist_ok=True)

with out.open("w", encoding="utf-8") as f:
    json.dump(best_params, f, ensure_ascii=False, indent=2)


In [25]:
payload = {
    "best_value": float(study.best_value),
    "best_params": study.best_params
} 
out = Path(os.getcwd()).parent / "artifacts" / "model_data" / "best_score.json"
out.parent.mkdir(parents=True, exist_ok=True)
with out.open("w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)