In [1]:
# Cell 1 - imports & load processed data
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").values.ravel()
y_test  = pd.read_csv("../data/processed/y_test.csv").values.ravel()

print("Shapes:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("Columns:", X_train.columns.tolist())


Shapes: (560, 19) (141, 19) (560,) (141,)
Columns: ['a1_score', 'a2_score', 'a3_score', 'a4_score', 'a5_score', 'a6_score', 'a7_score', 'a8_score', 'a9_score', 'a10_score', 'age', 'gender', 'ethnicity', 'jundice', 'contry_of_res', 'used_app_before', 'age_desc', 'relation', 'age_group']


In [2]:
# Cell 2 - evaluation helper
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, classification_report

def evaluate_model(model, X_train, X_test, y_train, y_test, verbose=True):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    rec = recall_score(y_test, preds)
    roc = roc_auc_score(y_test, proba) if proba is not None else np.nan

    if verbose:
        print(classification_report(y_test, preds, digits=4))
        print(f"Acc: {acc:.4f}  F1: {f1:.4f}  Recall: {rec:.4f}  ROC_AUC: {roc:.4f}")
    return {"accuracy": acc, "f1": f1, "recall": rec, "roc_auc": roc}


In [3]:
# Cell 3 - define models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

models = {
    "Logistic Regression": LogisticRegression(max_iter=3000, class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(128,64), max_iter=2000, random_state=42)
}


In [4]:
# Cell 4 - train & evaluate
results = {}
for name, m in models.items():
    print("Training:", name)
    try:
        res = evaluate_model(m, X_train, X_test, y_train, y_test)
    except Exception as e:
        print("Training failed for", name, ":", e)
        res = {"accuracy": np.nan, "f1": np.nan, "recall": np.nan, "roc_auc": np.nan}
    results[name] = res

res_df = pd.DataFrame(results).T
res_df = res_df[["accuracy","f1","recall","roc_auc"]]
display(res_df)


Training: Logistic Regression
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       103
           1     1.0000    1.0000    1.0000        38

    accuracy                         1.0000       141
   macro avg     1.0000    1.0000    1.0000       141
weighted avg     1.0000    1.0000    1.0000       141

Acc: 1.0000  F1: 1.0000  Recall: 1.0000  ROC_AUC: 1.0000
Training: Random Forest
              precision    recall  f1-score   support

           0     0.9439    0.9806    0.9619       103
           1     0.9412    0.8421    0.8889        38

    accuracy                         0.9433       141
   macro avg     0.9426    0.9113    0.9254       141
weighted avg     0.9432    0.9433    0.9422       141

Acc: 0.9433  F1: 0.8889  Recall: 0.8421  ROC_AUC: 0.9964
Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0     0.9903    0.9903    0.9903       103
           1     0.9737    0.9737    0.9737        38

    accuracy                         0.9858       141
   macro avg     0.9820    0.9820    0.9820       141
weighted avg     0.9858    0.9858    0.9858       141

Acc: 0.9858  F1: 0.9737  Recall: 0.9737  ROC_AUC: 0.9995
Training: Neural Network
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       103
           1     1.0000    1.0000    1.0000        38

    accuracy                         1.0000       141
   macro avg     1.0000    1.0000    1.0000       141
weighted avg     1.0000    1.0000    1.0000       141

Acc: 1.0000  F1: 1.0000  Recall: 1.0000  ROC_AUC: 1.0000


Unnamed: 0,accuracy,f1,recall,roc_auc
Logistic Regression,1.0,1.0,1.0,1.0
Random Forest,0.943262,0.888889,0.842105,0.996423
XGBoost,0.985816,0.973684,0.973684,0.999489
Neural Network,1.0,1.0,1.0,1.0


In [5]:
# Cell 5 - select best model (force XGBoost as requested), refit and save
best_name = "XGBoost"
if best_name not in models:
    # fallback to best by F1
    best_name = res_df.sort_values(["f1","roc_auc"], ascending=False).index[0]
print("Selected best model:", best_name)

best_model = models[best_name]
# fit on train (again, to be safe)
best_model.fit(X_train, y_train)

# Save model
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, "../models/best_model.pkl")
print("Saved best_model.pkl (XGBoost).")


Selected best model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Saved best_model.pkl (XGBoost).


In [6]:
# Cell 6 - SHAP TreeExplainer for XGBoost (fast). Save explainer + values if possible.
import shap
import numpy as np

try:
    explainer = shap.TreeExplainer(best_model)
    # compute shap_values for the whole test set or a reasonable subset
    # full test is fine if small; otherwise subset to speed up
    subset = X_test if len(X_test) <= 1000 else X_test.sample(1000, random_state=42)
    shap_values = explainer(subset)

    # Save the explainer and shap_values (joblib usually works for TreeExplainer)
    joblib.dump({"explainer": explainer, "shap_values": shap_values, "X_test_subset": subset}, "../models/shap_explainer_and_values.pkl")
    print("Saved TreeExplainer and SHAP values to ../models/shap_explainer_and_values.pkl")
except Exception as e:
    # fallback: still try to save shap_values array only
    print("TreeExplainer save failed:", e)
    try:
        expl = shap.TreeExplainer(best_model)
        sv = expl(X_test[:200])
        joblib.dump({"shap_values": sv, "X_test_subset": X_test[:200]}, "../models/shap_explainer_and_values.pkl")
        print("Saved shap_values (fallback) to ../models/shap_explainer_and_values.pkl")
    except Exception as e2:
        print("SHAP fallback save failed:", e2)


  from .autonotebook import tqdm as notebook_tqdm


Saved TreeExplainer and SHAP values to ../models/shap_explainer_and_values.pkl


In [7]:
# Cell 7 - verify scaler & model feature expectations
scaler = joblib.load("../models/scaler.pkl")
print("Scaler dims:", len(scaler.mean_) if hasattr(scaler, "mean_") else "unknown")
m = joblib.load("../models/best_model.pkl")
# For XGBoost sklearn wrapper, feature names expected can be in booster:
try:
    feats_expected = getattr(m, "feature_names_in_", None)
    print("Model feature_names_in_:", feats_expected)
except Exception:
    print("Model loaded; feature name checking skipped.")


Scaler dims: 19
Model feature_names_in_: ['a1_score' 'a2_score' 'a3_score' 'a4_score' 'a5_score' 'a6_score'
 'a7_score' 'a8_score' 'a9_score' 'a10_score' 'age' 'gender' 'ethnicity'
 'jundice' 'contry_of_res' 'used_app_before' 'age_desc' 'relation'
 'age_group']
