%% [markdown]
# 🚀 Notebook 6 — Advanced Experiments & Accuracy Boosting

Building on Notebooks 3–5, we now:
- Try advanced models (XGBoost, LightGBM)
- Apply class balancing (SMOTE / class_weight)
- Use feature importance for feature selection
- Compare all results against baseline

Goal: Push accuracy >90% while keeping interpretability in mind.

---

In [1]:
# %% [markdown]
# ## 6.1 Imports

import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, roc_auc_score, confusion_matrix,
    f1_score, accuracy_score
)

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Advanced models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

---

## 6.2 Load Saved Data Splits

We use the same splits saved in Notebook 4 to ensure fair comparison.

In [2]:
X_train, X_val, X_test, y_train, y_val, y_test = joblib.load("models/splits.pkl")
print("✅ Data splits loaded:", X_train.shape, X_val.shape, X_test.shape)

categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['int64','float64']).columns.tolist()


✅ Data splits loaded: (644, 13) (138, 13) (138, 13)


---

## 6.3 Define Preprocessor

Same as before (scaling numeric, OHE categorical).

In [3]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop='first', sparse_output=False), categorical_features)
])

---

## 6.4 Experiment 1: XGBoost + Preprocessing

In [4]:
pipeline_xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb", XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    ))
])

pipeline_xgb.fit(X_train, y_train)
y_pred_xgb = pipeline_xgb.predict(X_val)
y_proba_xgb = pipeline_xgb.predict_proba(X_val)[:,1]

print("\n📊 XGBoost Validation Performance")
print(classification_report(y_val, y_pred_xgb, digits=3))
print("ROC-AUC:", roc_auc_score(y_val, y_proba_xgb))

  if is_sparse(data):



📊 XGBoost Validation Performance
              precision    recall  f1-score   support

           0      0.656     0.656     0.656        61
           1      0.727     0.727     0.727        77

    accuracy                          0.696       138
   macro avg      0.692     0.692     0.692       138
weighted avg      0.696     0.696     0.696       138

ROC-AUC: 0.7585692995529061


In [5]:
pipeline_lgb = Pipeline([
    ("preprocessor", preprocessor),
    ("lgb", LGBMClassifier(
        n_estimators=300,
        max_depth=-1,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

pipeline_lgb.fit(X_train, y_train)
y_pred_lgb = pipeline_lgb.predict(X_val)
y_proba_lgb = pipeline_lgb.predict_proba(X_val)[:,1]

print("\n📊 LightGBM Validation Performance")
print(classification_report(y_val, y_pred_lgb, digits=3))
print("ROC-AUC:", roc_auc_score(y_val, y_proba_lgb))

[LightGBM] [Info] Number of positive: 356, number of negative: 288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 325
[LightGBM] [Info] Number of data points in the train set: 644, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.552795 -> initscore=0.211970
[LightGBM] [Info] Start training from score 0.211970

📊 LightGBM Validation Performance
              precision    recall  f1-score   support

           0      0.702     0.656     0.678        61
           1      0.741     0.779     0.759        77

    accuracy                          0.725       138
   macro avg      0.721     0.717     0.719       138
weighted avg      0.724     0.725     0.723       138

ROC-AUC: 0.7451564828614008


In [6]:

from sklearn.linear_model import LogisticRegression

pipeline_smote_lr = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("log_reg", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42))
])

pipeline_smote_lr.fit(X_train, y_train)
y_pred_smote_lr = pipeline_smote_lr.predict(X_val)
y_proba_smote_lr = pipeline_smote_lr.predict_proba(X_val)[:,1]

print("\n📊 Logistic Regression + SMOTE Validation Performance")
print(classification_report(y_val, y_pred_smote_lr, digits=3))
print("ROC-AUC:", roc_auc_score(y_val, y_proba_smote_lr))


📊 Logistic Regression + SMOTE Validation Performance
              precision    recall  f1-score   support

           0      0.615     0.656     0.635        61
           1      0.712     0.675     0.693        77

    accuracy                          0.667       138
   macro avg      0.664     0.666     0.664       138
weighted avg      0.669     0.667     0.668       138

ROC-AUC: 0.7715563125399191


In [7]:
# Fit XGBoost on full train
pipeline_xgb.fit(X_train, y_train)
feature_names = (
    numeric_features + 
    list(pipeline_xgb.named_steps["preprocessor"]
         .named_transformers_["cat"]
         .get_feature_names_out(categorical_features))
)
importances = pipeline_xgb.named_steps["xgb"].feature_importances_

feat_imp = pd.DataFrame({"feature": feature_names, "importance": importances})
feat_imp = feat_imp.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=feat_imp.head(15))
plt.title("XGBoost Feature Importances")
plt.show()

# Drop weak features
selected_features = feat_imp[feat_imp["importance"] > 0.01]["feature"].tolist()
print("Keeping", len(selected_features), "features out of", len(feature_names))


  if is_sparse(data):


NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.