# Gradient Boosting Classifier — Optimized (SMOTE + GridSearchCV)

**Final: Wide (accurate but slower) GridSearchCV with SMOTE**

This notebook re-creates the training pipeline used in `Model_Phase3.ipynb`:
- SMOTE oversampling on the training set
- GridSearchCV with a wide parameter grid (accurate but slower)
- Full evaluation and model saving

> Make sure `data.csv` is in the same folder and replace `target` with your label column name if different.

In [None]:
# 1) Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')


In [None]:
# 2) Load dataset
# The CSV is expected to sit next to this notebook and to be the same
# dataset that was used for the original training run.
csv_path = "data.csv"
data = pd.read_csv(csv_path)

n_rows, n_cols = data.shape
print("Data shape:", (n_rows, n_cols))
data.head()

In [None]:
# 3) Separate features and target
# TARGET_COL names the label column; change it if your dataset uses another name.
TARGET_COL = 'target'

# Everything except the label column is treated as a feature.
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]

print("X shape:", X.shape)
class_counts = y.value_counts()
print("y distribution:\n", class_counts)

In [None]:
# 4) Train / Test split (80% train, 20% test)
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
# value_counts() works for any label dtype (strings, floats, negative ints) and
# shows which count belongs to which label; np.bincount only accepts
# non-negative integers and would raise on anything else.
print("Train distribution:", y_train.value_counts().sort_index().to_dict())

In [None]:
# 5) Setup pipeline: Scaler -> SMOTE -> GradientBoostingClassifier
# SMOTE picks nearest neighbours by distance, so features must be on a common
# scale *before* oversampling; otherwise large-valued columns dominate the
# neighbour search. Placing the scaler after SMOTE gives no benefit, because
# tree-based boosting is insensitive to monotonic feature scaling.
# NOTE(review): this changes the step order relative to the earlier
# SMOTE -> Scaler layout, so scores will not match that run exactly.
smote = SMOTE(random_state=42)
scaler = StandardScaler()
gb = GradientBoostingClassifier(random_state=42)

# Step names are unchanged, so 'gb__*' grid parameters and
# named_steps['gb'] lookups keep working.
pipeline = ImbPipeline([
    ('scaler', scaler),
    ('smote', smote),
    ('gb', gb)
])

# We'll search hyperparameters for the 'gb' step using GridSearchCV.


In [None]:
# 6) Wide parameter grid for accurate (but slower) search
# Keys carry the 'gb__' prefix so they are routed to the pipeline step named 'gb'.
param_grid = dict(
    gb__n_estimators=[100, 200, 300, 500],
    gb__learning_rate=[0.01, 0.05, 0.1, 0.2],
    gb__max_depth=[3, 4, 5, 6],
    gb__subsample=[0.6, 0.8, 1.0],
    gb__min_samples_leaf=[1, 3, 5],
)

# Number of candidate combinations = product of the per-parameter list lengths.
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)

print("Grid sizes (approx):", n_candidates)

In [None]:
# 7) GridSearchCV (accurate but slower)
# Search settings are collected in one place so they are easy to tweak.
search_settings = dict(
    scoring='f1_macro',  # optimize for balanced F1 across classes
    cv=5,
    n_jobs=-1,
    verbose=2,
    refit=True,
)
grid_search = GridSearchCV(pipeline, param_grid, **search_settings)

# Exhaustive search over every candidate; expect this to run for a long time.
grid_search.fit(X_train, y_train)

# With refit=True the best candidate is refitted and exposed as best_estimator_.
best_model = grid_search.best_estimator_

print("\nBest params:")
print(grid_search.best_params_)
print("\nBest CV score (f1_macro):", grid_search.best_score_)

In [None]:
# 8) Evaluate on the test set
# SMOTE only resamples while the pipeline is being fitted. When predicting, the
# imblearn pipeline bypasses the sampler, so X_test is scored as-is (after
# scaling) and the metrics below reflect the real class distribution.
y_pred = best_model.predict(X_test)

# Decide once whether this is a binary or a multiclass problem; every metric
# below has to agree on that.
is_binary = len(np.unique(y)) == 2
average = 'binary' if is_binary else 'macro'

# Per-class probabilities (one column per class), when the model exposes them.
proba_matrix = best_model.predict_proba(X_test) if hasattr(best_model, 'predict_proba') else None

# y_proba is the 1-D score of the second class and is only meaningful for
# binary problems. It stays None for multiclass so that later cells which
# guard on `y_proba is not None` skip binary-only plots.
y_proba = proba_matrix[:, 1] if (proba_matrix is not None and is_binary) else None

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=average, zero_division=0)
rec = recall_score(y_test, y_pred, average=average, zero_division=0)
f1 = f1_score(y_test, y_pred, average=average, zero_division=0)

if proba_matrix is None:
    roc_auc = None
elif is_binary:
    roc_auc = roc_auc_score(y_test, y_proba)
else:
    # Multiclass AUC needs the full probability matrix and an explicit
    # strategy; passing a single column would raise.
    roc_auc = roc_auc_score(y_test, proba_matrix, multi_class='ovr', average='macro')

print("✅ Optimized Gradient Boosting (with SMOTE + GridSearchCV)")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
if roc_auc is not None:
    print(f"ROC-AUC  : {roc_auc:.4f}")

print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# 9) Confusion matrix
# Pin the class order explicitly so that matrix rows/columns and the tick
# labels agree, and so the axes show the real class labels instead of
# positional indices (which are misleading when labels are not 0..n-1).
class_labels = np.unique(y)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues', cbar=False,
    xticklabels=class_labels, yticklabels=class_labels, ax=ax
)
ax.set_title("Confusion Matrix - Optimized Gradient Boosting")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()

# 10) ROC curve
# roc_curve only supports binary targets, so the plot is skipped for
# multiclass problems instead of raising.
roc_labels = np.unique(y)
if y_proba is not None and len(roc_labels) == 2:
    # y_proba holds the probability of the second class in sorted label order;
    # name that class as positive explicitly so labels other than {0, 1} work.
    fpr, tpr, _ = roc_curve(y_test, y_proba, pos_label=roc_labels[1])
    roc_auc_val = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc_val:.4f})')
    ax.plot([0, 1], [0, 1], '--')  # chance-level diagonal for reference
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve - Optimized Gradient Boosting')
    ax.legend(loc='lower right')
    fig.tight_layout()
    plt.show()


In [None]:
# 11) Feature importance (from the 'gb' step)
# Pull the fitted GradientBoostingClassifier out of the best pipeline and pair
# its importances with the original feature column names.
gb_step = best_model.named_steps['gb']
importances = (
    pd.Series(gb_step.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Only the 20 highest-ranked features are plotted to keep the chart readable.
top_importances = importances.head(20)

fig, ax = plt.subplots(figsize=(10, 6))
top_importances.plot(kind='bar', ax=ax)
ax.set_title("Top 20 Feature Importances - Optimized Gradient Boosting")
fig.tight_layout()
plt.show()

# 12) Save the best pipeline/model
# The whole fitted pipeline is persisted, not only the classifier step, so the
# saved artifact can be applied to raw feature rows. The output path is defined
# once so the file written and the message printed cannot drift apart.
MODEL_PATH = "gradient_boosting_optimized_pipeline.pkl"
joblib.dump(best_model, MODEL_PATH)
print(f"\n💾 Saved optimized pipeline as '{MODEL_PATH}'")