In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

# Section banner for the classification half of the demo.
print("=" * 60, "GRADIENT BOOSTING CLASSIFICATION", "=" * 60 + "\n", sep="\n")

# Synthetic classification problem: 1000 samples, 20 features
# (15 informative + 5 redundant), seeded so every run sees the same data.
X_clf, y_clf = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42,
)

# Hold out 20% of the samples as a test set (seeded split).
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf,
    y_clf,
    random_state=42,
    test_size=0.2,
)

# Report the resulting split sizes and the feature count.
print(
    f"Training samples: {len(X_train_clf)}",
    f"Test samples: {len(X_test_clf)}",
    f"Features: {X_train_clf.shape[1]}\n",
    sep="\n",
)

# Configure the Gradient Boosting Classifier.
gb_clf = GradientBoostingClassifier(
    # Ensemble size and shrinkage
    n_estimators=100, learning_rate=0.1,
    # Shape of each individual tree
    max_depth=3, min_samples_split=2, min_samples_leaf=1,
    # Each stage is fitted on an 80% subsample of the training rows
    subsample=0.8,
    # Reproducibility; verbose=1 prints a progress table while fitting
    random_state=42, verbose=1,
)

# Announce first so the message appears above the verbose progress table.
print("Training Gradient Boosting Classifier...")
gb_clf.fit(X_train_clf, y_train_clf)

# Hard class labels and per-class probabilities for the held-out samples.
y_pred_clf = gb_clf.predict(X_test_clf)
y_pred_proba = gb_clf.predict_proba(X_test_clf)

# Test-set accuracy followed by the per-class report.
accuracy = accuracy_score(y_true=y_test_clf, y_pred=y_pred_clf)
print(f"\nAccuracy: {accuracy:.4f}", "\nClassification Report:", sep="\n")
print(classification_report(y_true=y_test_clf, y_pred=y_pred_clf))

# Cross-validation score (5 folds over the full dataset).
# cross_val_score clones the estimator for every fold and each clone keeps
# verbose=1, so passing gb_clf directly dumps five more progress tables into
# the middle of the report. Score an unfitted, silenced copy instead: the
# hyperparameters and random_state are untouched, so the scores are the same.
quiet_clf = clone(gb_clf).set_params(verbose=0)
cv_scores = cross_val_score(quiet_clf, X_clf, y_clf, cv=5)
print(f"Cross-validation scores: {cv_scores}")
# The "+/-" figure is two standard deviations of the fold scores.
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Feature importance: one score per input column, taken from the fitted model.
feature_importance = gb_clf.feature_importances_
importance_df = pd.DataFrame(
    {
        'Feature': [f'Feature_{idx}' for idx, _ in enumerate(feature_importance)],
        'Importance': feature_importance,
    }
)
# Keep only the ten highest-scoring features, most important first.
importance_df = importance_df.sort_values(by='Importance', ascending=False).iloc[:10]

print("\nTop 10 Feature Importances:", importance_df.to_string(index=False), sep="\n")

# Training deviance: train_score_ holds one loss value per boosting stage, so
# its last entry is the final training loss and its length is the number of
# stages actually fitted. (Per the scikit-learn docs, with subsample < 1 the
# loss is measured on each stage's in-bag sample.)
print(f"\nTraining deviance: {gb_clf.train_score_[-1]:.4f}")
print(f"Number of iterations: {gb_clf.train_score_.shape[0]}")

# Section banner for the regression half of the demo.
print("\n" + "=" * 60, "GRADIENT BOOSTING REGRESSION", "=" * 60 + "\n", sep="\n")

# Synthetic regression problem: 1000 samples, 20 features (15 informative),
# generated with noise=10 and a fixed seed.
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=20, n_informative=15, noise=10, random_state=42,
)

# Hold out 20% of the samples as a test set (seeded split).
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    random_state=42,
    test_size=0.2,
)

# Configure the Gradient Boosting Regressor.
gb_reg = GradientBoostingRegressor(
    # Objective minimised by the ensemble
    loss='squared_error',
    # Ensemble size and shrinkage
    n_estimators=100, learning_rate=0.1,
    # Shape of each individual tree
    max_depth=3, min_samples_split=2, min_samples_leaf=1,
    # Each stage is fitted on an 80% subsample of the training rows
    subsample=0.8,
    # Reproducibility; verbose=0 keeps the fit silent
    random_state=42, verbose=0,
)

print("Training Gradient Boosting Regressor...")
gb_reg.fit(X_train_reg, y_train_reg)

# Predictions for the held-out samples.
y_pred_reg = gb_reg.predict(X_test_reg)

# Error metrics on the test set; RMSE is derived from the MSE.
mse = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test_reg, y_pred=y_pred_reg)

print(
    f"\nMean Squared Error: {mse:.4f}",
    f"Root Mean Squared Error: {rmse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

# Feature importance for regression: one score per input column.
feature_importance_reg = gb_reg.feature_importances_
importance_df_reg = pd.DataFrame(
    {
        'Feature': [f'Feature_{idx}' for idx, _ in enumerate(feature_importance_reg)],
        'Importance': feature_importance_reg,
    }
)
# Keep only the ten highest-scoring features, most important first.
importance_df_reg = importance_df_reg.sort_values(by='Importance', ascending=False).iloc[:10]

print(
    "\nTop 10 Feature Importances (Regression):",
    importance_df_reg.to_string(index=False),
    sep="\n",
)

print("\n" + "="*60)
print("STAGED PREDICTIONS (Prediction at each boosting stage)")
print("="*60 + "\n")

# Staged predictions - useful for finding optimal n_estimators.
# staged_predict yields the test-set predictions after each boosting stage,
# so staged_accuracy[k - 1] is the accuracy of the first k trees.
staged_preds = list(gb_clf.staged_predict(X_test_clf))
staged_accuracy = [accuracy_score(y_test_clf, pred) for pred in staged_preds]

# Report the usual checkpoints (10, 50, 100) plus the final stage, but only
# those that exist: indexing [9]/[49]/[99] directly raises IndexError as soon
# as the ensemble has fewer than 100 stages (smaller n_estimators or early
# stopping). With the 100-stage model above the output is unchanged.
n_stages = len(staged_accuracy)
report_stages = sorted({stage for stage in (10, 50, 100) if stage <= n_stages} | {n_stages})
for stage in report_stages:
    print(f"Accuracy at stage {stage}: {staged_accuracy[stage - 1]:.4f}")

print("\nGradient Boosting model training complete!")

GRADIENT BOOSTING CLASSIFICATION

Training samples: 800
Test samples: 200
Features: 20

Training Gradient Boosting Classifier...
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2987           0.0740            3.37s
         2           1.2306           0.0726            2.38s
         3           1.1721           0.0484            2.04s
         4           1.1186           0.0642            1.92s
         5           1.0795           0.0820            1.94s
         6           1.0254           0.0082            2.06s
         7           0.9938           0.0770            2.04s
         8           0.9484          -0.0091            1.92s
         9           0.9173           0.0226            1.81s
        10           0.8977           0.0543            2.57s
        20           0.6769          -0.0245            1.64s
        30           0.5510           0.0199            1.24s
        40           0.4465           0.0024            1.01s
  