<a href="https://colab.research.google.com/github/abinavharsath41-ctrl/FOML-exp/blob/main/Another_copy_of_Gradient_Booster_with_decision_Tree_Regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    classification_report
)

# --- Configuration ---
# Hyper-parameters and seed used by main() below.

# Seed passed to both train_test_split and the classifier for reproducibility.
RANDOM_STATE = 42
# Number of boosting stages, i.e. weak learners (M).
N_ESTIMATORS = 100
# Shrinkage factor (nu) applied to each weak learner's contribution.
LEARNING_RATE = 0.1
# Depth of the individual Decision Tree Regressor (the weak learner).
MAX_DEPTH = 3
# ---------------------

def _load_and_split():
    """Load the breast-cancer dataset and return a scaled, stratified split.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled, y_train, y_test,
        target_names)`` where the feature matrices have been standardised
        with statistics learned from the training portion only.
    """
    # Step 2: Load data
    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = data.target

    print("Dataset loaded: Breast Cancer Wisconsin (Diagnostic)")

    # Step 3: Stratified 70/30 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=RANDOM_STATE,
        stratify=y,  # Ensures class proportions are maintained
    )

    # Step 4: Scale the features with StandardScaler.
    # The scaler is fitted on the training split only, so no information
    # from the test split leaks into the preprocessing.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled using StandardScaler.")

    return X_train_scaled, X_test_scaled, y_train, y_test, data.target_names


def _train_model(X_train, y_train):
    """Build and fit the gradient boosting classifier on the training data.

    Args:
        X_train: Scaled training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    # Step 5 & 6: Define the Gradient Boosting Classifier.
    # The estimator itself handles the initial prediction, the residual
    # computation, fitting each weak learner and updating the ensemble.
    gb_clf = GradientBoostingClassifier(
        n_estimators=N_ESTIMATORS,
        learning_rate=LEARNING_RATE,
        max_depth=MAX_DEPTH,
        random_state=RANDOM_STATE,
        # Setting subsample < 1.0 (e.g., 0.8) adds Stochastic Gradient Boosting for robustness
        subsample=1.0,
    )

    print(f"\nGradient Boosting Model Defined: N_Estimators={N_ESTIMATORS}, Learning_Rate={LEARNING_RATE}")

    # Step 7: Train Gradient Boosting
    print("Training Gradient Boosting model...")
    gb_clf.fit(X_train, y_train)
    print("Training complete.")
    return gb_clf


def _report_metrics(y_test, y_pred, y_pred_proba, target_names):
    """Compute and print the evaluation metrics for the test split.

    Precision, recall and F1 are reported for label 1, the default positive
    class of the scikit-learn scorers used here.

    Args:
        y_test: True test labels.
        y_pred: Predicted class labels for the test split.
        y_pred_proba: Predicted probability of class 1 for the test split.
        target_names: Class names used in the classification report.

    Returns:
        The ROC AUC score, so the caller can reuse it in the ROC plot.
    """
    # Step 9: Evaluate

    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking below
    # cannot fail even if one class were missing from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # Basic metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC AUC is computed from probabilities, not from the hard predictions
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print("\n==============================================")
    print("  EVALUATION RESULTS (Gradient Boosting)     ")
    print("==============================================")

    print("\nMetrics:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  ROC AUC Score: {roc_auc:.4f}")

    print("\nConfusion Matrix:")
    print(cm)
    print(f"  TN={TN}, FP={FP}, FN={FN}, TP={TP}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=target_names))

    return roc_auc


def _plot_roc(y_test, y_pred_proba, roc_auc):
    """Draw the ROC curve next to the diagonal of a random classifier.

    Args:
        y_test: True test labels.
        y_pred_proba: Predicted probability of class 1 for the test split.
        roc_auc: ROC AUC score shown in the legend.
    """
    # Step 10: ROC curve plot (the thresholds returned by roc_curve are not needed)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkgreen', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR) / Recall')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()


def main():
    """Train and evaluate a gradient boosting classifier on the breast-cancer data.

    Runs the whole pipeline: load and split the data, scale the features,
    fit the model, print the evaluation metrics and plot the ROC curve.
    """
    X_train_scaled, X_test_scaled, y_train, y_test, target_names = _load_and_split()

    gb_clf = _train_model(X_train_scaled, y_train)

    # Step 8: Convert final predictions to binary class (0/1).
    # .predict() returns the class labels directly.
    y_pred = gb_clf.predict(X_test_scaled)

    # Probability of the positive class (1), needed for ROC AUC and the ROC curve
    y_pred_proba = gb_clf.predict_proba(X_test_scaled)[:, 1]

    roc_auc = _report_metrics(y_test, y_pred, y_pred_proba, target_names)
    _plot_roc(y_test, y_pred_proba, roc_auc)

if __name__ == "__main__":
    import os
    import sys

    # Force the non-interactive Agg backend only when no figure could be
    # displayed anyway: a Linux process without an X11/Wayland display that is
    # not running inside a Jupyter/Colab kernel. Switching unconditionally
    # would also replace a working notebook or desktop backend, after which
    # plt.show() in main() displays nothing.
    no_display = sys.platform.startswith("linux") and not (
        os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY")
    )
    in_notebook_kernel = "ipykernel" in sys.modules

    if no_display and not in_notebook_kernel:
        try:
            plt.switch_backend('Agg')
        except ImportError as exc:
            # Best effort: keep the current backend, but say so instead of
            # failing silently.
            print(f"Warning: could not switch to the Agg backend: {exc}", file=sys.stderr)

    main()

Dataset loaded: Breast Cancer Wisconsin (Diagnostic)
Features scaled using StandardScaler.

Gradient Boosting Model Defined: N_Estimators=100, Learning_Rate=0.1
Training Gradient Boosting model...
Training complete.

  EVALUATION RESULTS (Gradient Boosting)     

Metrics:
  Accuracy: 0.9474
  Precision: 0.9455
  Recall: 0.9720
  F1-Score: 0.9585
  ROC AUC Score: 0.9899

Confusion Matrix:
[[ 58   6]
 [  3 104]]
  TN=58, FP=6, FN=3, TP=104

Classification Report:
              precision    recall  f1-score   support

   malignant       0.95      0.91      0.93        64
      benign       0.95      0.97      0.96       107

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

