# Gradient Boosting Regression Workflow
This notebook demonstrates a complete regression workflow using GradientBoostingRegressor and the cleaned data file.

## 1. Import Required Libraries
Import pandas, numpy, matplotlib, seaborn, and scikit-learn for data analysis, visualization, and modeling.

In [6]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

## 2. Load and Inspect Cleaned Data
Load the cleaned CSV file and display basic information.

In [8]:
# Load the cleaned data
df = pd.read_csv(r'C:\MLData\8psx_data_120_cleaned.csv')
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3602 entries, 0 to 3601
Data columns (total 6 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   DBL4_0.65D_NLVT_D2X1S_60PP[NLVT_D2X1S_60PP]@50%@ETEST             3602 non-null   float64
 1   DBL4_0.65D_PLVT_D2X1S_60PP[PLVT_D2X1S_60PP]@50%@ETEST             3602 non-null   float64
 2   REXTB4_0.05D_NLVT_D2X1S_60PP[NLVT_D2X1S_60PP]@50%@ETEST           3602 non-null   float64
 3   RLMGN_GATE/NS/DFMAX/L_60PP_SL[GATE/NS/DFMAX/L_60PP_SL]@50%@ETEST  3602 non-null   float64
 4   RL_0.1V_M0_22B_SL[M0_22B_SL]@50%@ETEST                            3602 non-null   float64
 5   IDV_2204_XNOM3GNES12_FULLDIE_0950_MED_MEAN@SORT                   900 non-null    float64
dtypes: float64(6)
memory usage: 169.0 KB


Unnamed: 0,DBL4_0.65D_NLVT_D2X1S_60PP[NLVT_D2X1S_60PP]@50%@ETEST,DBL4_0.65D_PLVT_D2X1S_60PP[PLVT_D2X1S_60PP]@50%@ETEST,REXTB4_0.05D_NLVT_D2X1S_60PP[NLVT_D2X1S_60PP]@50%@ETEST,RLMGN_GATE/NS/DFMAX/L_60PP_SL[GATE/NS/DFMAX/L_60PP_SL]@50%@ETEST,RL_0.1V_M0_22B_SL[M0_22B_SL]@50%@ETEST,IDV_2204_XNOM3GNES12_FULLDIE_0950_MED_MEAN@SORT
count,3602.0,3602.0,3602.0,3602.0,3602.0,900.0
mean,62.616373,81.97936,43.350124,2.846914,362.531748,14990.903028
std,3.841165,6.936161,2.1455,0.209396,31.52358,243.140863
min,51.333005,62.14246,35.604915,2.202289,284.82565,14275.63086
25%,59.964984,77.211357,41.999036,2.705536,338.6392,14827.781665
50%,62.400548,81.589375,43.382275,2.837588,360.2165,14989.114615
75%,65.078806,86.391449,44.758429,2.976218,383.962713,15157.202
max,77.762155,118.7919,52.55793,3.783915,503.8898,15763.91338


## 3. Preprocess Data
Handle missing values, encode categorical variables, and scale features if needed.

In [9]:
# Drop rows with missing target and fill other missing values with column mean
target_col = 'IDV_2204_XNOM3GNES12_FULLDIE_0950_MED_MEAN@SORT'  # Update if needed
if df[target_col].isnull().any():
    df = df.dropna(subset=[target_col])
df = df.fillna(df.mean(numeric_only=True))
# Encode categorical variables if any
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.drop(target_col, errors='ignore')
df = pd.get_dummies(df, columns=categorical_cols)
# Feature scaling (commented out for now)
# feature_cols = df.columns.drop(target_col)
# scaler = StandardScaler()
# df[feature_cols] = scaler.fit_transform(df[feature_cols])
df.head()

Unnamed: 0,DBL4_0.65D_NLVT_D2X1S_60PP[NLVT_D2X1S_60PP]@50%@ETEST,DBL4_0.65D_PLVT_D2X1S_60PP[PLVT_D2X1S_60PP]@50%@ETEST,REXTB4_0.05D_NLVT_D2X1S_60PP[NLVT_D2X1S_60PP]@50%@ETEST,RLMGN_GATE/NS/DFMAX/L_60PP_SL[GATE/NS/DFMAX/L_60PP_SL]@50%@ETEST,RL_0.1V_M0_22B_SL[M0_22B_SL]@50%@ETEST,IDV_2204_XNOM3GNES12_FULLDIE_0950_MED_MEAN@SORT
0,62.411005,76.98818,44.74765,2.802551,392.51925,15014.38007
1,58.43473,88.828865,44.93934,3.116242,380.5992,15167.33525
3,58.878395,74.881965,45.07416,2.508621,408.1627,14556.75514
4,62.129235,89.73223,45.279775,2.925623,377.4858,14920.56144
5,57.41321,78.80551,47.28787,3.037888,420.0151,14443.94676


## 4. Split Data into Training and Test Sets
Divide the data for model training and evaluation.

In [10]:
# Prepare X and y, then split into train and test sets
X = df.drop(columns=[target_col])
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}')

Training samples: 720, Test samples: 180


## 5. Train Gradient Boosting Model
Train a baseline GradientBoostingRegressor on the training data.

In [11]:
# Train baseline Gradient Boosting model
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Baseline GradientBoostingRegressor: MAE={mae:.3f}, R2={r2:.3f}')

Baseline GradientBoostingRegressor: MAE=154.031, R2=0.389


## 6. Hyperparameter Tuning
Use GridSearchCV to tune hyperparameters for GradientBoostingRegressor.

In [None]:
# Enhanced hyperparameter grid for GradientBoostingRegressor
param_grid = {
    'n_estimators': [400, 600, 800, 1000],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'learning_rate': [0.005, 0.01, 0.02, 0.05, 0.1],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2', None]
}
gbr_grid = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
    )
gbr_grid.fit(X_train, y_train)
print('Best GradientBoostingRegressor parameters:', gbr_grid.best_params_)
print('Best cross-validated R2:', gbr_grid.best_score_)
# Evaluate on test set
gbr_grid_y_pred = gbr_grid.predict(X_test)
gbr_grid_mae = mean_absolute_error(y_test, gbr_grid_y_pred)
gbr_grid_r2 = r2_score(y_test, gbr_grid_y_pred)
print(f'GridSearchCV GradientBoostingRegressor: MAE={gbr_grid_mae:.3f}, R2={gbr_grid_r2:.3f}')
# Compare R2 scores before and after tuning for Gradient Boosting
r2_before_gbr = r2  # from baseline model cell
r2_after_gbr = gbr_grid_r2  # from GridSearchCV cell
plt.figure(figsize=(6, 4))
plt.bar(['Before Tuning', 'After Tuning'], [r2_before_gbr, r2_after_gbr], color=['C3', 'C2'])
plt.ylabel('R2 Score')
plt.title('Gradient Boosting R2 Score: Before vs After Hyperparameter Tuning')
plt.ylim(min(-1, r2_before_gbr, r2_after_gbr), 1)
for i, v in enumerate([r2_before_gbr, r2_after_gbr]):
    plt.text(i, v, f'{v:.2f}', ha='center', va='bottom' if v>=0 else 'top')
plt.axhline(0, color='gray', linewidth=1)
plt.show()

Fitting 5 folds for each of 53760 candidates, totalling 268800 fits


## 7. Feature Importance Visualization
Plot the feature importances from the trained GradientBoostingRegressor.

In [None]:
# Plot feature importances from the best estimator
importances = gbr_grid.best_estimator_.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title('Feature Importances (GradientBoostingRegressor)')
sns.barplot(x=importances[indices][:10], y=feature_names[indices][:10], palette='viridis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

## 6a. Compare Accuracy Before and After Tuning
Visualize and compare R2 scores before and after hyperparameter tuning.

In [None]:
# Compare R2 scores before and after tuning for Gradient Boosting
r2_before_gbr = r2  # from baseline model cell
r2_after_gbr = gbr_grid_r2  # from GridSearchCV cell
plt.figure(figsize=(6, 4))
plt.bar(['Before Tuning', 'After Tuning'], [r2_before_gbr, r2_after_gbr], color=['C3', 'C2'])
plt.ylabel('R2 Score')
plt.title('Gradient Boosting R2 Score: Before vs After Hyperparameter Tuning')
plt.ylim(min(-1, r2_before_gbr, r2_after_gbr), 1)
for i, v in enumerate([r2_before_gbr, r2_after_gbr]):
    plt.text(i, v, f'{v:.2f}', ha='center', va='bottom' if v>=0 else 'top')
plt.axhline(0, color='gray', linewidth=1)
plt.show()