# Boosting with Python

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [2]:
# Load the diabetes data set (pd.read_table splits on tabs by default).
df = pd.read_table('diabetesdata.txt')

# Response is the 'Diabetes' column; every remaining column is a predictor.
# NOTE(review): the rendered frame lists an 'Unnamed: 0' column - if that is a
# saved row index in the file rather than a display artefact, it is being used
# as a feature here; confirm.
y = df['Diabetes']
X = df.drop(columns='Diabetes')
X_col = X.columns

# Show the frame as the cell output.
df

Unnamed: 0,Preg,Glucose,BP,SkinThick,Insul,BMI,DPF,Age,Diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Gradient Boosting

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
# Boosted decision stumps (max_depth=1): 1000 trees with learning rate 1.
model = GradientBoostingClassifier(
    learning_rate=1,
    max_depth=1,
    n_estimators=1000,
)

In [6]:
# Train on the training split (fit returns the fitted model itself), then
# score the predictions on the held-out split.
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.6818181818181818


# There is a trade-off between the number of trees and the learning rate.
# You should never check test performance when tuning parameters! It is done here for illustrative purposes only.

In [7]:
# 1000 boosted stumps with learning rate 0.1, scored on the test split.
model = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=1,
    n_estimators=1000,
)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7467532467532467


In [8]:
# 1000 boosted stumps with learning rate 0.01, scored on the test split.
model = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=1,
    n_estimators=1000,
)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7337662337662337


In [9]:
# Learning rate 0.01 again, but with 5000 stumps instead of 1000.
model = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=1,
    n_estimators=5000,
)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7402597402597403


# Parameter Tuning

In [10]:
# Estimator to tune: gradient-boosted stumps.
model = GradientBoostingClassifier(max_depth=1)

# Candidate values: number of trees to fit, and the learning rate.
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search (every combination is tried), 5-fold cross-validation
# scored on accuracy, using all available processors.
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Report the best combination found.
print(boost_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'learning_rate': 0.01, 'n_estimators': 5000}


In [11]:
# Refit with the combination printed by the grid search
# (learning_rate=0.01, n_estimators=5000) and score it on the test split.
model = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=1,
    n_estimators=5000,
)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7402597402597403


# Adaboost Classifier

In [12]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost classifier with library-default settings; its n_estimators and
# learning_rate are tuned by the grid search in the next cell.
model = AdaBoostClassifier()

In [13]:
# Candidate values: number of boosting rounds, and the learning rate.
n_estimators = [50, 100, 1000, 5000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search (every combination is tried), 5-fold cross-validation
# scored on accuracy, using all available processors.
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Report the best combination found.
print(boost_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'learning_rate': 0.1, 'n_estimators': 100}


In [None]:
# Refit AdaBoost with the combination printed by the grid search
# (learning_rate=0.1, n_estimators=100) and score it on the test split.
model = AdaBoostClassifier(
    learning_rate=0.1,
    n_estimators=100,
)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Extreme Gradient Boosting (XGBoost): a popular library with a faster implementation of gradient boosting that generally provides better performance

In [None]:
# Uncomment the next line to install XGBoost into the kernel's environment:
# %pip install xgboost

In [14]:
from xgboost import XGBClassifier
# Estimator to tune: XGBoost classifier with library-default settings.
model = XGBClassifier()

# Candidate values: number of boosting rounds, and the learning rate.
n_estimators = [50, 100, 1000, 5000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Exhaustive grid search (every combination is tried), 5-fold cross-validation
# scored on accuracy, using all available processors.
boost_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Report the best combination found.
print(boost_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits




{'learning_rate': 0.01, 'n_estimators': 100}


In [None]:
# Refit XGBoost with the combination printed by the grid search
# (learning_rate=0.01, n_estimators=100) and score it on the test split.
model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=100,
)
y_pred = model.fit(X_train, y_train).predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Regression with Gradient boosting

In [None]:
# Load the Hitters data and keep only the rows without missing values.
df = pd.read_csv('Hitters_Data.csv').dropna()

# Response: natural log of Salary.
y = np.log(df['Salary'])

# One-hot encode the three categorical columns.
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

# Numeric predictors: everything except the response (Salary) and the
# categorical columns that were dummy-encoded above, cast to float64.
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')

# Feature set X: numeric predictors plus one indicator column per categorical.
X = pd.concat(
    [X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]],
    axis=1,
)

In [None]:
# Split data into training and test sets
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_trainStandard = scaler.transform(X_train)
X_testtransformed = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Estimator to tune: gradient-boosted regression stumps.
model = GradientBoostingRegressor(max_depth=1)

# Scaling happens inside the pipeline, so it is re-fitted on each CV training
# fold rather than on the whole training set.
pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('Regressor', model),
])

# Candidate values: number of trees, and the learning rate. The
# 'Regressor__' prefix routes each setting to the pipeline's 'Regressor' step.
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {
    'Regressor__n_estimators': n_estimators,
    'Regressor__learning_rate': learning_rate,
}

# Exhaustive grid search (every combination is tried), 5-fold cross-validation
# scored on negative mean squared error, using all available processors.
boost_grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Report the best combination found.
print(boost_grid.best_params_)

In [None]:
# Fit boosted regression stumps on the standardised training data and report
# the mean squared error on the standardised test data.
# NOTE(review): n_estimators / learning_rate are hard-coded and no grid-search
# output is recorded in the notebook - confirm they match best_params_.
model = GradientBoostingRegressor(
    learning_rate=0.01,
    max_depth=1,
    n_estimators=1000,
)
y_pred = model.fit(X_trainStandard, y_train).predict(X_testtransformed)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

# Adaboost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost regressor with default settings; its n_estimators and learning_rate
# are tuned by the grid search in the next cell.
# random_state is pinned because AdaBoostRegressor draws a weighted bootstrap
# sample of the training data at every boosting round; left unseeded, the
# cross-validation scores (and possibly the selected parameters) change from
# run to run. The seed matches the one used for train_test_split.
model = AdaBoostRegressor(random_state=1)

In [None]:
# Scaling happens inside the pipeline, so it is re-fitted on each CV training
# fold rather than on the whole training set.
pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('Regressor', model),
])

# Candidate values: number of boosting rounds, and the learning rate. The
# 'Regressor__' prefix routes each setting to the pipeline's 'Regressor' step.
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {
    'Regressor__n_estimators': n_estimators,
    'Regressor__learning_rate': learning_rate,
}

# Exhaustive grid search (every combination is tried), 5-fold cross-validation
# scored on negative mean squared error, using all available processors.
boost_grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Report the best combination found.
print(boost_grid.best_params_)

In [None]:
# Fit AdaBoost on the standardised training data and report the mean squared
# error on the standardised test data.
# random_state is pinned because AdaBoostRegressor draws a weighted bootstrap
# sample at every boosting round, so an unseeded fit prints a different MSE on
# each run. The seed matches the one used for train_test_split.
# NOTE(review): n_estimators / learning_rate are hard-coded and no grid-search
# output is recorded in the notebook - confirm they match best_params_.
model = AdaBoostRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    random_state=1,
)
model.fit(X_trainStandard, y_train)
y_pred = model.predict(X_testtransformed)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

# XGBoostRegressor

In [None]:
from xgboost import XGBRegressor
# Estimator to tune: XGBoost regressor with library-default settings.
model = XGBRegressor()

# Scaling happens inside the pipeline, so it is re-fitted on each CV training
# fold rather than on the whole training set.
pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('Regressor', model),
])

# Candidate values: number of boosting rounds, and the learning rate. The
# 'Regressor__' prefix routes each setting to the pipeline's 'Regressor' step.
n_estimators = [100, 1000, 5000, 10000]
learning_rate = [1, 0.1, 0.01, 0.001]
params = {
    'Regressor__n_estimators': n_estimators,
    'Regressor__learning_rate': learning_rate,
}

# Exhaustive grid search (every combination is tried), 5-fold cross-validation
# scored on negative mean squared error, using all available processors.
boost_grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
boost_grid.fit(X_train, y_train)

# Report the best combination found.
print(boost_grid.best_params_)

In [None]:
# Fit XGBoost on the standardised training data and report the mean squared
# error on the standardised test data.
# NOTE(review): n_estimators / learning_rate are hard-coded and no grid-search
# output is recorded in the notebook - confirm they match best_params_.
model = XGBRegressor(
    learning_rate=0.01,
    n_estimators=5000,
)
y_pred = model.fit(X_trainStandard, y_train).predict(X_testtransformed)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))