In [1]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import numpy as np

X, y =load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [2]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

def grad(y, h):
    return y - h

def fit(X, y, models):
    
    models_trained = []
    
    #starting model, don't use smart model in the first, it will overfit
    first_model = DummyRegressor(strategy='mean')
    first_model.fit(X, y)
    models_trained.append(first_model)
    
    # fit the estimators
    for i, model in enumerate(models):
        y_pred = predict(X, models_trained)
    
        residual = grad(y, y_pred)
    
        model.fit(X, residual)

        models_trained.append(model)
    return models_trained

def predict(X, models):
    learning_rate = 0.1
    f0 = models[0].predict(X)
    boosting = sum(learning_rate * model.predict(X) for model in models[1:])
    return f0 + boosting

In [3]:
n_estimators = 200
tree_params = {'max_depth':1}
models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]

models = fit(X_train, y_train, models)

y_pred = predict(X_test, models)

print("Our MSE: ", mean_squared_error(y_test, y_pred))


Our MSE:  12.945557601580584


In [4]:
sklearn_model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=0.1, max_depth=1, loss='ls')

y_pred_sk = sklearn_model.fit(X_train, y_train).predict(X_test)

print("sklearn MSE: ", mean_squared_error(y_test, y_pred_sk))

sklearn MSE:  12.945557601580582


In [5]:
import xgboost

xgb_reg = xgboost.XGBRegressor()

xgb_reg.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

[0]	validation_0-rmse:16.15458
[1]	validation_0-rmse:11.84377
[2]	validation_0-rmse:8.79602
[3]	validation_0-rmse:6.72584
[4]	validation_0-rmse:5.46526
[5]	validation_0-rmse:4.65454
[6]	validation_0-rmse:4.08462
[7]	validation_0-rmse:3.76129
[8]	validation_0-rmse:3.54313
[9]	validation_0-rmse:3.37742
[10]	validation_0-rmse:3.24836
[11]	validation_0-rmse:3.18872
[12]	validation_0-rmse:3.10860
[13]	validation_0-rmse:3.09993
[14]	validation_0-rmse:3.08393
[15]	validation_0-rmse:3.08760
[16]	validation_0-rmse:3.06310
[17]	validation_0-rmse:3.05292
[18]	validation_0-rmse:3.05715


### Task 1: changing max_depth and min_samples_split

In [6]:
X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

n_estimators = 200
max_depth =[1, 2, 3, 4, 5]
min_samples_split = [2, 3, 4]
for depth in max_depth:
    for mss in min_samples_split:
        tree_params = {'max_depth':depth, 'min_samples_split': mss}
        models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]

        models = fit(X_train, y_train, models)

        y_pred = predict(X_test, models)
        
        print("="*30)
        print("max depth: ", depth, 'and min samples split:', mss)
        print("Our MSE: ", mean_squared_error(y_test, y_pred))

max depth:  1 and min samples split: 2
Our MSE:  12.945557601580584
max depth:  1 and min samples split: 3
Our MSE:  12.945557601580582
max depth:  1 and min samples split: 4
Our MSE:  12.945557601580582
max depth:  2 and min samples split: 2
Our MSE:  10.78955061410898
max depth:  2 and min samples split: 3
Our MSE:  10.764879800502445
max depth:  2 and min samples split: 4
Our MSE:  11.002511681053244
max depth:  3 and min samples split: 2
Our MSE:  7.604283280819447
max depth:  3 and min samples split: 3
Our MSE:  7.720158483589702
max depth:  3 and min samples split: 4
Our MSE:  7.535172299548788
max depth:  4 and min samples split: 2
Our MSE:  8.301372033899987
max depth:  4 and min samples split: 3
Our MSE:  8.515133448989273
max depth:  4 and min samples split: 4
Our MSE:  9.555556825028603
max depth:  5 and min samples split: 2
Our MSE:  7.531099503536801
max depth:  5 and min samples split: 3
Our MSE:  8.554678596905008
max depth:  5 and min samples split: 4
Our MSE:  8.382247

### Task 2: add code for binary classification

In [7]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

def grad(y, h):
    return y - h

def fit(X, y, models):
    
    models_trained = []
    
    #starting model, don't use smart model in the first, it will overfit
    first_model = DummyRegressor(strategy='mean')
    first_model.fit(X, y)
    models_trained.append(first_model)
    
    # fit the estimators
    for i, model in enumerate(models):
        y_pred = predict(X, models_trained)
    
        residual = grad(y, y_pred)
    
        model.fit(X, residual)

        models_trained.append(model)
    return models_trained

def predict(X, models):
    learning_rate = 0.1
    f0 = models[0].predict(X)
    boosting = sum(learning_rate * model.predict(X) for model in models[1:])
    yhat = f0 + boosting
    # sigmoid
    y_pred = 1 / (1 + np.exp(-yhat))
    y_pred = np.round(y_pred)
    return y_pred

In [8]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

n_estimators = 200
tree_params = {'max_depth':1}
models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]

models = fit(X_train, y_train, models)

y_pred = predict(X_test, models)

print("Our MSE: ", accuracy_score(y_test, y_pred))


Our MSE:  0.8947368421052632


In [9]:
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=1
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

Sklearn accuracy:  0.9649122807017544


### Task 3 : adding code for multinomial classification

In [10]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

def grad(y, h):
    return y - h

def fit(X, y, models):
    
    models_trained = []
    
    #starting model, don't use smart model in the first, it will overfit
    first_model = DummyRegressor(strategy='mean')
    first_model.fit(X, y)
    models_trained.append(first_model)
    
    # fit the estimators
    for i, model in enumerate(models):
        y_pred = predict(X, models_trained)
    
        residual = grad(y, y_pred)
    
        model.fit(X, residual)

        models_trained.append(model)
    return models_trained

def predict(X, models):
    learning_rate = 0.1
    f0 = models[0].predict(X)
    boosting = sum(learning_rate * model.predict(X) for model in models[1:])
    yhat = f0 + boosting
    #softmax
    y_pred = np.exp(yhat)/np.sum(np.exp(yhat), axis=1,keepdims=True)
    #y_pred = np.argmax(y_pred, axis=1)
    return y_pred

In [11]:
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_digits(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y_train_encoded = np.zeros((y_train.shape[0], len(set(y))))
for each_class in range(len(set(y))):
    cond = y_train==each_class
    y_train_encoded[np.where(cond), each_class] = 1

n_estimators = 200
tree_params = {'max_depth':1}
models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]

models = fit(X_train, y_train_encoded, models)

y_pred = predict(X_test, models)
y_pred = np.argmax(y_pred, axis=1)

print("Our MSE: ", accuracy_score(y_test, y_pred))

Our MSE:  0.8055555555555556


In [12]:
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=1
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

Sklearn accuracy:  0.9481481481481482


## Put everything into the class

In [13]:
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor

class GradientBoosting:
    
    def __init__(self, n_estimators=200, max_depth=1, learning_rate=0.1, min_samples_split=2, regression=True):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.regression = regression
        
        tree_params = {'max_depth' : self.max_depth, 'min_samples_split' : self.min_samples_split}
        self.models = [DecisionTreeRegressor(**tree_params) for _ in range(self.n_estimators)]
    
    def grad(self, y, h):
        return y - h
    
    def fit(self, X, y):
        
        self.models_trained = []
        first_model = DummyRegressor(strategy='mean')
        first_model.fit(X, y)
        self.models_trained.append(first_model)
        
        for i, model in enumerate(self.models):
            y_pred = self.predict(X, with_argmax=False)
            residual = self.grad(y, y_pred)
            model.fit(X, residual)
            self.models_trained.append(model)
    
    def predict(self, X, with_argmax=True):
        models = self.models_trained
        f0 = models[0].predict(X)
        boosting = sum(self.learning_rate * model.predict(X) for model in models[1:])
        yhat = f0 + boosting
        if not self.regression:
            yhat = np.exp(yhat) / np.sum(np.exp(yhat), axis=1, keepdims=True)
            if with_argmax:
                yhat = np.argmax(yhat, axis=1)
        return yhat

In [14]:
#regression
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                        test_size=0.3, random_state=42)

model = GradientBoosting(n_estimators=200, learning_rate=0.1, max_depth = 3, 
                 min_samples_split = 2,
                 regression=True)
model.fit(X_train, y_train)
yhat = model.predict(X_test)

#print metrics
print("MSE: ", mean_squared_error(y_test, yhat))

n_estimators = 200

#=====SKlearn========
#Compare to sklearn: ls is the same as our mse
sklearn_model = GradientBoostingRegressor(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=3,
    loss='ls'
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn MSE: ", mean_squared_error(y_test, yhat_sk))

MSE:  7.803138676200792
Sklearn MSE:  7.7902844330749765


In [15]:
# Binary classification

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=42)
y_train_encoded = np.zeros((y_train.shape[0], len(set(y))))
for each_class in range(len(set(y))):
    cond = y_train==each_class
    y_train_encoded[np.where(cond), each_class] = 1

model = GradientBoosting(n_estimators=200, learning_rate=0.1, max_depth = 3, 
                 min_samples_split = 2,
                 regression=False)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)

# #print metrics
print("Our accuracy: ", accuracy_score(y_test, yhat))

#=====SKlearn========
#Compare to sklearn: ls is the same as our accuracy
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=1
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

Our accuracy:  0.9649122807017544
Sklearn accuracy:  0.9649122807017544


In [16]:
# Multiclass classification

from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_digits(return_X_y=True)

X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=42)
y_train_encoded = np.zeros((y_train.shape[0], len(set(y))))
for each_class in range(len(set(y))):
    cond = y_train==each_class
    y_train_encoded[np.where(cond), each_class] = 1

model = GradientBoosting(n_estimators=200, learning_rate=0.1, max_depth = 3, 
                 min_samples_split = 2,
                 regression=False)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)

# #print metrics
print("Our accuracy: ", accuracy_score(y_test, yhat))

#=====SKlearn========
#Compare to sklearn: ls is the same as our accuracy
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=1
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

Our accuracy:  0.9314814814814815
Sklearn accuracy:  0.9481481481481482
