In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Gradient Boosted Regression       ###
### Date: 06/01/2018                           ###
##################################################

# http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py

######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo

### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation
import pandas as pd

### numpy
# NumPy stands for Numerical Python, a library that contains basic linear algebra functions, Fourier transforms and advanced
# random number capabilities
import numpy as np 

### Scipy
# SciPy performs a host of scientific and statistical calculations and is built on top of NumPy. NumPy is still imported
# separately above; SciPy's top-level namespace should not be relied on to provide NumPy functions
# https://oneau.wordpress.com/2011/02/28/simple-statistics-with-scipy/
import scipy as sp

### sklearn
# Sklearn contains basic statistical models
from sklearn import datasets 

# As well as a module to calculate model performance statistics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn import ensemble
from sklearn import metrics, model_selection, tree

### Statsmodels
# Statsmodels provides classical statistical models, statistical tests and example data sets
import statsmodels.api as sm

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.pyplot as plt

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

### String
# Allows for more flexible solutions for dealing with string characters
import string as st

In [None]:
######################################################################
########                     Import Data                      ########
######################################################################

### Single seed shared by the shuffle and the model so a fresh kernel run reproduces the same numbers
SEED = 13

### Boston Data Frame
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this cell
# requires an older scikit-learn release (or a replacement data set)
boston = datasets.load_boston()

### Assign independent and dependent values (rows are shuffled together so x and y stay aligned)
x, y = shuffle(boston.data, boston.target, random_state = SEED)

### Convert data type of independent variables
x = x.astype(np.float32)

### Hold out the last 10% of the shuffled rows as the test set
offset = int(x.shape[0] * 0.9)
x_train, y_train = x[:offset], y[:offset]
x_test, y_test = x[offset:], y[offset:]

######################################################################
########              Gradient Boosting Regression            ########
######################################################################

######## Generate Model ######## 
### Generate GBM Regression Model 
# NOTE(review): 'ls' (least squares) was renamed to 'squared_error' in scikit-learn 1.0 and the old
# spelling was removed in 1.2 -- update this value when moving to a newer release
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}

### Assign model parameters to GBR model
# The estimator is built from `params` itself (instead of repeating the literals) so the dictionary
# that later cells read can never disagree with the fitted model
gbm_reg = ensemble.GradientBoostingRegressor(random_state = SEED, **params)

### Fit GBM model to training data 
gbm_reg.fit(x_train, y_train)

### Score the fitted model on the hold-out rows
mse = mean_squared_error(y_test, gbm_reg.predict(x_test))

print("MSE: %.4f" % mse)

In [None]:
######## Plot training deviance ######## 
### Number of boosting stages actually held by the fitted model
# Taken from the model rather than params['n_estimators'] so this cell still works when it is re-run
# after the model has been grown (e.g. by a warm-start refit)
n_stages = len(gbm_reg.train_score_)
boosting_iterations = np.arange(n_stages) + 1

### compute test set deviance
# For the least-squares loss the deviance is the mean squared error, so mean_squared_error is used
# directly; the estimator's `loss_` attribute was deprecated in scikit-learn 1.1 and removed in 1.3
test_score = np.zeros((n_stages,), dtype = np.float64)

for i, y_pred in enumerate(gbm_reg.staged_predict(x_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

### Draw on an explicit, single Axes
# The figure is shown at the end of this cell, so a 1x2 subplot grid here would leave half of it empty
fig, ax = plt.subplots(figsize = (8, 6))
ax.set_title('Deviance')
ax.plot(boosting_iterations, gbm_reg.train_score_, 'b-',
        label = 'Training Set Deviance')
ax.plot(boosting_iterations, test_score, 'r-',
        label = 'Test Set Deviance')
ax.legend(loc = 'upper right')
ax.set_xlabel('Boosting Iterations')
ax.set_ylabel('Deviance')
plt.show()

In [None]:
######## Feature Importance ########

### Importances are re-read from the fitted model on every run, so re-running this cell is safe
feature_importance = gbm_reg.feature_importances_

### Make importances relative to max importance (most important feature = 100)
feature_importance = 100.0 * (feature_importance / feature_importance.max())

### Order features from least to most important so the largest bar ends up at the top of the chart
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

### Draw on an explicit, single Axes
# The previous figure has already been shown, so selecting slot 2 of a 1x2 grid here would squeeze
# the chart into the right half of a new, otherwise empty figure
fig, ax = plt.subplots(figsize = (8, 6))
ax.barh(pos, feature_importance[sorted_idx], align = 'center')
ax.set_yticks(pos)
ax.set_yticklabels(boston.feature_names[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
######## Model Tuning ########
### With warm_start switched on, refitting adds estimators to the already fitted model instead of
### starting over; n_estimators = 600 is the new total size of the ensemble, not the number added
### (the result is bound to `_` only to keep the estimator repr out of the cell output)
_ = gbm_reg.set_params(warm_start = True, n_estimators = 600).fit(x_train, y_train)

### Re-score on the hold-out set to test the hypothesis that a larger ensemble lowers the test MSE
mean_squared_error(y_test, gbm_reg.predict(x_test))