# Load features and targets

In [None]:
# load features
import pickle
# A context manager closes the file handle as soon as loading is done
# (the bare open() call used before never closed it explicitly).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("Onehotfeatures.pkl", "rb") as features_file:
    # later cells index each entry of `features` as (matrix, name)
    features = pickle.load(features_file)

# load associated targets
from numpy import load
y = load('target.npy')

Choose features and prepare data for scikit-learn prototyping

In [None]:
# names of the features that should feed the model
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']
# alternative selection:
# imp_feature = ['User_ID', 'Product_ID', 'Gender', 'Prod_cat123']
# every entry of `features` is indexed as (matrix, name):
# collect the matrices whose name was selected above, preserving their order
X_features = tuple(
    entry[0]
    for entry in features
    if entry[1] in imp_feature
)

In [None]:
from scipy.sparse import hstack
# stack the selected feature matrices horizontally into a single design matrix
X = hstack(X_features)
# show the resulting dimensions and the container type
(X.shape, type(X))

In [None]:
# Load features and targets

# load features
import pickle
# A context manager closes the file handle as soon as loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("Onehotfeatures.pkl", "rb") as features_file:
    features = pickle.load(features_file)

# load associated targets
from numpy import load
y = load('target.npy')

# Choose features and prepare data for scikit-learn prototyping
# (this heading was bare prose inside the code cell, which is a SyntaxError)

# keep features of interest
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']
# imp_feature = ['User_ID', 'Product_ID', 'Gender', 'Prod_cat123']
# only keep corresponding features: each entry is indexed as (matrix, name)
X_features = tuple(f[0] for f in features if f[1] in imp_feature)

from scipy.sparse import hstack
# stack the selected feature matrices horizontally into one design matrix
X = hstack( X_features )
X.shape, type(X)

# Gradient Boosting Regression Tree (GBRT)

As opposed to Random Forest, GBRT uses the `'best'` splitter at every node of every tree: it searches for the optimal feature and threshold for each new node. However, we still control the sample size (the number of training instances used to build each new tree), which adds randomness to this sequential process (see `subsample` below).

The `learning_rate` controls how much of the remaining residual each new tree is allowed to cancel out.
`warm_start` is set so that when fit() is called in a loop, training does not start over. Instead, it builds on the trees already created.

In summary, here are the hyperparameters:
1. **`n_estimators`**, number of sequential trees, should be higher if learning rate low (generally tuned by cv)
2. **`learning_rate`**, shrinks the contribution of each tree to the prediction. Lower values are generally better, but they require more trees (see `n_estimators`). The default value of 0.1 is a good start
3. **`subsample`**, select randomly a fraction of training samples for each tree. It trades a higher bias for a lower variance. This is called Stochastic Gradient Boosting. Typical value is 0.8

[Sklearn Gradient Boosting Regressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) 

Create GBRT

In [None]:
# Gradient boosting on decision trees: the residuals of one tree become the target of the next
from sklearn.ensemble import GradientBoostingRegressor

# The optional `init` parameter (a model to start from, initial tree only) is left at its default.
gb_regtree = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.05,
    warm_start=True,      # repeated fit() calls build on the existing trees
    subsample=0.8,        # fraction of training samples drawn for each tree
    min_samples_leaf=5,
    max_depth=925,        # NOTE(review): unusually deep for boosting -- confirm this is intended
    random_state=29,
    verbose=1,
)
# to train on the entire data set in one go instead of the early-stopping loop:
# gb_regtree.fit(X,y)

Split data

In [None]:
# splitting helper and the metric used later to decide when to stop training
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# single train/validation split; test_size is left at its default (0.25)
split_parts = train_test_split(X, y, random_state=4)
X_train, X_val, y_train, y_val = split_parts

Run in a loop to find the optimal number of estimators

In [None]:
# Early stopping: grow the ensemble one tree at a time and stop once the
# validation error has failed to improve several times in a row.

# lowest validation MSE seen so far
min_val_error = float('Inf')
# number of trees that produced min_val_error
best_n_estimators = 0
# counter of consecutive iterations without improvement
error_going_up = 0
# upper bound on the ensemble size and early-stopping patience
max_estimators = 500
patience = 5
# must use the same train/val data, split just once
# range() excludes its end, so add 1 to really allow up to max_estimators trees
for n_repetition in range(1, max_estimators + 1):
    # set number of estimators; with warm_start=True fit() keeps the existing
    # trees and only builds the missing one(s)
    gb_regtree.n_estimators = n_repetition

    # train model
    gb_regtree.fit(X_train, y_train)

    # predict on validation set
    y_val_pred = gb_regtree.predict(X_val)

    # assess error (MSE)
    val_score = mean_squared_error(y_val, y_val_pred)

    # decide to continue or stop loop
    if val_score < min_val_error:
        min_val_error = val_score
        best_n_estimators = n_repetition
        error_going_up = 0
    else:
        error_going_up += 1
        # stop when the error has not improved `patience` times consecutively
        if error_going_up == patience:
            # at this point the model holds `patience` more trees than best_n_estimators
            break

Performance result on the validation set (a single hold-out split here, i.e. 1 fold rather than a full cross-validation)

In [None]:
# `np` is not imported anywhere else in this notebook (only `from numpy import load`),
# so import it here to keep the cell runnable on a fresh kernel
import numpy as np

# result after optimization: best validation MSE found by the early-stopping loop
print('MSE (mean cross-validation) = {:.4f}'.format(min_val_error) )
# RMSE is reported in the same units as the target
print('RMSE (mean cross-validation) = {:.4f}'.format(np.sqrt(min_val_error)) )

# Recent results:
# RMSE (mean cross-validation) = 2596.6717, subsample = 0.25
# RMSE (mean cross-validation) = 2528.0135, learning_rate=0.05 and subsample = 0.8

Save model

In [None]:
# SAVE MODEL
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# persist the fitted regressor to disk
joblib.dump(gb_regtree, 'GBRT_Model.pkl')

# example to load model
# gb_regtree = joblib.load('GBRT_Model.pkl')

Plot the error as a function of number of estimators. Error should flatten out toward the end.

In [None]:
# recover all cross validation errors
errors_val = [mean_squared_error(y_s, y_val_pred) for y_s in gb_regtree.staged_predict(X_val)]

# keep matplotlib interactive
%matplotlib notebook
import matplotlib.pyplot as plt
# use ggplot style
plt.style.use('ggplot')

# error vs number of trees (iteration)
_, axgbrt = plt.subplots()
axgbrt.plot(range(gb_regtree.n_estimators),errors_val)
# add title and axes labels
axgbrt.set_title('Error as a function of estimators')
axgbrt.set_xlabel('Number of estimators (tree)')
axgbrt.set_ylabel('MSE')
plt.tight_layout()