# Load features and targets

In [1]:
# load features
import pickle

# The context manager closes the file handle even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file
# that you produced yourself or otherwise trust.
with open("Onehotfeatures.pkl", "rb") as features_file:
    # Later cells read each entry as entry[0] (feature data) and entry[1] (feature name).
    features = pickle.load(features_file)

# load associated targets
from numpy import load
y = load('target.npy')

Choose features and prepare data for scikit-learn prototyping

In [2]:
# Names of the feature groups that are kept for modelling
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']
# Alternative selection:
# imp_feature = ['User_ID', 'Product_ID', 'Gender', 'Prod_cat123']

# For every entry whose second item is one of the selected names,
# keep its first item; the result is an immutable tuple in the original order.
X_features = tuple(
    entry[0]
    for entry in features
    if entry[1] in imp_feature
)

In [3]:
from scipy.sparse import hstack

# Stack the selected feature blocks side by side (column-wise) into one sparse matrix
X = hstack(X_features)

# Sanity check: display the dimensions and the container type
(X.shape, type(X))

((550068, 9993), scipy.sparse.coo.coo_matrix)

# Random forest

### Random forest - Grid search (Takes too long on my machine, think days)

In [None]:
# PARAMETER VALUES EXPLORED BY THE GRID SEARCH
import numpy as np

# Number of trees in the forest
est_arr = np.array([200, 500, 3000, 5000])

# Maximum tree depth: powers of ten from 10**2 to 10**4
# (finer alternative: 100 to 1000 in steps of 100)
depth_arr = np.array([10 ** exponent for exponent in range(2, 5)])

# Minimum number of samples per leaf: 1, 11, 21, ..., 101
# (finer alternative: every integer from 1 to 20)
min_leaf_arr = np.arange(1, 102, 10)

# One array per line, in the order: estimators, depths, leaf sizes
print(est_arr, depth_arr, min_leaf_arr, sep='\n')

In [None]:
# Random forests are bags of decision trees; running this grid will take DAYS on a quad-core i7!
from sklearn.ensemble import RandomForestRegressor

# The search is far too slow to run by default. Flip this flag to launch it.
# Fitting and reporting are gated together: `best_score_` only exists after
# `fit`, so reading it on an unfitted search raises AttributeError.
RUN_GRID_SEARCH = False

# n_estimators, min_samples_leaf and max_depth are placeholders here: all three
# are listed in the parameter grid below and get overridden for each candidate.
rfr = RandomForestRegressor(n_estimators=10,
                            min_samples_leaf=5, max_depth=900,
                            n_jobs=-1, random_state=29, bootstrap=True, verbose=1)

# Grid search on max_depth, min_samples_leaf, n_estimators
from sklearn.model_selection import GridSearchCV, ShuffleSplit
# n_splits is the number of times you split data after shuffling
cv = ShuffleSplit(n_splits=1, test_size=1/5, random_state=4)
# cv could be a fixed number of partitions but there would be no shuffling in that case
# it will just rotate on partitions (k-1) parts and 1 part for cross-val
param_grid_rfr = [{'max_depth': depth_arr,
                   'min_samples_leaf': min_leaf_arr,
                   'n_estimators': est_arr}]
rfr_grid = GridSearchCV(rfr, param_grid_rfr, cv=cv,
                        scoring='neg_mean_squared_error', n_jobs=-1)

if RUN_GRID_SEARCH:
    # run grid search
    rfr_grid.fit(X, y)

    # best of mean test score; the scorer is the NEGATED MSE, hence the sign flip
    print( 'MSE, best param, mean cross-val = {:.4f}'.format(-rfr_grid.best_score_) )
    print( 'RMSE, best param, mean cross-val = {:.4f}'.format(np.sqrt(-rfr_grid.best_score_)) )
else:
    print('Grid search skipped (set RUN_GRID_SEARCH = True to run it).')

### Random Forest, single model

In [None]:
# Imported here so this section runs on its own, even when the (very slow)
# grid-search section above is skipped.
import numpy as np
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
# With bootstrap=True each tree is fit on a bootstrap sample (drawn with replacement)
# that by default has as many rows as the original training set. BaggingRegressor
# exposes `max_samples` to change that (newer scikit-learn releases also add
# `max_samples` to the forest itself -- check your installed version).
# NOTE(review): the original note said each tree splits its nodes randomly. As far as
# the scikit-learn docs describe it, a random forest picks the BEST split among the
# features considered at each node (`max_features`); the randomness comes from the
# bootstrap sample and the feature subset. Confirm against your scikit-learn version.
rfr = RandomForestRegressor(n_estimators=500,
                            min_samples_leaf=5, max_depth=900,
                            n_jobs=-1, random_state=29, bootstrap=True, verbose=1)
# Alternative: explicit bagging of decision trees
# (also needs `from sklearn.tree import DecisionTreeRegressor`):
# rfr = BaggingRegressor(
#                 DecisionTreeRegressor(splitter='best', min_samples_leaf=5, max_depth=900),
#                 n_estimators=10, max_samples=1.0,
#                 n_jobs=-1, bootstrap=True, random_state=29, verbose=1)


from sklearn.model_selection import cross_validate, ShuffleSplit
# n_splits is the number of times you split data after shuffling
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)
# cv could be a fixed number of partitions but there would be no shuffling in that case
# it will just rotate on partitions (k-1) parts and 1 part for cross-val
cv_results_forest = cross_validate(rfr, X, y=y, cv=cv,
                                   scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
# The scorer returns the NEGATED MSE, so flip the sign once and reuse the value.
mean_mse = -np.mean(cv_results_forest['test_score'])
print('MSE (mean cross-validation) = {:.4f}'.format(mean_mse))
print('RMSE (mean cross-validation) = {:.4f}'.format(np.sqrt(mean_mse)))
# 500 trees => RMSE 2633.5295, 8h
# 20 trees => RMSE 2635.2640, 20 min
# 10 trees => RMSE 2637

# NOTE: Out-Of-Bag samples only tell you about the performance of each tree on aggregate and
# not of the entire random forest, so the OOB score won't be a very good indicator.
# Fit the estimator itself on all the data so that `rfr` is a trained model afterwards.
rfr.fit(X, y)
print('Training Done')

In [None]:
# feature importance shows Product categories are the best indicator followed by Product ID
# and user ID (barely)
# _, axrf = plt.subplots()
# axrf.plot(rfr.feature_importances_)
# rfr.estimators_
# SAVE MODEL
# from sklearn.externals import joblib  # scikit-learn < 0.23 only; on newer versions use `import joblib`
# joblib.dump(rfr, 'RandomForest_500_Model.pkl')
# LOAD MODEL
# rfr = joblib.load('RandomForest_500_Model.pkl')