# Load features and targets

In [1]:
# load features
import pickle
# NOTE(review): unpickling can execute arbitrary code, so only load
# "Onehotfeatures.pkl" when it comes from a trusted source.
# The with-block closes the file handle as soon as the object has been read
# (the bare open() call used before left it open).
with open("Onehotfeatures.pkl", "rb") as features_file:
    features = pickle.load(features_file)

# load associated targets
from numpy import load
y = load('target.npy')

Choose features and prepare data for scikit-learn prototyping

In [2]:
# Names of the feature groups that are fed to the model.
# Alternative selection kept for reference:
# imp_feature = ['User_ID', 'Product_ID', 'Gender', 'Prod_cat123']
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']

# Each entry of `features` is indexed positionally: position 1 is compared
# against the selected names, position 0 is what gets kept.
X_features = tuple(
    entry[0]
    for entry in features
    if entry[1] in imp_feature
)

In [3]:
from scipy.sparse import hstack

# Stack the selected feature blocks side by side (column-wise) into one matrix.
X = hstack(blocks=X_features)

# Cell output: dimensions of X and its concrete type.
(X.shape, type(X))

((550068, 9993), scipy.sparse.coo.coo_matrix)

# Decision Tree
Trying decision trees, which overfit very easily if you don't constrain their hyperparameters.

### Decision Tree - Grid search

In [81]:
# DEFINE PARAMETER VALUES
import numpy as np

# Candidate max_depth values: 850, 875, ..., 975 (the stop value 1000 is excluded).
# Coarser candidate ranges kept for reference:
#   10**2, 10**3, 10**4
#   100, 200, ..., 1000
depth_arr = np.arange(850, 1000, 25)

# Candidate min_samples_leaf values: every integer from 1 to 20 inclusive.
# Coarser candidate range kept for reference: 1, 11, 21, ..., 101
min_leaf_arr = np.arange(1, 20 + 1)

# print arrays, one per line
print(depth_arr, min_leaf_arr, sep='\n')

[850 875 900 925 950 975]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


Possible order of tuning:
1. max_depth and min_samples_leaf 
2. min_samples_split
3. max_features

In [82]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

# Base regression tree: min_samples_leaf stays fixed at 5 because the grid
# below only varies max_depth.
tree_reg = DecisionTreeRegressor(random_state=29, min_samples_leaf=5)

# n_splits is how many times the data is reshuffled and split; each split
# holds out 1/5 of the samples for validation.
# An integer cv would give a fixed number of partitions without shuffling,
# simply rotating over (k-1) training parts and 1 validation part.
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

# Search space: max_depth only.
# Joint alternative kept for reference:
# param_grid_tree = [{'max_depth': depth_arr, 'min_samples_leaf': min_leaf_arr }]
param_grid_tree = [{'max_depth': depth_arr}]

tree_reg_grid = GridSearchCV(
    estimator=tree_reg,
    param_grid=param_grid_tree,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=cv,
)

# run grid search
tree_reg_grid.fit(X, y)

# best_score_ is the best mean test score under a negated-MSE metric,
# so the sign is flipped before reporting.
best_cv_mse = -tree_reg_grid.best_score_
print('MSE, best param, mean cross-val = {:.4f}'.format(best_cv_mse))
print('RMSE, best param, mean cross-val = {:.4f}'.format(np.sqrt(best_cv_mse)))

# The matching TRAIN scores (cv_results_['mean_train_score']) are not
# reported here; that reporting code was disabled.

# Recorded results:
#   optimal parameters: {'min_samples_leaf': 5, 'max_depth': 900}
#   finer search yields {'max_depth': 875}
print(tree_reg_grid.best_params_)

MSE, best param, mean cross-val = 7021371.1369
RMSE, best param, mean cross-val = 2649.7870
{'max_depth': 875}


### Decision Tree Single run

In [73]:
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.tree import DecisionTreeRegressor

# Regression tree with explicitly fixed hyperparameters.
tree_reg = DecisionTreeRegressor(max_depth=900, min_samples_leaf=5, random_state=29)

# n_splits is how many times the data is reshuffled and split; each split
# holds out 1/5 of the samples for validation.
# An integer cv would give a fixed number of partitions without shuffling,
# simply rotating over (k-1) training parts and 1 validation part.
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

cv_results_tree = cross_validate(
    estimator=tree_reg,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# Scores are negated MSE values: average them, then flip the sign.
mean_cv_mse = -np.mean(cv_results_tree['test_score'])
print('MSE (mean cross-validation) = {:.4f}'.format(mean_cv_mse))
print('RMSE (mean cross-validation) = {:.4f}'.format(np.sqrt(mean_cv_mse)))
# RMSE (mean cross-validation) = 2649.8007

# cross_validate only trains on the training part of each split, so fit one
# decision tree on the entire dataset here.
tree_reg.fit(X, y)
print('Training Done')

MSE (mean cross-validation) = 7021443.8239
RMSE (mean cross-validation) = 2649.8007
Training Done
