# Load features and targets

In [1]:
# load features
# `features` is consumed further down as a sequence whose entries are indexed
# as entry[0] (a matrix passed to hstack) and entry[1] (the feature name).
import pickle
# NOTE(review): unpickling runs arbitrary code from the file; only load pickles
# that were produced by this project.
# The context manager closes the file handle as soon as the object is loaded
# (the bare open() call left it open until garbage collection).
with open("Onehotfeatures.pkl", "rb") as features_file:
    features = pickle.load(features_file)

# load associated targets
from numpy import load
y = load('target.npy')

Choose features and prepare data for scikit-learn prototyping

In [2]:
# names of the feature groups that go into the model
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']
# alternative selection, left here for reference:
# imp_feature = ['User_ID', 'Product_ID', 'Gender', 'Prod_cat123']

# every entry of `features` carries its matrix at index 0 and its name at index 1;
# first pick the entries whose name was selected, then keep only their matrices
selected_entries = [entry for entry in features if entry[1] in imp_feature]
X_features = tuple(entry[0] for entry in selected_entries)

In [3]:
from scipy.sparse import hstack

# put the selected feature blocks side by side in one sparse matrix
X = hstack(X_features)

# show the resulting dimensions and the container type
(X.shape, type(X))

((550071, 9996), scipy.sparse.coo.coo_matrix)

# Decision Tree
Trying decision trees, very easy to overfit if you don't specify hyperparameters.

### Decision Tree - Grid search

The following hyperparameters control our tree growth.<br>
Possible order of tuning, from the most drastic constraint to the least effective one:
1. **`max_depth`**, maximum depth allowed for the tree
2. **`min_samples_leaf`**, minimum number of samples required for a node to become a leaf; in essence, it limits the number of splits.
3. **`min_samples_split`**, minimum number of samples a node must have to be split (tuned by cv)
4. **`max_features`**, maximum number of features considered for a split (use sqrt or 30-40% of the total number of features)

Notes: 
 - `min_samples_leaf` and `min_weight_fraction_leaf` are redundant. Use one or the other,
 they have the same effect. The value depends on your data.
 - `max_depth` and `max_leaf_nodes` are redundant. `max_leaf_nodes` takes precedence over `max_depth`: 
if `max_leaf_nodes` is not None, it is used and `max_depth` is ignored. Tuned by cv.

[Sklearn Decision Tree Regressor](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html) 

Define parameter grid

In [4]:
# DEFINE PARAMETER VALUES
import numpy as np

# candidate values for max_depth: 900 to 1025 in steps of 25
# (other ranges are left commented out for reference)
# depth_arr = np.array([ 10**x for x in range(1,3+1)])
# depth_arr = np.array([ x for x in range(100,1000+100, 100)])
depth_arr = np.arange(900, 1026, 25)

# candidate values for min_samples_leaf: 4, 5 and 6
# (only referenced by the commented-out grid below)
# min_leaf_arr = np.array([ x for x in range(0, 100+10, 10)])
# min_leaf_arr[0]=1
min_leaf_arr = np.arange(4, 6 + 1)
# min_leaf_arr = min_leaf_arr[1:]

# grid handed to the search: every depth candidate, min_samples_leaf fixed to 5
# param_grid_tree = [{'max_depth': depth_arr, 'min_samples_leaf': min_leaf_arr }]
param_grid_tree = [dict(max_depth=depth_arr, min_samples_leaf=[5])]
param_grid_tree

[{'max_depth': array([ 900,  925,  950,  975, 1000, 1025]),
  'min_samples_leaf': [5]}]

Run grid search

In [5]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

# base regressor; max_depth and min_samples_leaf are supplied by the grid
tree_reg = DecisionTreeRegressor(random_state=29)

# n_splits shuffled train/validation splits, each holding out 1/5 of the rows
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

# an integer cv would also work, but then the data is not shuffled: the search
# simply rotates over the partitions (k-1 parts to train, 1 part to validate)
tree_reg_grid = GridSearchCV(
    estimator=tree_reg,
    param_grid=param_grid_tree,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

# run grid search
tree_reg_grid.fit(X, y)

# Show winning parameters
tree_reg_grid.best_estimator_
# other attributes worth inspecting:
# tree_reg_grid.best_params_
# tree_reg_grid.cv_results_

# notes from previous runs:
# {'min_samples_leaf': 5, 'max_depth': 900}
# finer search on max depth yields 925

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.8min finished


DecisionTreeRegressor(criterion='mse', max_depth=925, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=29, splitter='best')

Cross-validation best results

In [6]:
# the search scores candidates with the negated MSE, so flipping the sign of the
# best score gives the lowest MSE (averaged over the cross-validation splits)
best_cv_mse = -tree_reg_grid.best_score_
print( 'MSE, best param, mean cross-val = {:.4f}'.format(best_cv_mse) )
print( 'RMSE, best param, mean cross-val = {:.4f}'.format(np.sqrt(best_cv_mse)) )

MSE, best param, mean cross-val = 7004739.5383
RMSE, best param, mean cross-val = 2646.6468


Performance on entire training set

In [7]:
from sklearn.metrics import mean_squared_error

# after the search, tree_reg_grid is refit on the entire training set with the
# best parameters, so predict() goes through that refit model
y_pred_grid = tree_reg_grid.predict(X)

# training-set metrics, to be compared with the cross-validation metrics
train_mse_grid = mean_squared_error(y, y_pred_grid)
print('MSE (whole training set) = {:.4f}'.format(train_mse_grid))
print('RMSE (whole training set) = {:.4f}'.format(np.sqrt(train_mse_grid)))

MSE (whole training set) = 6670563.7151
RMSE (whole training set) = 2582.7434


### Decision Tree - Single run

Cross-validation

In [8]:
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

# single regression tree with fixed hyperparameters (no search this time)
tree_reg = DecisionTreeRegressor(max_depth=925, min_samples_leaf=5, random_state=29)

# n_splits shuffled train/validation splits, each holding out 1/5 of the rows
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

# an integer cv would also work, but then the data is not shuffled: it simply
# rotates over the partitions (k-1 parts to train, 1 part to validate)
cv_results_tree = cross_validate(
    tree_reg,
    X,
    y=y,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# scores are negated MSE values: average them and flip the sign
mean_cv_mse = -np.mean(cv_results_tree['test_score'])
print('MSE (mean cross-validation) = {:.4f}'.format(mean_cv_mse))
print('RMSE (mean cross-validation) = {:.4f}'.format(np.sqrt(mean_cv_mse)))

MSE (mean cross-validation) = 7004739.5383
RMSE (mean cross-validation) = 2646.6468


Run on entire training set

In [9]:
from sklearn.metrics import mean_squared_error

# cross_validate only trained on the training part of each split;
# here the tree is fit once on the entire dataset
tree_reg.fit(X, y)

# metrics on the data the tree was trained on: the error is expected to be
# lower (i.e. better) than the cross-validation error
y_pred = tree_reg.predict(X)
train_mse = mean_squared_error(y, y_pred)
print('\nMSE (whole training set) = {:.4f}'.format(train_mse))
print('RMSE (whole training set) = {:.4f}'.format(np.sqrt(train_mse)))


MSE (whole training set) = 6670563.7151
RMSE (whole training set) = 2582.7434


# Run model on test set

Read test set in memory and recover encoders from file and derive one-hot encoded features

In [10]:
from utils import fextract as ft
import pandas as pd
import pickle
import importlib

# read the raw test set
filename = './test_HujdGe7/test.csv'
df_test = pd.read_csv(filename)

# Load encoders
# NOTE(review): unpickling runs arbitrary code from the file; only load pickles
# that were produced by this project.
# The context managers close each file handle right after loading
# (the bare open() calls left them open until garbage collection).
with open("Onehotencoders.pkl", "rb") as encoders_file:
    encoders = pickle.load(encoders_file)
with open("Category_encoders.pkl", "rb") as catcoders_file:
    catcoders = pickle.load(catcoders_file)

# reload is necessary if one makes changes in fextract. Indeed modules are loaded once only, this forces a reload.
importlib.reload(ft)

# get one-hot encoded features and their names
# (the encoders are passed as a (catcoders, encoders) pair, in that order)
features_test = ft.prepare_Data(df_test, (catcoders, encoders))

Select necessary features (must match your feature model obviously)

In [11]:
# keep the test matrices whose name (index 1 of each entry) is in imp_feature
selected_test_entries = [entry for entry in features_test if entry[1] in imp_feature]
X_features_test = tuple(entry[0] for entry in selected_test_entries)

# stack the selected blocks side by side in one sparse matrix
X_test = hstack(X_features_test)

# check shape
X_test.shape

(233599, 9996)

Make predictions and save them to file

In [12]:
# predict the target for every row of the test matrix
y_pred_test = tree_reg.predict(X_test)

# submission layout: the two identifier columns followed by the prediction
id_columns = ('User_ID', 'Product_ID')
df_results = df_test.loc[:, id_columns]
df_results['Purchase'] = y_pred_test.reshape(-1, 1)

# write the csv without the DataFrame index column
df_results.to_csv('Submission_DecisionTree.csv', index=False)