# Load features and targets

In [1]:
# load features
import pickle
# NOTE: unpickling executes arbitrary code embedded in the file; only load pickles
# that were produced by this project's own feature-extraction step.
# The context manager guarantees the file handle is closed even if loading fails.
with open("Onehotfeatures.pkl", "rb") as features_file:
    features = pickle.load(features_file)

# load associated targets
from numpy import load
y = load('target.npy')

Choose features and prepare data for scikit-learn prototyping

In [2]:
# names of the feature blocks fed to the model
# (alternative that was tried: ['User_ID', 'Product_ID', 'Gender', 'Prod_cat123'])
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']
# every entry of `features` is indexed as entry[0] = data block, entry[1] = block name;
# keep the data blocks whose name was selected above, in their original order
X_features = tuple(
    entry[0]
    for entry in features
    if entry[1] in imp_feature
)

In [3]:
# stack the selected feature blocks side by side into one sparse design matrix
from scipy.sparse import hstack
X = hstack( X_features )
# display the dimensions and the concrete matrix type as a sanity check
X.shape, type(X)

((550071, 9996), scipy.sparse.coo.coo_matrix)

# Ridge Regularization
Learning curves from the linear regression model (see associated notebook) show that there is not much of a gap between training and validation error, so overfitting is minimal. Adding regularization should therefore not bring a big improvement.

### Ridge - Grid search on regularization constant

Create model [Sklearn Ridge](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) 

In [4]:
from sklearn.linear_model import Ridge
# Linear least squares with l2 regularization ('Ridge').
# The closed-form 'svd' solver is NOT what is used here: X is sparse and fit_intercept is left
# at its default, so the iterative 'sag' solver (Stochastic Average Gradient, see doc) is selected.
# 'sag' is a stochastic solver, hence random_state is pinned to make fits repeatable.
# max_iter and tol are left at their defaults; tuning them can speed up the search
# but the solver may then stop before reaching the optimal solution.
model_ridge = Ridge(random_state=29, solver='sag')

Create parameter grid

In [5]:
import numpy as np
# Coarse exploratory grid: the powers of ten 1 and 10, followed by three times each (3 and 30).
decades = 10 ** np.arange(0, 2)
al = np.concatenate([decades, 3 * decades])
# The search started from `al` and was then narrowed down around alpha = 5.
param_grid = [dict(alpha=[4, 5, 6])]
param_grid

[{'alpha': [4, 5, 6]}]

Run grid search

In [6]:
# Grid search on alpha
from sklearn.model_selection import GridSearchCV, ShuffleSplit

# ShuffleSplit reshuffles the data and draws a fresh train/validation split n_splits times;
# here: 5 splits, each holding out one fifth of the rows, with a fixed seed.
# (cv could also be a plain number of partitions, but then there is no shuffling:
#  it just rotates over k-1 training parts and 1 validation part.)
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

ridge_grid = GridSearchCV(
    estimator=model_ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# run grid search
ridge_grid.fit(X, y)
# NOTE(review): an earlier note here said "alpha = 3 is optimal with default tol and max_iter";
# 3 is not part of the narrowed grid, so that note looks stale - rely on best_estimator_ instead.

# Show winning parameters (best_params_ / cv_results_ hold the details if needed)
ridge_grid.best_estimator_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   19.0s finished


Ridge(alpha=5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=29, solver='sag', tol=0.001)


Cross-validation best results

In [7]:
# best_score_ is the mean cross-validated score of the winning parameter value;
# the scorer is the negated MSE, so flip the sign once to get the (lowest) MSE back
best_cv_mse = -ridge_grid.best_score_
print( 'MSE, best param, mean cross-val = {:.4f}'.format(best_cv_mse) )
print( 'RMSE, best param, mean cross-val = {:.4f}'.format(np.sqrt(best_cv_mse)) )

MSE, best param, mean cross-val = 6251711.1638
RMSE, best param, mean cross-val = 2500.3422


Entire training set results

In [8]:
from sklearn.metrics import mean_squared_error
# after the search, ridge_grid has been refit on the entire training set with the best
# parameters, so predict() goes through that refit model
y_pred_grid = ridge_grid.predict(X)
# training-set metrics, to be compared with the cross-validation metrics
mse_train_grid = mean_squared_error(y, y_pred_grid)
print('MSE (whole training set) = {:.4f}'.format(mse_train_grid))
print('RMSE (whole training set) = {:.4f}'.format(np.sqrt(mse_train_grid)))

MSE (whole training set) = 6031510.6500
RMSE (whole training set) = 2455.9134


### Ridge - Single run

Cross-validation

In [9]:
# create ridge model (or use the one above)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate, ShuffleSplit

model_ridge = Ridge(solver='sag', alpha=5, random_state=29)

# same splitting scheme as for the grid search: 5 reshuffled splits, one fifth held out each time
# (an integer cv would rotate over fixed partitions without any shuffling)
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

# run cross-validation to get the expected error
cv_results = cross_validate(
    model_ridge, X, y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# print results: scores are negated MSE, so average them and flip the sign once
cv_mse = -np.mean(cv_results['test_score'])
print('MSE (mean cross-validation) = {:.4f}'.format(cv_mse))
print('RMSE (mean cross-validation) = {:.4f}'.format(np.sqrt(cv_mse)))

MSE (mean cross-validation) = 6251711.1638
RMSE (mean cross-validation) = 2500.3422


Run again on entire training set

In [10]:
from sklearn.metrics import mean_squared_error

# fit on entire dataset
model_ridge.fit(X, y)

# metrics on the entire dataset: the training error is expected to come out lower
# (i.e. to score better) than the cross-validation estimate
y_pred = model_ridge.predict(X)
mse_train = mean_squared_error(y, y_pred)
print('\nMSE (whole training set) = {:.4f}'.format(mse_train))
print('RMSE (whole training set) = {:.4f}'.format(np.sqrt(mse_train)))
# Learning curves on linear model yielded: 2445 train < RMSE < 2520 test
# Learning curves on linear model yielded: 2445 train < RSME < 2520 test

Training done

MSE (whole training set) = 6031510.6500
RMSE (whole training set) = 2455.9134


# Run model on test set

Read test set in memory

In [11]:
import pandas as pd
# path of the test-set CSV, relative to the notebook's working directory
filename = './test_HujdGe7/test.csv'
# raw test rows; used later both for feature extraction and for the id columns of the submission
df_test = pd.read_csv(filename)

Recover encoders from file and derive one-hot encoded features

In [12]:
# Load encoders
import pickle
# NOTE: unpickling executes arbitrary code embedded in the file; only load pickles
# that were produced by this project's own encoding step.
# Context managers make sure both file handles are closed, even if loading fails.
with open("Onehotencoders.pkl", "rb") as encoders_file:
    encoders = pickle.load(encoders_file)
with open("Category_encoders.pkl", "rb") as catcoders_file:
    catcoders = pickle.load(catcoders_file)

In [13]:
import importlib
from utils import fextract as ft
# reload is necessary if one makes changes in fextract: a module is only loaded once per kernel,
# so this forces the edited version to be picked up.
importlib.reload(ft)

# get one-hot encoded features and their names for the test rows, using the saved encoders
# NOTE(review): presumably returns entries shaped like the training `features`
# (data block at index 0, block name at index 1), since they are indexed that way later - TODO confirm
features_test = ft.prepare_Data(df_test, (catcoders, encoders))

Select necessary features (they must match the features the model was trained on)

In [14]:
# keep the test-set data blocks whose name is in imp_feature, preserving their order
X_features_test = tuple(
    entry[0]
    for entry in features_test
    if entry[1] in imp_feature
)
# stack them side by side, the same way the training matrix was built
X_test = hstack(X_features_test)
# check shape
X_test.shape

(233599, 9996)

Make predictions and save them to file

In [15]:
# predict with the single-run Ridge model (alpha=5) that was fit on the whole training set
y_pred_test = model_ridge.predict(X_test)

In [16]:
# format result and save to csv for submission
# Select the id columns with a list and take an explicit copy, so that adding the
# 'Purchase' column below modifies an independent frame (no SettingWithCopyWarning,
# and df_test itself is left untouched).
df_results = df_test[['User_ID', 'Product_ID']].copy()
# predictions may come back as (n,) or (n, 1) depending on the target's shape;
# flatten them so a plain 1-D column is assigned
df_results['Purchase'] = y_pred_test.ravel()
df_results.to_csv('Submission_Ridge.csv', index=False)