# Load Data post feature extraction

In [46]:
# Load the feature blocks saved by the feature-extraction step.
# from scipy.sparse import load_npz
# X = load_npz('features.npz')
import pickle
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# was produced by a trusted run of the feature-extraction step.
# The `with` block closes the file handle as soon as the load finishes.
with open("Onehotfeatures.pkl", "rb") as features_file:
    features = pickle.load(features_file)
# load associated targets
from numpy import load
y = load('target.npy')

Prepare data

In [47]:
# Show what is stored at index 1 of every entry in `features` (the feature names).
print([entry[1] for entry in features])

['User_ID', 'Product_ID', 'Gender', 'Marital_Status_Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Prod_cat123', 'Gender_Prod_cat123']


In [49]:
# Feature groups to feed the model, identified by name.
imp_feature = ['User_ID', 'Product_ID', 'Gender_Prod_cat123']
# Keep the entries whose name (element 1) is listed above, then collect
# their element 0, which is what gets stacked into the design matrix.
selected = [entry for entry in features if entry[1] in imp_feature]
X_features = tuple(entry[0] for entry in selected)

# Earlier variants, kept for reference:
# X_features_filt = tuple( X_features[i]  for i,f in enumerate(feature_names) if f in imp_feature )
## Alternative: unpack via list of indices
# from operator import itemgetter
# %timeit itemgetter(*[i  for i,f in enumerate(feature_names) if f in imp_feature])(X_features)

In [50]:
from scipy.sparse import hstack
# Stack the selected feature blocks side by side into a single design matrix.
X = hstack(X_features)
# Cell output: the matrix dimensions and its concrete class.
(X.shape, type(X))

((550068, 9993), scipy.sparse.coo.coo_matrix)

# Linear Regression 
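Scikit-learn fits this model with a least-squares solver rather than gradient descent. The solution it targets is the one given by the normal equation: $$ \hat{\theta} = (X^T \cdot X)^{-1} \cdot X^T \cdot y$$ 
The model parameters are therefore optimal for the training data (i.e. there is no gradient descent and no learning rate to tune).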

In [5]:
from sklearn.linear_model import LinearRegression
# fit_intercept=True adds the bias/intercept term so we don't have to handle it ourselves.
# n_jobs doesn't speed anything up here because there is only one target (Purchase).
# `normalize` is deliberately not passed: False was its default value, and the
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2, so passing it
# raises a TypeError on current releases.
model = LinearRegression(n_jobs=-1, fit_intercept=True)

Run cross validation on training data

In [6]:
# Estimate the generalisation error with shuffled cross-validation.
from sklearn.model_selection import cross_validate, ShuffleSplit
import numpy as np

# n_splits is how many times the data is reshuffled and split;
# each split holds out one fifth of the rows for scoring.
cv = ShuffleSplit(n_splits=5, test_size=1/5, random_state=4)

# Passing a plain integer as `cv` would also work, but then there is no
# shuffling: it only rotates over fixed partitions (k-1 parts to train,
# 1 part to validate).
cv_results_linear = cross_validate(
    model,
    X,
    y=y,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# The scorer reports negated MSE, so flip the sign after averaging.
MSE_Xval = -np.mean(cv_results_linear['test_score'])
RMSE_Xval = np.sqrt(MSE_Xval)
print('MSE (mean cross-validation) = {:.4f}'.format(MSE_Xval))
print('RMSE (mean cross-validation) = {:.4f}'.format(RMSE_Xval))

MSE (mean cross-validation) = 6283021.9802
RMSE (mean cross-validation) = 2506.5957


Re-run model on entire training set and get predictions

In [7]:
from sklearn.metrics import mean_squared_error

# Refit on every training instance, then predict those same rows.
model.fit(X, y)
y_pred = model.predict(X)

# Training-set metrics, to be compared with the cross-validation figures.
mse_train = mean_squared_error(y, y_pred)
print('MSE (whole training set) = {:.4f}'.format(mse_train))
print('RMSE (whole training set) = {:.4f}'.format(np.sqrt(mse_train)))

MSE (whole training set) = 6017192.8707
RMSE (whole training set) = 2452.9967


### Am I overfitting (high variance) or underfitting (high bias)?

There is a gap of about 54 RMSE between cross-validation (≈2507) and the training set (≈2453). This is small compared to the model's RMSE of about 2500, but it does indicate that the variance is slightly elevated. Consider using ridge regularization.

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

def plot_learning_curve(model, X, y, test_size=0.3, random_state=42, n_points=100):
    """Plot training and validation RMSE against the number of training samples.

    The data is split once into a training part and a validation part. The
    model is then refit on growing prefixes of the training part and scored on
    both the prefix it was fit on and the whole validation part.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is refit in place at every step, so on return it is left fitted on
        the largest training prefix, not on whatever it was fitted on before.
    X : feature matrix accepted by ``train_test_split``.
    y : targets aligned with the rows of ``X``.
    test_size : fraction of rows held out for validation.
    random_state : seed used for the train/validation split.
    n_points : number of training-set sizes to evaluate.
    """
    # NOTE(review): `mean_squared_error` and `plt` are not imported in this
    # cell; they are expected to be in scope from earlier cells.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    train_errors, val_errors = [], []
    # Sizes run from a single sample up to the whole training part.
    train_sizes = np.linspace(1, X_train.shape[0], n_points, dtype='int64')
    for m in train_sizes:
        # train with the first m samples
        model.fit(X_train[:m], y_train[:m])
        # predict the samples just trained on, and the validation samples
        y_train_pred = model.predict(X_train[:m])
        y_val_pred = model.predict(X_val)
        # store MSE; the square root is taken once, at plotting time
        train_errors.append(mean_squared_error(y_train[:m], y_train_pred))
        val_errors.append(mean_squared_error(y_val, y_val_pred))
    _, ax = plt.subplots()
    ax.plot(train_sizes, np.sqrt(train_errors), 'r-+', linewidth=2, label='train')
    ax.plot(train_sizes, np.sqrt(val_errors), 'b-', linewidth=2, label='validation')
    ax.set_xlabel('Training set size')
    ax.set_ylabel('RMSE')
    ax.set_title('Learning curve')
    # Without this call the `label=` values above are never displayed.
    ax.legend()

plot_learning_curve(model,X,y)

### Examples where the error is very large or very low
The best and worst examples have errors whose exponents are of the same magnitude: the squared error ranges from about 1e-8 to about 1e8.

In [None]:
import pandas as pd
import numpy as np

# Per-sample squared error, kept in the original row order so that it can be
# used as a row mask on the dataframe.
sqerr = (y - y_pred) ** 2
# square error, ascending order (only for looking at the error distribution)
sqerr_asc = np.sort(sqerr)

# read dataframe post feature extraction
# NOTE(review): `df` is not created in this cell; it must already be loaded,
# with its rows in the same order as `y`, for the mask below to be meaningful.

# At square_error = 5e7, error rises dramatically
# The mask uses the unsorted errors: a mask built from the sorted array would
# flag the trailing rows of `df`, not the rows that were actually mispredicted.
df_worst = df.loc[sqerr > 5e7, :].copy()

# look at a few examples
df_worst.describe(include='all')

In [None]:
def print_rare_cases(Rare):
    """Print how many of the rare categories in ``Rare`` occur in ``df_worst``.

    Parameters
    ----------
    Rare : pandas Series whose index holds the rare category values.
        The ``df_worst`` column to inspect is ``Rare.name`` when that is a
        column of ``df_worst``; otherwise ``Rare.index.name`` is used, which is
        where pandas >= 2.0 puts the column name in a ``value_counts()`` result.

    Reads the module-level ``df_worst`` dataframe. Prints one line and
    returns None.
    """
    column = Rare.name if Rare.name in df_worst.columns else Rare.index.name
    n_rare = Rare.count()
    # Count distinct rare categories present in the worst subset. Counting rows
    # instead would mix units with `n_rare` and could exceed 100%.
    n_high_err = Rare.index.isin(df_worst[column].unique()).sum()
    # Guard the empty case so no 0/0 is evaluated.
    share = n_high_err / n_rare * 100 if n_rare else 0.0
    print('Number of rare {} having high error = {} out of {} ({:.2f}%)'.format(
        column, n_high_err, n_rare, share))

# A category is "rare" when it occurs fewer times than the cut-off chosen for
# its column.

# Product IDs: almost none of the rare ones correspond to the worst cases.
RareProdID = ProdID_distrib.loc[ProdID_distrib < 5]
print_rare_cases(RareProdID)

# Users: rare is defined as fewer than 10-16 occurrences
# (see the distribution of users); 16 is the cut-off used here.
RareUsers = UserID_distrib.loc[UserID_distrib < 16]
print_rare_cases(RareUsers)

# Product category (Prod cat123).
RareProdCat123 = ProdCat_distrib.loc[ProdCat_distrib < 100]
print_rare_cases(RareProdCat123)

# Total number of instances flagged as high error.
n_worst = df_worst['Purchase'].count()
print('Number of worst instances = {}'.format(n_worst))

In [None]:
# Bar chart of every categorical column within the worst-error subset.
cat_list = ['Gender', 'Age', 'Occupation',
            'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status']
_ , axw = plt.subplots(3, 2)
axw = axw.flatten()
for k, colname in enumerate(cat_list):
    dis = df_worst.loc[:, colname]
    dis.value_counts().plot(kind='bar', ax=axw[k])
    axw[k].set_title(dis.name)
    axw[k].set_ylabel('Count')
plt.tight_layout()


def _plot_rank_counts(frame, column):
    """Plot the value counts of ``frame[column]`` from most to least frequent.

    Returns the counts Series and the Axes the curve was drawn on.
    """
    # value_counts() sorts in descending order by default
    counts = frame[column].value_counts()
    _, ax = plt.subplots()
    # A line over the rank is used instead of one bar per distinct value.
    ax.plot(range(counts.count()), counts.values)
    ax.set_title(column)
    ax.set_xlabel('Rank (most frequent first)')
    ax.set_ylabel('Occurrences in worst subset')
    return counts, ax


# One rank plot per high-cardinality column; the same names are bound as
# before so that later cells can keep using them.
UserWorst_distrib, axw2 = _plot_rank_counts(df_worst, 'User_ID')
ProdIDWorst_distrib, axw3 = _plot_rank_counts(df_worst, 'Product_ID')
ProdCatWorst_distrib, axw4 = _plot_rank_counts(df_worst, 'Prod_cat123')

# Squared errors in ascending order: shows where the error takes off.
_ , axsq = plt.subplots()
axsq.plot(sqerr_asc)
axsq.set_title('Error Per Sample vs Sample Index')
axsq.set_xlabel('Sample rank (ascending squared error)')
axsq.set_ylabel('Squared error')

plt.show()