In [4]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from surprise import *
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBaseline
from surprise import accuracy
from surprise import BaselineOnly
from surprise.model_selection import train_test_split
from helpers import *
from surprise.model_selection import GridSearchCV
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data loading

In [5]:
from helpers import *

# Directory holding the input CSV files (relative to the notebook's working directory).
DATA_PATH = 'data/'

# Load the training ratings through the project helper from `helpers`.
# The first result is later handed to Surprise's train_test_split.
# NOTE(review): the meaning of the trailing 0 argument is defined in helpers — confirm.
dataset_base, dataframe = load_data_surprise(f"{DATA_PATH}data_train.csv", 0)

In [6]:
from surprise.model_selection import train_test_split
# Hold out 10% of the ratings for local evaluation; the fixed random_state
# makes the split repeatable.
trainset, testset = train_test_split(dataset_base, test_size=0.1, random_state=0)

# Load the sample submission file and turn every entry it contains into a
# (user, item, rating) triple that the models will be asked to predict.
sample_dataset, sample_dataframe = load_data_surprise(f"{DATA_PATH}sampleSubmission.csv", 0)
predset = (
    sample_dataset
    .build_full_trainset()
    .build_testset()
)

# Third field of each held-out triple, kept as the reference ratings for scoring.
realval_testset = [entry[2] for entry in testset]

In [7]:
# Global-mean baseline (helper imported from `helpers`): returns predictions for
# the held-out set and for the submission set.
GM_test, GM_pred = global_mean_pred(trainset, testset, predset)

# The scoring helper returns a nested result; [0][0] pulls out the value to print.
print(f"RMSE :{calculate_rmse_round(realval_testset, GM_test.T)[0][0]!s}")

RMSE :1.1272860586714628


In [8]:
# Per-user mean baseline (helper imported from `helpers`).
UserM_test, UserM_pred = user_mean_pred(trainset, testset, predset)
print(f"RMSE :{calculate_rmse_round(realval_testset, UserM_test.T)[0][0]!s}")

# Per-item mean baseline (helper imported from `helpers`).
ItemM_test, ItemM_pred = item_mean_pred(trainset, testset, predset)
print(f"RMSE :{calculate_rmse_round(realval_testset, ItemM_test.T)[0][0]!s}")

RMSE :1.1177375701359025
RMSE :1.0700549858059532


In [9]:
# Baseline estimates fitted with ALS: 50 epochs, separate regularisation
# strengths for the user biases (reg_u) and item biases (reg_i).
bsl_options = {
    'method': 'als',
    'n_epochs': 50,
    'reg_u': 10,
    'reg_i': 15,
}

# train_model (from `helpers`) fits the algorithm on trainset and returns
# predictions for the held-out set and for the submission set.
Base_test, Base_pred = train_model(
    BaselineOnly(bsl_options=bsl_options),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, Base_test.T)[0][0]!s}")

Fit model...
Estimating biases using als...
Make predictions...
RMSE :1.041290035228425


In [48]:
# Same baseline model, but with the biases fitted by SGD using the library defaults.
# Note: this rebinds bsl_options, replacing the ALS settings used earlier.
bsl_options = {'method': 'sgd'}

Basesgd_test, Basesgd_pred = train_model(
    BaselineOnly(bsl_options=bsl_options),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, Basesgd_test.T)[0][0]!s}")

Fit model...
Estimating biases using sgd...
Make predictions...
RMSE :1.0451830262952428


In [53]:
# Matrix factorisation (SVD) with 50 latent factors and a learning rate of 0.005
# for all parameters; random_state pins the initialisation.
SVD_test, SVD_pred = train_model(
    SVD(n_factors=50, lr_all=0.005, random_state=1),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, SVD_test.T)[0][0]!s}")

Fit model...
Make predictions...
RMSE :1.0554927990941678


In [13]:
# SVD++ with library-default hyper-parameters; random_state pins the initialisation.
SVDpp_test, SVDpp_pred = train_model(
    SVDpp(random_state=1),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, SVDpp_test.T)[0][0]!s}")

Fit model...
Make predictions...
RMSE :1.0536881096457897


In [14]:
# SlopeOne collaborative filtering; the class is brought in by the wildcard
# import from surprise at the top of the notebook.
SlopeOne_test, SlopeOne_pred = train_model(
    SlopeOne(),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, SlopeOne_test.T)[0][0]!s}")

Fit model...
Make predictions...
RMSE :1.0425458497499698


In [None]:
# Biased non-negative matrix factorisation with 7 factors; random_state pins
# the initialisation.
# NOTE(review): the saved notebook shows no execution count for this cell and its
# output stops after "Fit model...", yet NMF_test / NMF_pred are consumed by the
# blending cells below — re-run to completion before trusting downstream numbers.
NMF_test, NMF_pred = train_model(
    NMF(biased=True, n_factors=7, random_state=1),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, NMF_test.T)[0][0]!s}")

Fit model...


In [16]:
# KNN with baseline correction, at most 30 neighbours. No sim_options are given,
# so the library defaults apply (user-based similarities).
KNNu_test, KNNu_pred = train_model(
    KNNBaseline(k=30),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, KNNu_test.T)[0][0]!s}")

Fit model...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Make predictions...
RMSE :1.0527644284531106


In [17]:
# Item-based variant of the KNN baseline model: similarities are computed
# between items instead of between users.
sim_options = {'user_based': False}

KNNi_test, KNNi_pred = train_model(
    KNNBaseline(k=30, sim_options=sim_options),
    trainset,
    testset,
    predset,
)
print(f"RMSE :{calculate_rmse_round(realval_testset, KNNi_test.T)[0][0]!s}")

Fit model...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Make predictions...
RMSE :1.0372595733528975


In [75]:
# Stack the held-out predictions of every model side by side (joined along axis 1).
# The order here must match the order used for the submission-set stack, because
# the fitted blend weights are applied positionally.
# NOTE(review): ItemM_test is computed earlier but is not part of this stack —
# confirm whether the omission is intentional.
pred = np.concatenate(
    [
        GM_test,
        UserM_test,
        Base_test,
        Basesgd_test,
        SVD_test,
        SVDpp_test,
        SlopeOne_test,
        KNNu_test,
        KNNi_test,
        NMF_test,
    ],
    axis=1,
)

In [76]:
# Compute the blending coefficients from the stacked held-out predictions and the
# true held-out ratings. `coeff` is imported from `helpers`.
# NOTE(review): presumably one weight per stacked model (the printed vector has
# 10 entries, matching the 10 stacked arrays) — confirm against helpers.coeff.
x=coeff(pred,realval_testset)
print(x)

[ 0.09623969 -0.09286618 -0.0824033  -0.44546868  0.17419018  0.31015904
  0.58216986  0.11737435  0.3485882  -0.00851098]


In [77]:
# Score the weighted blend on the held-out ratings; the value is shown as the cell output.
# NOTE(review): calculate_poud_rmse_round comes from `helpers`; presumably it applies
# the weights x to the stacked predictions, rounds, and computes RMSE — confirm.
calculate_poud_rmse_round(x,realval_testset,pred.T)

1.0271845379681213

In [57]:
# Stack the submission-set predictions of every model side by side (joined along
# axis 1). The order must match the held-out stack used to fit the blend weights,
# because the weights are applied positionally.
# NOTE(review): ItemM_pred is computed earlier but is not part of this stack —
# confirm whether the omission is intentional.
pred_t = np.concatenate(
    [
        GM_pred,
        UserM_pred,
        Base_pred,
        Basesgd_pred,
        SVD_pred,
        SVDpp_pred,
        SlopeOne_pred,
        KNNu_pred,
        KNNi_pred,
        NMF_pred,
    ],
    axis=1,
)

In [58]:
# Weighted blend of the stacked submission-set predictions.
pred_pound = x @ pred_t.T

# Round each blended score to the nearest rating and clamp it to the range [1, 5].
# Argument order in max/min is deliberate: on ties the rounded value itself is
# kept, so in-range results are passed through unchanged.
pred_final = [min(max(round(score), 1), 5) for score in pred_pound]

## 4.3 Write output submission to CSV

In [59]:
import csv

# Write the submission file: a header row, then one row per entry of predset.
# The row key joins the first two fields of the entry with an underscore, and the
# value is the rounded blended prediction at the same position in pred_final.
with open('submission.csv', 'w', newline='') as out_file:
    submission_writer = csv.writer(out_file)
    submission_writer.writerow(['Id', 'Prediction'])
    submission_writer.writerows(
        [entry[0] + '_' + entry[1], pred_final[row_idx]]
        for row_idx, entry in enumerate(predset)
    )

# 5. Analyse Submission

In [60]:
# Read the freshly written submission back in with the same helper used for the
# input files, so its contents can be inspected below.
# NOTE(review): the meaning of the trailing 0 argument is defined in helpers — confirm.
sub_dataset, sub_dataframe = load_data_surprise('submission.csv',0)


In [61]:
# Share of submitted predictions falling on each rating value.
# GroupBy.size() counts the rows per group directly; dividing by the total row
# count gives the same Series as applying a Python lambda to every group, without
# the per-group Python call and without the groupby.apply deprecation warning
# that pandas >= 2.2 emits.
g = sub_dataframe.groupby('Prediction')
num_sub = g.size() / len(sub_dataframe)
print(num_sub)

Prediction
1.0    0.000169
2.0    0.012051
3.0    0.230218
4.0    0.653010
5.0    0.104551
dtype: float64
