## Imports

In [68]:
import csv
import numpy as np
import scipy as sp
import pandas as pd
from run_helpers import *
from surprise import BaselineOnly, SVD, SVDpp, SlopeOne, NMF, KNNBaseline
from surprise.model_selection import train_test_split

## Loading data

In [18]:
# Directory holding the input CSV files (relative to the notebook).
DATA_PATH = 'data/'

# load_data_surprise comes from run_helpers and returns a pair; only the first
# element (the dataset) is used, the second one is discarded.
original_dataset, _ = load_data_surprise(
    DATA_PATH + 'data_train.csv',
    0,
)
sample_dataset, _ = load_data_surprise(
    DATA_PATH + 'sampleSubmission.csv',
    0,
)

## Generating sets

In [19]:
# Hold out 10% of the ratings for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
trainset, testset = train_test_split(
    original_dataset,
    test_size=0.1,
    random_state=0,
)

# Turn every entry of the sample-submission dataset into a test tuple: these
# are the entries the final model has to predict.
predset = sample_dataset.build_full_trainset().build_testset()

# Third field of each held-out entry, used as the ground truth when scoring.
realval_testset = [entry[2] for entry in testset]

## Sub-models
### Global mean

In [21]:
# global_mean_pred (run_helpers) returns one prediction array for the held-out
# split and one for the submission entries.
GM_test, GM_pred = global_mean_pred(trainset, testset, predset)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, GM_test.T)[0][0])
)

### User mean

In [29]:
# user_mean_pred (run_helpers) returns one prediction array for the held-out
# split and one for the submission entries.
UserM_test, UserM_pred = user_mean_pred(trainset, testset, predset)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, UserM_test.T)[0][0])
)

### Item mean

In [42]:
# item_mean_pred (run_helpers) returns one prediction array for the held-out
# split and one for the submission entries.
ItemM_test, ItemM_pred = item_mean_pred(trainset, testset, predset)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, ItemM_test.T)[0][0])
)

### Baseline

In [49]:
# Baseline estimates fitted with ALS: 50 epochs, regularisation of 10 for the
# user biases and 15 for the item biases.
bsl_options = dict(method='als', n_epochs=50, reg_u=10, reg_i=15)

# train_model (run_helpers) returns the predictions for the held-out split and
# for the submission entries.
Base_test, Base_pred = train_model(
    BaselineOnly(bsl_options=bsl_options),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, Base_test.T)[0][0])
)

Estimating biases using als...


### Baseline SGD

In [51]:
# Same baseline estimator, this time fitted with SGD and the library's default
# hyper-parameters.
bsl_options = dict(method='sgd')

# train_model (run_helpers) returns the predictions for the held-out split and
# for the submission entries.
Basesgd_test, Basesgd_pred = train_model(
    BaselineOnly(bsl_options=bsl_options),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, Basesgd_test.T)[0][0])
)

Estimating biases using sgd...
RMSE :1.0451830262952428


### SVD

In [54]:
# SVD with 50 factors and a learning rate of 0.005 for all parameters;
# random_state is pinned so the factor initialisation is reproducible.
SVD_test, SVD_pred = train_model(
    SVD(n_factors=50, lr_all=0.005, random_state=1),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, SVD_test.T)[0][0])
)

RMSE :1.0554927990941678


### SVD++

In [56]:
# SVD++ with the library's default hyper-parameters; random_state is pinned
# for reproducibility.
SVDpp_test, SVDpp_pred = train_model(
    SVDpp(random_state=1),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, SVDpp_test.T)[0][0])
)

RMSE :1.0536881096457897


### SlopeOne

In [59]:
# SlopeOne takes no hyper-parameters here.
SlopeOne_test, SlopeOne_pred = train_model(
    SlopeOne(),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, SlopeOne_test.T)[0][0])
)

RMSE :1.0425458497499698


### NMF

In [62]:
# Biased NMF with 7 factors; random_state is pinned for reproducibility.
NMF_test, NMF_pred = train_model(
    NMF(biased=True, n_factors=7, random_state=1),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, NMF_test.T)[0][0])
)

RMSE :1.0447195611225344


### KNN User based

In [64]:
# KNN on top of baseline estimates, using at most 30 neighbours. No
# sim_options are passed, so the library's default similarity settings apply.
KNNu_test, KNNu_pred = train_model(
    KNNBaseline(k=30),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, KNNu_test.T)[0][0])
)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0527644284531106


### KNN Item based

In [65]:
# Compute similarities between items rather than between users.
sim_options = dict(user_based=False)

# KNN on top of baseline estimates, using at most 30 neighbours.
KNNi_test, KNNi_pred = train_model(
    KNNBaseline(k=30, sim_options=sim_options),
    trainset,
    testset,
    predset,
)

# Score the held-out predictions against the true ratings.
print(
    "RMSE: "
    + str(calculate_rmse_round(realval_testset, KNNi_test.T)[0][0])
)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0372595733528975


## Final model

In [66]:
# Stack the held-out predictions of every sub-model side by side, one column
# group per model. The model order here must match the order used when
# building pred_t, because the blending weights are applied by position.
pred = np.concatenate(
    (
        GM_test,
        UserM_test,
        ItemM_test,
        Base_test,
        Basesgd_test,
        SVD_test,
        SVDpp_test,
        SlopeOne_test,
        KNNu_test,
        KNNi_test,
        NMF_test,
    ),
    axis=1,
)

In [69]:
# coeff comes from run_helpers; presumably it fits one blending weight per
# sub-model column of pred against the held-out ratings -- TODO confirm.
# theta is later multiplied with pred_t.T to blend the submission predictions.
theta = coeff(pred, realval_testset)

## Prediction

In [70]:
# Stack the submission predictions of every sub-model side by side. The model
# order must match the order used for pred, because theta is applied by
# position.
pred_t = np.concatenate(
    (
        GM_pred,
        UserM_pred,
        ItemM_pred,
        Base_pred,
        Basesgd_pred,
        SVD_pred,
        SVDpp_pred,
        SlopeOne_pred,
        KNNu_pred,
        KNNi_pred,
        NMF_pred,
    ),
    axis=1,
)

In [71]:
# Valid rating range for the submission.
MIN_RATING = 1
MAX_RATING = 5

# Blend the sub-model predictions with the learned weights.
pred_pound = theta @ pred_t.T

# Flatten so that a (1, n) result is handled exactly like a plain (n,) vector.
blended_scores = np.asarray(pred_pound, dtype=float).ravel()

# Fail loudly instead of writing NaN/inf (or garbage integers) to the CSV.
if not np.isfinite(blended_scores).all():
    raise ValueError("blended predictions contain NaN or infinite values")

# Round to the nearest integer (ties go to the even neighbour, like the
# built-in round), clamp to the valid range, and convert to plain Python ints
# so the CSV always contains "3" and never "3.0", whatever the NumPy version.
pred_final = (
    np.clip(np.rint(blended_scores), MIN_RATING, MAX_RATING)
    .astype(int)
    .tolist()
)

## Output

In [72]:
# Write the submission: a header row, then one "<first id>_<second id>" key
# with its predicted rating per entry of predset, in predset order.
with open('submission.csv', 'w', newline='') as out_file:
    submission = csv.writer(out_file)
    submission.writerow(['Id', 'Prediction'])
    submission.writerows(
        [predset[idx][0] + '_' + predset[idx][1], pred_final[idx]]
        for idx in range(len(predset))
    )