In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from CustomSVD import CustomSVD
from CustomSVD import Hypothesis1

In [2]:
a = np.load('./preprocessed/collaborative_input/train_STC.npy')
b = np.load('./preprocessed/collaborative_input/test_STC.npy')
train_sample = pd.read_pickle('./preprocessed/collaborative_input/new_train_sample.pkl')
test_sample = pd.read_pickle('./preprocessed/collaborative_input/new_test_sample.pkl')
train_sample['STC'] = a
test_sample['STC'] = b
from sklearn.model_selection import train_test_split

val_sample, test_sample = train_test_split(test_sample, test_size=0.5, random_state=42)

In [11]:
print(f"The length of the training data: {len(train_sample)}")
print(f"The length of the validation data: {len(val_sample)}")
print(f"The length of the testing data: {len(test_sample)}")

The length of the training data: 88116
The length of the validation data: 12356
The length of the testing data: 12357


In [3]:
import math
#custom grid search for CustomKNN classes
def grid_search(model, params_list):
    lowest_mse = math.inf
    lowest_params = []
    if(model is CustomSVD):
        m = model(train_sample, 'reviewerID', 'movieID', 'overall')
    else:
        m = model(train_sample, 'reviewerID', 'movieID', 'overall', 'STC')
        
    for d in params_list:
        m.train(**d)
        predictions = m.predict(val_sample)
        val_loss = mean_squared_error(val_sample['overall'],predictions)
        if(val_loss < lowest_mse):
            lowest_mse = val_loss
            lowest_params = d
    return lowest_params

In [4]:
grid_search(CustomSVD, [{"n_epochs":20,"learning_rate":0.005,"regularization_parameter":0.02},{"n_epochs":25,"learning_rate":0.005,"regularization_parameter":0.02},{"n_epochs":20,"learning_rate":0.01,"regularization_parameter":0.02},{"n_epochs":25,"learning_rate":0.01,"regularization_parameter":0.02}])

Epoch 0, training mse: 1.4131732627684541
Epoch 1, training mse: 1.2751305974993763
Epoch 2, training mse: 1.1777969896033063
Epoch 3, training mse: 1.1017225828368848
Epoch 4, training mse: 1.039197737879077
Epoch 5, training mse: 0.9860238463808805
Epoch 6, training mse: 0.9396157366545961
Epoch 7, training mse: 0.8982680571582023
Epoch 8, training mse: 0.860803701275353
Epoch 9, training mse: 0.8263774204542927
Epoch 10, training mse: 0.794365656123357
Epoch 11, training mse: 0.7642958237264782
Epoch 12, training mse: 0.7358048458336237
Epoch 13, training mse: 0.7086118232390218
Epoch 14, training mse: 0.6824979351803626
Epoch 15, training mse: 0.6572928743670662
Epoch 16, training mse: 0.6328653026156255
Epoch 17, training mse: 0.609115667148962
Epoch 18, training mse: 0.5859705071126333
Epoch 19, training mse: 0.5633784597096526
Epoch 0, training mse: 0.5413059064903455
Epoch 1, training mse: 0.5197337130471211
Epoch 2, training mse: 0.49865472030700875
Epoch 3, training mse: 0.47

{'n_epochs': 20, 'learning_rate': 0.005, 'regularization_parameter': 0.02}

In [5]:
grid_search(Hypothesis1, [{"n_epochs":20,"learning_rate":0.005,"regularization_parameter":0.02},{"n_epochs":25,"learning_rate":0.005,"regularization_parameter":0.02},{"n_epochs":20,"learning_rate":0.01,"regularization_parameter":0.02},{"n_epochs":25,"learning_rate":0.01,"regularization_parameter":0.02}])

Epoch 0, training mse: 1.4029889573325638
Epoch 1, training mse: 1.2915865290684596
Epoch 2, training mse: 1.2168872049711887
Epoch 3, training mse: 1.1609876606331584
Epoch 4, training mse: 1.1170032708343507
Epoch 5, training mse: 1.0812418683032474
Epoch 6, training mse: 1.0514564431277176
Epoch 7, training mse: 1.026181212547586
Epoch 8, training mse: 1.0044138748313902
Epoch 9, training mse: 0.9854423564363405
Epoch 10, training mse: 0.968743645494522
Epoch 11, training mse: 0.9539216268190273
Epoch 12, training mse: 0.9406717644317594
Epoch 13, training mse: 0.9287542876010118
Epoch 14, training mse: 0.9179790577040001
Epoch 15, training mse: 0.908191167484785
Epoch 16, training mse: 0.8992641359869505
Epoch 17, training mse: 0.8910923226219276
Epoch 18, training mse: 0.8835871162881156
Epoch 19, training mse: 0.8766733213647099
Epoch 0, training mse: 0.8702864099058525
Epoch 1, training mse: 0.8643710803825885
Epoch 2, training mse: 0.8588803538997156
Epoch 3, training mse: 0.85

{'n_epochs': 20, 'learning_rate': 0.005, 'regularization_parameter': 0.02}

Final Test MSE based on hyperparams found through grid search on val data.

In [7]:
svd = CustomSVD(train_sample, 'reviewerID', 'movieID', 'overall')
svd.train(n_epochs=20, learning_rate=0.005, regularization_parameter=0.02)
predictions = svd.predict(test_sample)
mean_squared_error(test_sample['overall'].to_numpy(),predictions)

Epoch 0, training mse: 1.4119780040804284
Epoch 1, training mse: 1.2740364346541702
Epoch 2, training mse: 1.1768367680365022
Epoch 3, training mse: 1.1008816071088952
Epoch 4, training mse: 1.038459283023134
Epoch 5, training mse: 0.9853678007011002
Epoch 6, training mse: 0.9390255460627237
Epoch 7, training mse: 0.897730713521161
Epoch 8, training mse: 0.8603080494714159
Epoch 9, training mse: 0.8259143534141451
Epoch 10, training mse: 0.7939269136788257
Epoch 11, training mse: 0.76387606814935
Epoch 12, training mse: 0.7354008784978711
Epoch 13, training mse: 0.7082219995909327
Epoch 14, training mse: 0.6821218132627246
Epoch 15, training mse: 0.6569314691834406
Epoch 16, training mse: 0.6325209451731799
Epoch 17, training mse: 0.608791973494372
Epoch 18, training mse: 0.5856722644906754
Epoch 19, training mse: 0.5631111075209523


1.1661065135510786

In [8]:
svd = Hypothesis1(train_sample, 'reviewerID', 'movieID', 'overall','STC')
svd.train(n_epochs=20, learning_rate=0.005, regularization_parameter=0.02)
predictions = svd.predict(test_sample)
mean_squared_error(test_sample['overall'].to_numpy(),predictions)

Epoch 0, training mse: 1.4029634836930367
Epoch 1, training mse: 1.2915883971866933
Epoch 2, training mse: 1.2168885694609868
Epoch 3, training mse: 1.1609886058854417
Epoch 4, training mse: 1.1170039418305604
Epoch 5, training mse: 1.0812423608511894
Epoch 6, training mse: 1.0514568151279997
Epoch 7, training mse: 1.02618150194233
Epoch 8, training mse: 1.004414105000013
Epoch 9, training mse: 0.9854425438027135
Epoch 10, training mse: 0.9687437999251627
Epoch 11, training mse: 0.9539217547416774
Epoch 12, training mse: 0.9406718740505935
Epoch 13, training mse: 0.928754379117605
Epoch 14, training mse: 0.9179791341826377
Epoch 15, training mse: 0.9081912325828618
Epoch 16, training mse: 0.8992641899574788
Epoch 17, training mse: 0.8910923677490135
Epoch 18, training mse: 0.8835871544264196
Epoch 19, training mse: 0.8766733527786923


1.151145451787785

In [None]:
mean_squared_error(test_sample['overall'].to_numpy(),predictions)

In [None]:
h1 = Hypothesis1(train_sample, 'reviewerID', 'movieID', 'overall', 'STC')

In [None]:
h1.train()

In [None]:
predictions_h1 = h1.predict(test_sample)

In [None]:
mean_squared_error(test_sample['overall'],predictions_h1)

In [9]:
'''
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import SVD'''

In [10]:
'''
reader = Reader(rating_scale=(1, 5))
# Loads Pandas dataframe
train_data = Dataset.load_from_df(train_sample[["reviewerID", "movieID", "overall"]], reader)
trainset = train_data.build_full_trainset()

algo3 = SVD(biased=True, random_state=42)
algo3.fit(trainset)

test_data = Dataset.load_from_df(test_sample[['reviewerID','movieID','overall']], reader)
NA, test = train_test_split(test_data, test_size=1.0)
predictions3 = algo3.test(test)
accuracy.mse(predictions3)'''

MSE: 1.1736


1.1735943646079383