In [1]:
import pandas as pd
import numpy as np
import time
from surprise import SVD, Reader, Dataset, accuracy, dump
from surprise.model_selection import GridSearchCV, train_test_split

In [2]:
# Load the training ratings, discard rows that have no rating, and keep one
# row per (user_id, movieid) pair. Rows are ordered by rating (descending)
# before de-duplication, so the highest rating of each pair is the one kept.
df_train = (
    pd.read_csv('../Data/train.csv')
    .dropna(subset=['rating'])
    .sort_values('rating', ascending=False)
    .drop_duplicates(['user_id', 'movieid'])
)
df_train.head()

Unnamed: 0,user_id,movieid,watch_time,rating
3421634,792595,the+great+train+robbery+1903,9,5.0
2369885,232684,aliens+1986,99,5.0
530027,656313,winged+migration+2001,99,5.0
530026,656281,too+many+cooks+2014,9,5.0
1147582,746802,dodsworth+1936,99,5.0


In [3]:
# Load the validation ratings, discard rows that have no rating, and keep one
# row per (user_id, movieid) pair. Rows are ordered by rating (descending)
# before de-duplication, so the highest rating of each pair is the one kept.
df_val = (
    pd.read_csv('../Data/val.csv')
    .dropna(subset=['rating'])
    .sort_values('rating', ascending=False)
    .drop_duplicates(['user_id', 'movieid'])
)
df_val.head()

Unnamed: 0,user_id,movieid,rating
348803,190010,midnight+run+1988,5
276806,829519,alien+1979,5
276820,83418,excuse+me+for+living+2012,5
56491,634490,harry+potter+and+the+deathly+hallows+part+2+2011,5
106203,864830,about+elly+2009,5


In [4]:
# Columns handed to Surprise, in the order used throughout this notebook.
rating_columns = ['user_id', 'movieid', 'rating']

# One Reader (ratings on a 1-5 scale) is shared by every dataset built below.
reader = Reader(rating_scale=(1, 5))

# Training split -> trainset that the models are fitted on.
train_data = (
    Dataset.load_from_df(df_train[rating_columns], reader)
    .build_full_trainset()
)

# Validation split -> testset, the form later passed to model.test().
val_data = (
    Dataset.load_from_df(df_val[rating_columns], reader)
    .build_full_trainset()
    .build_testset()
)

In [5]:
# Exhaustive search over SVD hyper-parameters. Every (n_epochs, lr_all, reg_all)
# combination is fitted on the training split and scored by RMSE on the
# validation split; the lowest-RMSE combination is kept for the cells below.
# NOTE(review): in the recorded run the winner (30, 0.001, 0.4) lies on the
# edge of this grid on all three axes - consider widening the ranges.
hyperparams_setting = [
    (n_epochs, lr_all, reg_all)
    for n_epochs in [10, 20, 30]
    for lr_all in [0.001, 0.002, 0.005]
    for reg_all in [0.4, 0.6, 0.8]
]

lowest_err, best_hyperparams_setting = np.inf, None
for candidate in hyperparams_setting:
    n_epochs, lr_all, reg_all = candidate
    # Fixed random_state so that every candidate is trained with the same seed.
    model = SVD(n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all, random_state=42)
    model.fit(train_data)
    # accuracy.rmse also prints one "RMSE: ..." line per candidate.
    rmse = accuracy.rmse(model.test(val_data))
    # Strict comparison: on a tie the earlier candidate is retained.
    if rmse < lowest_err:
        lowest_err, best_hyperparams_setting = rmse, candidate

RMSE: 0.7377
RMSE: 0.7393
RMSE: 0.7404
RMSE: 0.7558
RMSE: 0.7548
RMSE: 0.7538
RMSE: 0.7899
RMSE: 0.7817
RMSE: 0.7753
RMSE: 0.7277
RMSE: 0.7298
RMSE: 0.7314
RMSE: 0.7430
RMSE: 0.7427
RMSE: 0.7426
RMSE: 0.7700
RMSE: 0.7639
RMSE: 0.7593
RMSE: 0.7209
RMSE: 0.7234
RMSE: 0.7254
RMSE: 0.7341
RMSE: 0.7345
RMSE: 0.7350
RMSE: 0.7567
RMSE: 0.7524
RMSE: 0.7492


In [6]:
# Show the (n_epochs, lr_all, reg_all) tuple that achieved the lowest
# validation RMSE in the grid search.
best_hyperparams_setting 

(30, 0.001, 0.4)

In [10]:
# Show the lowest validation RMSE found by the grid search.
lowest_err

0.7209394985039652

In [7]:
# Re-fit SVD on the training split with the winning grid-search setting.
best_n_epochs, best_lr_all, best_reg_all = best_hyperparams_setting
model = SVD(
    n_epochs=best_n_epochs,
    lr_all=best_lr_all,
    reg_all=best_reg_all,
    random_state=42,
)
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10ed7dcc0>

# Train the final model on the full data (training + validation)

In [8]:
# Build the final training set from the training and validation splits together.
# Each split was de-duplicated on (user_id, movieid) on its own, so a pair that
# occurs in both splits would otherwise reach the model twice. Drop those
# repeats here: with the default keep='first' the row from df_train wins,
# because df_train comes first in the concatenation. When the splits share no
# pair this step changes nothing.
# Only the three columns Surprise needs are concatenated, which avoids carrying
# along columns that exist in just one of the splits.
full_rating_columns = ['user_id', 'movieid', 'rating']
df_full = (
    pd.concat(
        [df_train[full_rating_columns], df_val[full_rating_columns]],
        ignore_index=True,
    )
    .drop_duplicates(['user_id', 'movieid'])
)
full_train_set = Dataset.load_from_df(df_full, reader).build_full_trainset()

In [9]:
%%time
# Train the model
model = SVD(n_epochs = best_hyperparams_setting[0],
            lr_all = best_hyperparams_setting[1],
            reg_all = best_hyperparams_setting[2],
            random_state = 42)
model.fit(full_train_set)

CPU times: user 39.7 s, sys: 729 ms, total: 40.5 s
Wall time: 40.8 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10ed7f880>

In [None]:
# Persist the fitted model to the file 'SVD_V1' (relative to the current
# working directory) using Surprise's dump helper.
dump.dump(file_name='SVD_V1', algo=model)