In [10]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
import random
from surprise.model_selection import GridSearchCV

# Seed NumPy's global generator as well as Python's `random`: seeding `random`
# alone leaves NumPy-driven randomness unseeded (Surprise's reproducibility FAQ
# seeds both), so results would differ from run to run.
SEED = 48
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# Column names assigned to the three columns that are read from each file.
RATING_COLUMNS = ["userID", "itemID", "rating"]
# Keys must be the names listed in RATING_COLUMNS; a key that matches no
# column has no effect, which would leave that column at its inferred dtype.
RATING_DTYPES = {"userID": "int32", "itemID": "int32", "rating": "float32"}


def load_ratings(path):
    """Read one tab-separated, header-less ratings file into a DataFrame.

    Only the first three columns of the file are kept; they are named
    ``userID``, ``itemID`` and ``rating`` and cast to the dtypes in
    ``RATING_DTYPES``.

    Parameters
    ----------
    path : str
        Location of the ratings file.

    Returns
    -------
    pandas.DataFrame
        One row per rating with columns ``userID``, ``itemID``, ``rating``.
    """
    return pd.read_csv(path, sep="\t", header=None, engine="python",
                       usecols=[0, 1, 2], names=RATING_COLUMNS,
                       dtype=RATING_DTYPES)


#loading training data
train_df = load_ratings("data/ml-100k/u1.base")

#loading test data
test_df = load_ratings("data/ml-100k/u1.test")

In [6]:
# Declare the rating scale as 1 to 5 for Surprise.
reader = Reader(rating_scale=(1, 5))

# Wrap the training frame in a Surprise Dataset and build a trainset from all
# of its rows (the test ratings come from their own frame, so nothing is held out).
train_dataset = Dataset.load_from_df(train_df, reader)
trainset = train_dataset.build_full_trainset()

# One plain (userID, itemID, rating) tuple per row of the test frame.
testset = list(test_df.itertuples(index=False, name=None))

### Tuning for SVD algorithm.

In [15]:
# Candidate SVD hyperparameters; the grid search tries every combination.
param_grid = {
    "n_factors": [50, 100, 150],
    "n_epochs": [20, 30],
    "lr_all": [0.005, 0.01],
    "reg_all": [0.02, 0.1],
}

# Score each combination by RMSE using 3-fold cross-validation on the training data.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(train_dataset)

# Combination that the search reports as best for the RMSE measure.
params = gs.best_params["rmse"]

In [20]:
# Build an SVD model from the grid-search winner, with biased=False.
# `params` holds exactly the four keys of the search grid
# (n_factors, n_epochs, lr_all, reg_all), so it can be unpacked directly.
# NOTE(review): the grid search did not set `biased`, so it tuned SVD with the
# library default — confirm these values are meant to be reused with biased=False.
svdtuned = SVD(biased=False, **params)

In [16]:
# Show the hyperparameter combination taken from the grid search's best RMSE result.
params

{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}

In [21]:
# Fit the tuned SVD on the full training set, then predict the test ratings.
svdtuned.fit(trainset)
predictions = svdtuned.test(testset)

# accuracy.rmse prints the score (verbose) and also returns it.
error = accuracy.rmse(predictions, verbose=True)

RMSE: 0.9394


In [None]:
# Test-set RMSE of the tuned SVD with biased=True:  0.9268
# Test-set RMSE of the tuned SVD with biased=False: 0.9394

We will be using SVD for both scenarios with the following parameters:<br>
*n_factors* = 150, *n_epochs* = 30, *lr_all* = 0.01, *reg_all* = 0.1, *biased* = True

### Choosing the best kNN algorithm

In [22]:
# (label, algorithm class) pairs for every kNN variant being compared.
knn_list = [
    ("basic", KNNBasic),
    ("means", KNNWithMeans),
    ("z score", KNNWithZScore),
    ("baseline", KNNBaseline),
]

In [25]:
#all variations of kNN here are item-based
for name, algorithm in knn_list:
    # user_based=False: cosine similarities are computed between items.
    knn = algorithm(verbose=False, sim_options={"name": "cosine", "user_based": False})
    knn.fit(trainset)
    predictions = knn.test(testset)
    # verbose=False so the labelled line below is the only report per variant
    # (otherwise accuracy.rmse prints its own unlabelled "RMSE: ..." line first).
    error = accuracy.rmse(predictions, verbose=False)
    # ".4f" fixes four decimal places; a bare ".4" means four significant
    # digits, which printed 1.0491 as 1.049 and 0.9540 as 0.954.
    print("{} with rmse: {:.4f}".format(name, error))

RMSE: 1.0491
basic with rmse: 1.049
RMSE: 0.9540
means with rmse: 0.954
RMSE: 0.9559
z score with rmse: 0.9559
RMSE: 0.9578
baseline with rmse: 0.9578


In [24]:
#all variations are user-based
for name, algorithm in knn_list:
    # user_based=True: cosine similarities are computed between users.
    knn = algorithm(verbose=False, sim_options={"name": "cosine", "user_based": True})
    knn.fit(trainset)
    predictions = knn.test(testset)
    # verbose=False so the labelled line below is the only report per variant
    # (otherwise accuracy.rmse prints its own unlabelled "RMSE: ..." line first).
    error = accuracy.rmse(predictions, verbose=False)
    # ".4f" fixes four decimal places; a bare ".4" means four significant
    # digits, which would print e.g. 1.0229 as 1.023.
    print("{} with rmse: {:.4f}".format(name, error))

RMSE: 1.0229
basic with rmse: 1.023
RMSE: 0.9703
means with rmse: 0.9703
RMSE: 0.9687
z score with rmse: 0.9687
RMSE: 0.9462
baseline with rmse: 0.9462


The *user-based* KNNBaseline performed best among the kNN variants, with RMSE = 0.9462.