In [1]:
from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import KFold
from surprise.model_selection import PredefinedKFold
from surprise import accuracy
from scipy.stats.stats import pearsonr

import os
import csv
import numpy as np
from statistics import mean 

In [2]:
# Load dataset
# 

files_dir = os.path.expanduser('dataset/')
reader = Reader(line_format='user item rating', sep=',')
train_file = files_dir + 'train-50-%d.csv'
test_file = files_dir + 'test-50-%d.csv'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

In [3]:
svd = SVD(random_state=123)

In [4]:
k = 10

In [5]:
#Start Training

i = 1
for trainset, testset in pkf.split(data): 
    # create test matrix
    test_matrix = np.zeros((6040, 4000))
    for row in testset:
        test_matrix[int(row[0])-1, int(row[1])-1] = int(row[2])
    
    # train and test algorithm.
    svd.fit(trainset)
    
    # create recommendations' container
    recs = np.zeros((len(svd.trainset.all_users()), len(svd.trainset.all_items())))
    
    # make pearson correlation among users
    corr = np.corrcoef(svd.pu)
    
    # sort the correlations, get the sorted ids
    sim_score_user_indices = np.argsort(-corr)
    
    # for each user, compute recommendation ranking
    for user_id in range(len(test_matrix)):
        # check if user is available
        if svd.trainset.knows_user(user_id):
        
            # get 10 most similar users to user target
            sim_user_ids = sim_score_user_indices[user_id][1:k + 1]

            # create similar users container
            similar_users = np.empty([1, len(svd.trainset.all_items())])

            # compute similar users' rating values
            for sim_user_id in sim_user_ids:
                similar_users = np.append(similar_users, [svd.trainset.global_mean + svd.bu[sim_user_id] + svd.bi + np.dot(svd.qi, svd.pu[sim_user_id])], axis=0)
            similar_users = np.delete(similar_users, 0, axis=0)

            # compute scores of each user
            scores = np.tile(np.arange(0, len(svd.trainset.all_items()))[::-1], (10,1)) - np.tile(np.arange(0, len(svd.trainset.all_items())), (10,1))
            for index, value in enumerate(np.argsort(-similar_users)):
                scores[index] = scores[index, value]

            # compute recommendation ranking by aggregating then sorting the scores
            recs[user_id] = np.argsort(-np.sum(scores, axis=0))
        else:
            all_users = (np.dot(svd.pu, svd.qi.transpose()).transpose() + svd.bu).transpose() + svd.bi + svd.trainset.global_mean
            scores = np.tile(np.arange(0,len(svd.trainset.all_items()))[::-1], (len(svd.trainset.all_users()),1)) - np.tile(np.arange(0,len(svd.trainset.all_items())), (len(svd.trainset.all_users()),1))
            for index, value in enumerate(np.argsort(-all_users)):
                scores[index] = scores[index, value]
                
            # compute recommendation ranking by aggregating then sorting the scores
            recs[user_id] = np.argsort(-np.sum(scores, axis=0))
    # evaluate
    testscores = np.tile(np.arange(1, 4001)[::-1], (6040,1))
    for index, _ in enumerate(testscores):
        testscores[index] = testscores[index, np.argsort(np.argsort(-test_matrix[index]))]

    ndcgs = []
    for index, value in enumerate(recs):
        rank = []
        for item in value[:10]:
            rank.append(int(svd.trainset.to_raw_iid(item)))
        dcg = np.sum(testscores[int(svd.trainset.to_raw_uid(index)), rank] / np.log2(np.arange(1,11) + 1))
        idcg = np.sum(-np.sort(-testscores[int(svd.trainset.to_raw_uid(index)), rank]) / np.log2(np.arange(1,11) + 1))
        ndcgs.append(dcg/idcg)

    print("ndcgs fold-{} = {}".format(i, np.mean(ndcgs)))

IndexError: index 6040 is out of bounds for axis 0 with size 6040