In [1]:
from collections import defaultdict
import pandas as pd
from surprise import Reader, Dataset
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate

In [2]:
class MyAlgo02():
    '''KNN-with-means recommender built on the Surprise library.

    Similarities are cosine similarities computed between items
    (``user_based`` is False in ``sim_options``).

    Typical call order:
        1. ``MyAlgo02(path)``      - load ratings and build the training set
        2. ``set_k(k)``            - create and fit the model
        3. ``predict_ratings()``   - estimate every rating missing from training
        4. ``recs_for_user(uid)``  - top-N items for one user
    '''

    def __init__(self, rating_data=''):
        '''Optionally load a ratings file and build the full training set.

        Args:
            rating_data(str): Path to a comma-separated file whose lines are
                ``user,item,rating,timestamp``. When empty, nothing is loaded
                and the data-dependent methods raise ``AttributeError``.
        '''
        # Defined unconditionally so that every instance exposes the same
        # attributes, whether or not a ratings file was given.
        self.sim_options = {'name': 'cosine', 'user_based': False}
        self.ratings = None
        self.trainset = None
        self.algo = None
        self.testset = None
        self.predictions = None

        if rating_data:
            reader = Reader(line_format='user item rating timestamp', sep=',')
            self.ratings = Dataset.load_from_file(rating_data, reader)
            # Train on every known rating; no hold-out split is made here.
            self.trainset = self.ratings.build_full_trainset()

    def _require(self, attr, hint):
        '''Return ``self.<attr>`` or raise a descriptive AttributeError if unset.'''
        value = getattr(self, attr, None)
        if value is None:
            raise AttributeError(
                "'{}' is not available: {}".format(attr, hint))
        return value

    def set_k(self, k_value):
        '''Create a ``KNNWithMeans`` model with ``k_value`` neighbours and fit it.

        Args:
            k_value(int): Value passed as ``k`` to ``KNNWithMeans``.

        Raises:
            AttributeError: If no ratings were loaded at construction time.
        '''
        trainset = self._require(
            'trainset', 'construct MyAlgo02 with a rating_data path first')
        self.algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
        self.algo.fit(trainset)

    def find_best_k(self, k_values=(2, 3, 5, 10, 20, 30, 40), cv=3):
        '''Cross-validate the model for several values of k and print the results.

        Args:
            k_values(iterable of int): Candidate neighbourhood sizes. Defaults
                to the grid 2, 3, 5, 10, 20, 30, 40.
            cv(int): Value passed as ``cv`` to ``cross_validate``. Default is 3.

        Raises:
            AttributeError: If no ratings were loaded at construction time.
        '''
        ratings = self._require(
            'ratings', 'construct MyAlgo02 with a rating_data path first')
        for k_value in k_values:
            print('K = {}'.format(k_value))
            algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
            cross_validate(algo, ratings, measures=['RMSE', 'MAE'], cv=cv, verbose=True)
            print('\n\n')

    def get_top_n(self, predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of at most n entries,
            ordered from highest to lowest estimation.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, _true_r, est, _details in predictions:
            top_n[uid].append((iid, est))

        # Then keep only the n highest estimations per user. The sort is
        # stable, so ties keep the order in which predictions were given.
        for uid, user_ratings in top_n.items():
            top_n[uid] = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]

        return top_n

    def predict_ratings(self):
        '''Predict ratings for all pairs (u, i) that are NOT in the training set.

        The anti-testset is stored in ``self.testset`` and the resulting
        predictions in ``self.predictions``.

        Raises:
            AttributeError: If no ratings were loaded or ``set_k`` was not called.
        '''
        trainset = self._require(
            'trainset', 'construct MyAlgo02 with a rating_data path first')
        algo = self._require('algo', 'call set_k() first')
        self.testset = trainset.build_anti_testset()
        self.predictions = algo.test(self.testset)

    def recs_for_user(self, uid, n=10):
        '''Return the top-N recommendations for a single user.

        Prints the number of predictions available for that user before
        ranking them.

        Args:
            uid: User id; it is compared to the prediction ids as ``str(uid)``.
            n(int): Number of recommendations to keep. Default is 10.

        Returns:
        The dict produced by ``get_top_n`` restricted to this user (empty when
        the user has no predictions).

        Raises:
            AttributeError: If ``predict_ratings`` was not called.
        '''
        predictions = self._require('predictions', 'call predict_ratings() first')
        wanted_uid = str(uid)
        user_filtered = [pred for pred in predictions if pred.uid == wanted_uid]
        print(len(user_filtered))
        return self.get_top_n(predictions=user_filtered, n=n)

In [3]:
# Build the recommender from the "ml-latest-small" ratings file.
bla = MyAlgo02(rating_data='ml-latest-small/ratings.csv')
# Echo the similarity configuration set by the constructor.
bla.sim_options

{'name': 'cosine', 'user_based': False}

In [4]:
# Create the KNNWithMeans model with k=10 and fit it on the full training set.
bla.set_k(k_value=10)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [5]:
# Estimate a rating for every (user, item) pair that is absent from the
# training set, then report how many estimates were produced.
bla.predict_ratings()
len(bla.predictions)

5830804

In [6]:
# Top-10 recommendations for user 2. The call also prints how many
# candidate predictions exist for that user.
my_recs = bla.recs_for_user(uid=2)
my_recs

9695


defaultdict(list,
            {'2': [('5746', 5),
              ('6835', 5),
              ('3851', 5),
              ('1631', 5),
              ('2075', 5),
              ('176601', 5),
              ('67618', 5),
              ('53', 5),
              ('1140', 5),
              ('3795', 5)]})

In [7]:
# Read item (movie) info.
movies = pd.read_csv('ml-latest-small/movies.csv', low_memory=False)
# Sanity check: select the row for movieId 4 and show the value in its
# second column.
my_movie = movies[movies['movieId'] == 4]
my_movie.iloc[0, 1]

'Waiting to Exhale (1995)'

In [16]:
# Print the recommended items for each user, resolving every movie id to the
# value in the second column of `movies`.
# The estimate is bound to a real name (`est`) instead of `_`: at notebook top
# level `_` is IPython's "last output" variable and should not be overwritten.
for uid, user_ratings in my_recs.items():
    print(uid)
    for iid, est in user_ratings:
        # Prediction ids are strings, while `movieId` is compared as an int.
        title = movies.loc[movies['movieId'] == int(iid)].values[0][1]
        print("Title: {}, PREDICTED score: {}".format(title, est))

2
Title: Galaxy of Terror (Quest) (1981), PREDICTED score: 5
Title: Alien Contamination (1980), PREDICTED score: 5
Title: I'm the One That I Want (2000), PREDICTED score: 5
Title: Assignment, The (1997), PREDICTED score: 5
Title: Mephisto (1981), PREDICTED score: 5
Title: Black Mirror, PREDICTED score: 5
Title: Strictly Sexual (2008), PREDICTED score: 5
Title: Lamerica (1994), PREDICTED score: 5
Title: Entertaining Angels: The Dorothy Day Story (1996), PREDICTED score: 5
Title: Five Senses, The (1999), PREDICTED score: 5


In [17]:
# Load the ratings file into pandas. Column names are supplied explicitly via
# `names=`, so the file is presumably header-less — TODO confirm.
metadata = pd.read_csv('ml-latest-small/ratings.csv', low_memory=False, names=['userId', 'movieId', 'rating', 'timestamp'])

In [18]:
# Distinct user ids and movie ids that appear in the ratings data.
user_ids = metadata['userId'].unique()
movie_ids = metadata['movieId'].unique()

In [19]:
# User x movie rating matrix: one row per userId, one column per movieId.
# Pairs with no rating are filled with 0 (`fill_value`), so 0 acts as the
# "not rated" placeholder in this frame.
dataframe = metadata.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
    fill_value=0,
)
dataframe.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [49]:
# Restrict the rating matrix to one group of five users (rows) and three
# movies (columns); 0.0 cells are pairs with no rating.
group_sparse_mtx = dataframe.loc[[77,596,442,243,420],[1, 110, 480]]
group_sparse_mtx

movieId,1,110,480
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
77,0.0,0.0,0.0
596,4.0,0.0,0.0
442,0.0,0.0,0.0
243,0.0,0.0,0.0
420,4.0,0.0,3.5


In [28]:
# All predictions made for user '77' on item '110' (ids are strings in the
# prediction objects).
pred_77_110 = [
    pred for pred in bla.predictions
    if pred.uid == '77' and pred.iid == '110'
]
pred_77_110

[Prediction(uid='2', iid='5746', r_ui=3.501556983616962, est=5, details={'actual_k': 0, 'was_impossible': False})]

In [50]:
# Work on a copy so that filling in predicted ratings leaves
# `group_sparse_mtx` unchanged.
group_perf_mtx = group_sparse_mtx.copy()
group_perf_mtx

movieId,1,110,480
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
77,0.0,0.0,0.0
596,4.0,0.0,0.0
442,0.0,0.0,0.0
243,0.0,0.0,0.0
420,4.0,0.0,3.5


In [51]:
# Replace every 0.0 cell (the "not rated" placeholder) with the model's
# estimated rating for that (user, movie) pair.
missing_cells = [
    (user_id, movie_id)
    for user_id in group_perf_mtx.index
    for movie_id in group_perf_mtx.columns
    if group_perf_mtx.loc[user_id, movie_id] == 0.0
]
# Prediction objects carry ids as strings, so look them up by str(id).
wanted_keys = {(str(user_id), str(movie_id)) for user_id, movie_id in missing_cells}

# Single pass over the predictions instead of one full scan per missing cell.
# The first prediction found for a pair wins; stop once all pairs are resolved.
estimates = {}
if wanted_keys:
    for pred in bla.predictions:
        key = (pred.uid, pred.iid)
        if key in wanted_keys and key not in estimates:
            estimates[key] = pred.est
            if len(estimates) == len(wanted_keys):
                break

# Fail loudly, naming the pairs, rather than with a bare index error.
unresolved = sorted(wanted_keys - estimates.keys())
if unresolved:
    raise LookupError(
        'No prediction found for (user, movie) pairs: {}'.format(unresolved))

for user_id, movie_id in missing_cells:
    group_perf_mtx.loc[user_id, movie_id] = estimates[(str(user_id), str(movie_id))]

group_perf_mtx
        

movieId,1,110,480
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
77,4.113485,3.913911,4.066868
596,4.0,4.130257,3.941667
442,1.435873,1.853554,1.509079
243,3.915623,4.161877,4.224056
420,4.0,4.102919,3.5


In [83]:
# Empty single-row frame that will hold one aggregated score per movie for the
# whole group. 12345 is the row label (presumably an arbitrary group id —
# TODO confirm).
group_perf = pd.DataFrame(index=[12345], columns=[1, 110, 480])
group_perf

Unnamed: 0,1,110,480
12345,,,


In [70]:
# Lowest score in the third movie column (position 2) across the group.
my_col = list(group_perf_mtx.iloc[:, 2])
lowest_score = float(min(my_col))
print(lowest_score)

1.5090788639272277


In [92]:
# For every movie column, store the lowest score found among the group's
# users in the single row of `group_perf`.
for i in range(group_perf_mtx.shape[1]):
    my_col = list(group_perf_mtx.iloc[:, i])
    group_perf.iloc[0, i] = float(min(my_col))

In [93]:
# Show the group's aggregated (minimum) score per movie.
group_perf

Unnamed: 0,1,110,480
12345,1.43587,1.85355,1.50908
