In [2]:
import pandas as pd
import numpy as np

In [15]:
from surprise import Dataset, Reader
from surprise import SVD # implementation of Funk's SVD (gradient descent-based matrix factorization)
from surprise import accuracy # metric
from surprise.model_selection import train_test_split, GridSearchCV #train/test splits, crossval

In [3]:
### ratings.csv

ratings = pd.read_csv('DATA/ratings.csv', index_col=False)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [4]:
### movies.csv

movies = pd.read_csv('DATA/movies.csv', index_col=False)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
mr = pd.merge(ratings, movies, on='movieId', how='outer')
mr.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,1,4.0,1225735000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,1,5.0,835816000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7.0,1,4.0,974518000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10.0,1,3.0,1430666000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12.0,1,5.0,862500700.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [6]:
mr = mr.drop('timestamp', axis=1)
mr

Unnamed: 0,userId,movieId,rating,title,genres
0,1.0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7.0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10.0,1,3.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12.0,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
33835455,47791.0,288967,3.5,State of Siege: Temple Attack (2021),Action|Drama
33835456,98408.0,288971,0.5,Ouija Japan (2021),Action|Horror
33835457,154483.0,288975,4.0,The Men Who Made the Movies: Howard Hawks (1973),Documentary
33835458,291389.0,288977,3.0,Skinford: Death Sentence (2023),Crime|Thriller


In [7]:
mr = mr.dropna()

In [8]:
mr['userId'] = mr['userId'].astype(int)
mr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mr['userId'] = mr['userId'].astype(int)


Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10,1,3.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
33835455,47791,288967,3.5,State of Siege: Temple Attack (2021),Action|Drama
33835456,98408,288971,0.5,Ouija Japan (2021),Action|Horror
33835457,154483,288975,4.0,The Men Who Made the Movies: Howard Hawks (1973),Documentary
33835458,291389,288977,3.0,Skinford: Death Sentence (2023),Crime|Thriller


In [9]:
sample = mr.sample(n=3000000, random_state=1)
sample

Unnamed: 0,userId,movieId,rating,title,genres
33182245,243485,190973,2.5,Hidden Reserves (2017),Drama|Sci-Fi
16176049,71415,3005,4.0,"Bone Collector, The (1999)",Thriller
32084944,171084,138036,4.0,The Man from U.N.C.L.E. (2015),Action|Adventure|Comedy
6082545,256240,805,5.0,"Time to Kill, A (1996)",Drama|Thriller
16125818,93175,2997,3.5,Being John Malkovich (1999),Comedy|Drama|Fantasy
...,...,...,...,...,...
29704749,312288,90645,3.0,Anonymous (2011),Drama
11211111,40095,1704,5.0,Good Will Hunting (1997),Drama|Romance
3233117,22946,357,3.0,Four Weddings and a Funeral (1994),Comedy|Romance
9600547,217248,1307,3.0,When Harry Met Sally... (1989),Comedy|Romance


In [10]:
count = sample['userId'].value_counts()

In [11]:
valid_user = count[count > 150].index

In [12]:
sample_f = sample[sample['userId'].isin(valid_user)]

In [13]:
sample_f['userId'].value_counts()

userId
189614    2949
48766      851
76618      837
207216     817
175998     809
          ... 
9012       151
136471     151
328058     151
278369     151
36774      151
Name: count, Length: 1006, dtype: int64

In [14]:
sample_f.head(20)

Unnamed: 0,userId,movieId,rating,title,genres
18886269,20459,4034,3.5,Traffic (2000),Crime|Drama|Thriller
21129870,10129,5459,3.5,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
30090305,211422,96110,4.0,"Campaign, The (2012)",Comedy
33624911,114227,224869,1.0,Winnie the Pooh: Springtime with Roo (2004),Animation|Children
27288136,189614,59273,3.0,Delirious (2006),Comedy|Drama
31174192,213479,112552,4.5,Whiplash (2014),Drama
11877966,153377,1939,3.0,"Best Years of Our Lives, The (1946)",Drama|War
23038858,166208,7139,5.0,In America (2002),Drama|Romance
24568186,49000,27728,0.5,Ghost in the Shell 2: Innocence (a.k.a. Innoce...,Action|Animation|Drama|Sci-Fi|Thriller
30352038,262716,99415,2.5,Parental Guidance (2012),Comedy


-----------

In [16]:
reader = Reader(rating_scale=(0, 5))  # Define the rating scale
data = Dataset.load_from_df(sample_f[['userId', 'movieId', 'rating']], reader)

In [17]:
trainset, testset = train_test_split(data, test_size=0.2)

In [18]:
svd = SVD(n_epochs=50)

In [19]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10e94dc10>

In [20]:
prediction = svd.test(testset)

In [21]:
rmse = accuracy.rmse(prediction)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.8690
RMSE: 0.8690


In [22]:
print(accuracy.mae(prediction))

MAE:  0.6639
0.6638588532932639


-----------

In [26]:
from collections import defaultdict

# given prediction for a set of users, get the top n ranked for each user 

def get_top_n(prediction, n=15):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in prediction:
        top_n[uid].append((iid, est, true_r))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [27]:
top_n_preds_test = get_top_n(prediction, 15)

In [28]:
top_n_preds_test[49000]

[(52967, 3.536810095254962, 0.5),
 (27801, 2.8569927904146395, 0.5),
 (2459, 2.7604220497595247, 0.5),
 (2011, 2.7033912605502284, 4.0),
 (64969, 2.6258618040959854, 5.0),
 (27846, 2.580654299371722, 2.5),
 (33817, 2.5402929106318632, 0.5),
 (96616, 2.488456151844381, 5.0),
 (77206, 2.4344472267391275, 1.0),
 (106766, 2.413176418719831, 0.5),
 (59336, 2.409697520907153, 0.5),
 (7364, 2.3329270767475476, 2.0),
 (122904, 2.263847665182809, 5.0),
 (40959, 2.2571366398704265, 4.0),
 (55872, 2.2248809387606174, 0.5)]

In [29]:
movie_id_list = np.array(list(zip(*top_n_preds_test[49000]))[0], dtype = 'int')
movie_id_list

array([ 52967,  27801,   2459,   2011,  64969,  27846,  33817,  96616,
        77206, 106766,  59336,   7364, 122904,  40959,  55872])

In [32]:
movies.loc[movie_id_list]['title']

KeyError: '[96616, 106766, 122904] not in index'