##### Trial recommendation

In [1]:
# import statements

import pandas as pd
import numpy as np

In [2]:
from surprise import Dataset, Reader
from surprise import SVD # implementation of gradient descent-based matrix factorization
from surprise import accuracy  # metric
from surprise.model_selection import train_test_split, GridSearchCV #train/test splits, crossval

----------------------

In [3]:
# read in file

data = pd.read_csv('DATA/ratings.csv', index_col=False)
data.sample(n=5, random_state=1)

Unnamed: 0,userId,movieId,rating,timestamp
33179850,324684,3898,1.5,1144760809
16176049,158793,922,4.0,1450700720
32084054,313695,2423,4.0,1133582037
6082545,59202,5218,3.0,1033884899
16125818,158269,1779,2.0,1660913035


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [5]:
dataNew = data.sample(n=100000, random_state=1)

In [6]:
dataNew.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 33179850 to 21265524
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100000 non-null  int64  
 1   movieId    100000 non-null  int64  
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.8 MB


----------------------

In [19]:
reader = Reader(rating_scale=(1,5), sep='::')
reader # will be used to parse the ratings textfile

<surprise.reader.Reader at 0x1255d4f50>

In [20]:
data_gen = Dataset.load_from_df(dataNew[['userId', 'movieId', 'rating']], reader=reader)
data_gen

<surprise.dataset.DatasetAutoFolds at 0x124228620>

In [21]:
trainset, testset = train_test_split(data_gen, test_size=.2, random_state = 42)

In [22]:
svd = SVD(reg_all = .05, lr_all = 0.0025, n_factors = 100, n_epochs = 30)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1255d6db0>

In [23]:
predictions = svd.test(testset)

In [24]:
print(accuracy.mae(predictions))

MAE:  0.7664
0.7664339603032462


In [25]:
predictions[0:6]

[Prediction(uid=320615, iid=457, r_ui=4.0, est=3.9540836992993413, details={'was_impossible': False}),
 Prediction(uid=85899, iid=48516, r_ui=4.0, est=3.950555839265697, details={'was_impossible': False}),
 Prediction(uid=16521, iid=4896, r_ui=3.5, est=3.339700773291092, details={'was_impossible': False}),
 Prediction(uid=199688, iid=67255, r_ui=4.5, est=3.6143024644502937, details={'was_impossible': False}),
 Prediction(uid=274004, iid=1672, r_ui=4.0, est=3.7014455327460327, details={'was_impossible': False}),
 Prediction(uid=276446, iid=2792, r_ui=4.0, est=3.2139501037568623, details={'was_impossible': False})]

In [26]:
accuracy.fcp(predictions)

FCP:  0.4865


0.486509000810877

In [27]:
from collections import defaultdict

# given prediction for a set of users, get the top n ranked for each user 

def get_top_n(predictions, n=15):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [28]:
top_n_preds_test = get_top_n(predictions, 15)

In [29]:
top_n_preds_test['10']

[]

In [30]:
movie_id_list = np.array(
    list(zip(*top_n_preds_test['10']))[0], dtype = 'int')
movie_id_list

IndexError: list index out of range

In [None]:
movie_metadata.loc[movie_id_list]['movie_name']