In [1]:
# Collaborative Filtering

In [2]:
# Similar users can used to predict how much a user like a product never used before

In [3]:
# Model-based CF filtering

In [4]:
# use Surprise library with SVD and min RMSE 

In [5]:
# https://surprise.readthedocs.io/en/stable/getting_started.html
# https://surprise.readthedocs.io/en/stable/FAQ.html#raw-inner-note

In [6]:
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import accuracy

In [7]:
import pandas as pd
import numpy as np

# import datasets - 100k small
MovieLens_movies = pd.read_csv('movies.csv')
ML_ratings = pd.read_csv('ratings.csv')
ML_links = pd.read_csv('links.csv')
ML_tags = pd.read_csv('tags.csv')

In [8]:
reader = Reader()

In [9]:
# ML rating - users ratings of movices
ML_ratings.head(3).append(ML_ratings.tail(3)) 

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [10]:
# Load the movielens-100k dataset 
data = Dataset.load_from_df(ML_ratings[['userId', 'movieId', 'rating']], reader)

In [11]:
svd = SVD()   # using SVD algorithm

# Run 5-fold cross-validation and print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8720  0.8777  0.8718  0.8703  0.8775  0.8739  0.0031  
MAE (testset)     0.6668  0.6751  0.6691  0.6703  0.6735  0.6710  0.0030  
Fit time          7.41    7.05    7.59    7.90    8.49    7.69    0.49    
Test time         0.19    0.17    0.19    0.19    0.20    0.19    0.01    


{'test_rmse': array([0.87202472, 0.87772171, 0.87180455, 0.87033014, 0.87750568]),
 'test_mae': array([0.66680929, 0.67514787, 0.6690874 , 0.67034057, 0.67351582]),
 'fit_time': (7.405597925186157,
  7.050730228424072,
  7.594558477401733,
  7.9014551639556885,
  8.490288496017456),
 'test_time': (0.18693828582763672,
  0.1749434471130371,
  0.1879410743713379,
  0.1939399242401123,
  0.20493412017822266)}

In [12]:
# we got a RMSE of 0.8812, which is good enough
# to predict, we train on the full dataset

In [13]:
# Retrieve the trainset - train on full set
trainset = data.build_full_trainset()  
# Build an algorithm, and train it.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21a0b2bf388>

In [14]:
ML_ratings[ML_ratings['userId'] == 29]

Unnamed: 0,userId,movieId,rating,timestamp
4764,29,50,3.5,1308007653
4765,29,150,4.0,1362016794
4766,29,165,4.0,1362016840
4767,29,170,3.0,1307905797
4768,29,296,3.5,1362016789
...,...,...,...,...
4840,29,98961,4.5,1362016565
4841,29,99114,4.5,1362016549
4842,29,104841,4.0,1405816277
4843,29,111362,4.5,1405816130


In [15]:
# get a prediction for specific users and items.
svd.predict(29, 165, 4)

Prediction(uid=29, iid=165, r_ui=4, est=4.094294351795714, details={'was_impossible': False})

In [63]:
# revised from ttps://surprise.readthedocs.io/en/stable/FAQ.html#raw-inner-note

from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
#data = Dataset.load_builtin('ml-100k')
data = Dataset.load_from_df(ML_ratings[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
svd = SVD()
svd.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = svd.test(testset)

top_n = get_top_n(predictions, n=10)

l = []
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    l.append((uid, [iid for (iid, _) in user_ratings]))

In [64]:
top10_recommendations = pd.DataFrame(l, columns=['user id', 'movie ids'])
top10_recommendations

Unnamed: 0,user id,movie ids
0,1,"[318, 1704, 48516, 58559, 899, 904, 908, 912, ..."
1,2,"[1197, 1215, 1193, 1387, 57669, 1225, 1172, 95..."
2,3,"[56782, 1204, 58559, 1266, 7451, 1213, 2360, 3..."
3,4,"[318, 74458, 2289, 48516, 1223, 115713, 922, 6..."
4,5,"[1204, 1104, 1276, 3275, 1217, 56782, 898, 142..."
...,...,...
605,606,"[1104, 1283, 1276, 1262, 246, 741, 1197, 1233,..."
606,607,"[3451, 177593, 2160, 6711, 1035, 1704, 48516, ..."
607,608,"[78499, 3468, 46578, 48516, 933, 4973, 1266, 4..."
608,609,"[1204, 50, 475, 908, 1197, 1223, 293, 1178, 12..."


In [65]:
def top10(id):
    ml = top10_recommendations.loc[top10_recommendations['user id'] == int(id), 'movie ids'].values[0]
    return MovieLens_movies[MovieLens_movies['movieId'].isin(ml)]

In [66]:
top10(1)  # top 10 predicted movies based for user id 1

Unnamed: 0,movieId,title,genres
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
681,899,Singin' in the Rain (1952),Comedy|Musical|Romance
686,904,Rear Window (1954),Mystery|Thriller
690,908,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller
694,912,Casablanca (1942),Drama|Romance
965,1266,Unforgiven (1992),Drama|Western
1284,1704,Good Will Hunting (1997),Drama|Romance
2623,3508,"Outlaw Josey Wales, The (1976)",Action|Adventure|Drama|Thriller|Western
6315,48516,"Departed, The (2006)",Crime|Drama|Thriller
6710,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX


In [60]:
#MovieLens_movies[MovieLens_movies['movieId'].isin(ml)]

In [61]:
#top10_recommendations.loc[top10_recommendations['user id'] == 1, 'movie ids'].values[0]