In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from surprise import SVD, KNNBasic, KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

sns.set_style('whitegrid')

# Load the files

In [2]:
# load the movies file
df_movies = pd.read_csv('data/movielens100k/u.item', sep='|', header=None)
df_movies = df_movies[[0,1]]
df_movies.columns = ['item_id', 'movie']
df_movies.head()

Unnamed: 0,item_id,movie
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [3]:
# load the ratings file
df_ratings = pd.read_csv('data/movielens100k/u.data', sep='\t', header=None)
df_ratings.columns = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# load the ratings file as a dataset

file_path = os.path.expanduser('data/movielens100k/u.data')
reader = Reader(line_format='user item rating timestamp', sep='\t')
# file_path = os.path.expanduser('data/movielens1M/ratings.dat')
# reader = Reader(line_format='user item rating timestamp', sep='::')

data = Dataset.load_from_file(file_path, reader=reader)

# Model Training and Evaluation

In [5]:
# Split train and test
trainset, testset = train_test_split(data, test_size=0.1, shuffle=True)
# Instantiate a model and train
model = KNNBasic()
model.fit(trainset)
# Predict the test set
pred = model.test(testset)
# Evaluate the results
accuracy.rmse(pred)
accuracy.mae(pred)
pred

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9851
MAE:  0.7738


[Prediction(uid='936', iid='286', r_ui=5.0, est=3.684865964409235, details={u'actual_k': 40, u'was_impossible': False}),
 Prediction(uid='655', iid='1101', r_ui=2.0, est=3.6886702454989977, details={u'actual_k': 40, u'was_impossible': False}),
 Prediction(uid='78', iid='813', r_ui=2.0, est=4.086467101635114, details={u'actual_k': 40, u'was_impossible': False}),
 Prediction(uid='505', iid='584', r_ui=4.0, est=3.431897394034035, details={u'actual_k': 40, u'was_impossible': False}),
 Prediction(uid='286', iid='1053', r_ui=4.0, est=3.4862672954764324, details={u'actual_k': 21, u'was_impossible': False}),
 Prediction(uid='661', iid='272', r_ui=4.0, est=4.12016165482806, details={u'actual_k': 40, u'was_impossible': False}),
 Prediction(uid='826', iid='187', r_ui=4.0, est=4.680189560452186, details={u'actual_k': 40, u'was_impossible': False}),
 Prediction(uid='846', iid='1069', r_ui=4.0, est=3.401908849726036, details={u'actual_k': 16, u'was_impossible': False}),
 Prediction(uid='264', iid='2

In [7]:
# define a cross-validation iterator
kf = KFold(n_splits=10, shuffle=True)
model = KNNBasic()
for trainset, testset in kf.split(data):
    # train and test algorithm.
    model.fit(trainset)
    predictions = model.test(testset)
    # evaluate the model
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9523
MAE:  0.7505
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9664
MAE:  0.7614
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9733
MAE:  0.7683
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9790
MAE:  0.7724
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9766
MAE:  0.7741
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9741
MAE:  0.7715
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9648
MAE:  0.7606
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9801
MAE:  0.7734
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9842
MAE:  0.7757
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9745
MAE:  0.7683


# Generate recommendations

In [9]:
#model = KNNBasic()
#model = KNNBaseline()
#model = SVD()
trainset = data.build_full_trainset()
model.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x11176ce10>

In [10]:
# select a random user
user_id = df_ratings.sample(1)['user_id'].values[0]
print('selected user', user_id)

# select the past votes for the random user
df_user = df_ratings.copy()[df_ratings['user_id']==user_id]
print(df_user.merge(df_movies, how='left', on='item_id')[['item_id','rating','movie']].sort_values(by='rating', ascending=False))

# select the potential recommendations
not_voted = set(df_movies['item_id'].values)-set(df_user['item_id'].values)
print('number of potential recommendations', len(not_voted))

# calculate the prediction
pred = []
for i in list(not_voted):
    pred.append((user_id, i, model.predict(str(user_id), str(i))))
    
# show the list of recommendations
recommendations = []
for uid, iid, est in pred:
    recommendations.append([uid, iid, est[3]])
df_recommendations = pd.DataFrame(recommendations)
df_recommendations.columns = ['user_id', 'item_id', 'prediction']
df_recommendations['item_id'] = df_recommendations['item_id'].astype(float)
df_recommendations = df_recommendations.sort_values('prediction', ascending=False)
df_recommendations.merge(df_movies, how='left', on='item_id')

('selected user', 653)
     item_id  rating                                              movie
184      531       5                                       Shine (1996)
16        83       5                      Much Ado About Nothing (1993)
98       195       5                             Terminator, The (1984)
113      746       5                                 Real Genius (1985)
90       127       5                              Godfather, The (1972)
211      191       5                                     Amadeus (1984)
244      333       5                                   Game, The (1997)
142       56       5                                Pulp Fiction (1994)
145       89       5                                Blade Runner (1982)
49        22       5                                  Braveheart (1995)
47       748       5                                  Saint, The (1997)
46       239       5                                    Sneakers (1992)
255      188       5                     

Unnamed: 0,user_id,item_id,prediction,movie
0,653,1536,4.618933,Aiqing wansui (1994)
1,653,814,4.568083,"Great Day in Harlem, A (1994)"
2,653,1599,4.293890,Someone Else's America (1995)
3,653,1500,4.285672,Santa with Muscles (1996)
4,653,1467,4.254932,"Saint of Fort Washington, The (1993)"
5,653,1653,4.194240,Entertaining Angels: The Dorothy Day Story (1996)
6,653,1642,4.092436,Some Mother's Son (1996)
7,653,1449,4.087327,Pather Panchali (1955)
8,653,1398,3.994015,Anna (1996)
9,653,1122,3.941311,They Made Me a Criminal (1939)
