In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from surprise import SVD, KNNBasic, KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

# Select seaborn's 'whitegrid' style for the notebook
sns.set_style('whitegrid')

# Load the files

In [None]:
# Load the movie catalogue, keeping only the item id and the title.
# The MovieLens 100k u.item file is ISO-8859-1 encoded (some titles contain
# accented characters), so reading it with the default UTF-8 decoder raises
# UnicodeDecodeError. latin-1 decodes every byte value, so this never fails.
df_movies = (
    pd.read_csv(
        'data/movielens100k/u.item',
        sep='|',
        header=None,
        encoding='latin-1',
    )[[0, 1]]  # column 0 = item id, column 1 = title
    .rename(columns={0: 'item_id', 1: 'movie'})
)
df_movies.head()

In [None]:
# Read the raw ratings table: a tab-separated file without a header row,
# whose columns are named explicitly right after loading.
ratings_path = 'data/movielens100k/u.data'
ratings_columns = ['user_id', 'item_id', 'rating', 'timestamp']

df_ratings = pd.read_csv(ratings_path, sep='\t', header=None)
df_ratings.columns = ratings_columns
df_ratings.head()

In [None]:
# Build the Surprise dataset from the ratings file.
# Each line is parsed as "user item rating timestamp", separated by tabs.
reader = Reader(sep='\t', line_format='user item rating timestamp')
file_path = os.path.expanduser('data/movielens100k/u.data')

# Alternative input (MovieLens 1M): set file_path to
# 'data/movielens1M/ratings.dat' and create the Reader with sep='::'.

data = Dataset.load_from_file(file_path, reader=reader)

# Model Training and Evaluation

In [None]:
# Split train and test: hold out 10% of the ratings.
# The split is seeded so the reported metrics are reproducible after a
# kernel restart instead of changing on every run.
trainset, testset = train_test_split(data, test_size=0.1, shuffle=True, random_state=42)
# Instantiate a model and train
model = KNNBasic()
model.fit(trainset)
# Predict the test set
pred = model.test(testset)
# Evaluate the results (both calls print the metric)
accuracy.rmse(pred)
accuracy.mae(pred)
# Preview a few predictions only; echoing the whole list floods the output
pred[:5]

In [None]:
# define a cross-validation iterator; the folds are seeded so that the
# per-fold scores are reproducible between runs
kf = KFold(n_splits=10, shuffle=True, random_state=42)
model = KNNBasic()

# per-fold scores, kept so they can be summarised after the loop
fold_rmse = []
fold_mae = []
for trainset, testset in kf.split(data):
    # train and test algorithm.
    model.fit(trainset)
    predictions = model.test(testset)
    # evaluate the model (verbose=True prints the metric for this fold)
    fold_rmse.append(accuracy.rmse(predictions, verbose=True))
    fold_mae.append(accuracy.mae(predictions, verbose=True))

# summarise across folds: mean and spread are what the CV is run for
print('mean RMSE: {:.4f} (std {:.4f})'.format(np.mean(fold_rmse), np.std(fold_rmse)))
print('mean MAE:  {:.4f} (std {:.4f})'.format(np.mean(fold_mae), np.std(fold_mae)))

# Generate recommendations

In [None]:
# Choose the recommender explicitly so this cell does not depend on whichever
# `model` object happens to be left over from an earlier cell.
# KNNBaseline() and SVD() can be substituted here to compare algorithms.
model = KNNBasic()

# Train on every available rating before generating recommendations
trainset = data.build_full_trainset()
model.fit(trainset)

In [None]:
# select a random user
user_id = df_ratings.sample(1)['user_id'].values[0]
print('selected user', user_id)

# select the past votes for the random user
# (filter first, then copy: only the user's rows are duplicated, not the whole table)
df_user = df_ratings[df_ratings['user_id'] == user_id].copy()
print(df_user.merge(df_movies, how='left', on='item_id')[['item_id','rating','movie']].sort_values(by='rating', ascending=False))

# select the potential recommendations: every catalogue item the user has not rated
not_voted = set(df_movies['item_id'].values) - set(df_user['item_id'].values)
print('number of potential recommendations', len(not_voted))

# calculate the prediction
# Ids are passed as str(); NOTE(review): this assumes the model was trained on a
# dataset loaded from file, where Surprise keeps raw ids as strings - confirm.
# Iterating in sorted order makes the candidate order deterministic.
pred = [
    (user_id, i, model.predict(str(user_id), str(i)))
    for i in sorted(not_voted)
]

# show the list of recommendations
# `.est` is the estimated rating field of Surprise's Prediction namedtuple
recommendations = [[uid, iid, prediction.est] for uid, iid, prediction in pred]
# passing `columns` up front also works when there are no candidates at all
df_recommendations = pd.DataFrame(recommendations, columns=['user_id', 'item_id', 'prediction'])
# keep the merge key in the same dtype as the catalogue instead of casting to float
df_recommendations['item_id'] = df_recommendations['item_id'].astype(df_movies['item_id'].dtype)
# stable sort: items with identical predicted ratings keep a reproducible order
df_recommendations = df_recommendations.sort_values('prediction', ascending=False, kind='mergesort')

# display only the best candidates; the full ranking stays in df_recommendations
top_n = 10
df_recommendations.merge(df_movies, how='left', on='item_id').head(top_n)