In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import random
import json
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
sns.set()

In [13]:
from surprise import SVD, NMF, NormalPredictor,KNNBasic
from surprise import accuracy, Reader, Dataset, dump 
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [8]:
with open('vegas_reviews.pkl','rb') as f:
    vegas_reviews = pickle.load(f)

bLbSNkLggFnqwNNzzq-Ijw    1117
PKEzKWv_FktMm2mGPjwd0Q     893
U4INQZOPSUaj8hMjLlZ3KA     870
UYcmGbelzRa0Q6JqzLoguw     604
n86B7IkbU20AkxlFX_5aew     541
                          ... 
-xDW3gYiYaoeVASXywTPgw     186
53bZ_EsXH71L7iFs5MP9_w     185
F_5_UNX-wrAFCXuAkBZRDw     184
0hZfE2He1YssM_wEvKhfcA     183
LwVUO_5jjHuJRhvl6hlHVg     182
Name: user_id, Length: 100, dtype: int64

In [5]:
unique_users = vegas_reviews['user_id'].unique()
unique_restaurants = vegas_reviews['business_id'].unique()

In [6]:
user_dict = {unique_users[i]:i + 1 for i in range(len(unique_users))}
user_rest = {unique_restaurants[i]:i + 1 for i in range(len(unique_restaurants))}

## We need the user_id, business_id and the stars columns to work with the Surprise library

In [14]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(vegas_reviews[['user_id', 'business_id', 'stars']], reader)

In [9]:
# We'll use three algorithms here and check which one of these performs better
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), NMF(), NormalPredictor()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],index=['Algorithm']))
    benchmark.append(tmp)

In [10]:
pd.DataFrame(benchmark).set_index('Algorithm')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.18973,46.609256,4.210331
NMF,1.350399,66.50614,3.521168
NormalPredictor,1.684775,1.490442,3.496053


### SVD seems to be doing the best job of the three! We'll use this for the rest!

In [11]:
param_grid = {'n_factors': [ 30, 35], 'n_epochs': [ 20, 25], 'lr_all': [0.001, 0.003],
              'reg_all': [0.08, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

In [12]:
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.170404160000227
{'n_factors': 30, 'n_epochs': 25, 'lr_all': 0.003, 'reg_all': 0.1}


In [14]:
# Build the whole dataset as a training set

In [15]:
# Retrieve the trainset.
trainset = data.build_full_trainset()

In [16]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd65a8b1438>

In [10]:
# Dump algorithm and reload it.
file_name = os.path.expanduser('dump_file')
dump.dump(file_name, algo=algo)


NameError: name 'algo' is not defined

In [11]:
_, loaded_algo = dump.load(file_name)

In [72]:
# Randomly select 10 USER Ids
unique_user_ids = vegas_reviews['user_id'].unique()
select_user_list = random.choices(unique_user_ids,k = 10)


['bLbSNkLggFnqwNNzzq-Ijw',
 'tH0uKD-vNwMoEc3Xk3Cbdg',
 'YE54kKTuqJJPNYWIKIpOEQ',
 'o68A_3bAJTge-ioGXHdxKA',
 'DoRCeCcJbrsM2BiAKj3trA',
 'Mfmx6DSGOrJjr4jeWmqCuw',
 'vWP8-aQX0rYJszqkWVL7Ew',
 'wXZS42D0L8hoCiSh2Krc_A',
 'NrpzAH3EoNhWUR8OysUhBQ',
 'lQGJcwX105k17081f6pulg']

In [9]:
custom = vegas_reviews['user_id'].value_counts()[:100].index.tolist()
custom_user_list = [custom[i] for i in range(0,len(custom),10)]
custom_user_list

['bLbSNkLggFnqwNNzzq-Ijw',
 'B1829_hxXSEpDPEDJtYeIw',
 'keBv05MsMFBd0Hu98vXThQ',
 '_7PfR6Tvh2xTbiVi1GELoQ',
 'WM7MFrRP-7YFuGBAJ6quRQ',
 'rt3PC7WCgCKsoufmQJELfw',
 'aWXGwath_dSGPwNdFiy9BQ',
 '5wT0vDycEzgq95yVCZ60Aw',
 'Q7IoE5m2heQKThuVq3SYFA',
 'RQlnSCjuqMnhR3Qk6j4KoA']

In [15]:
vegas_small = vegas_reviews[vegas_reviews['user_id'].isin(custom_user_list)]
data_small = Dataset.load_from_df(vegas_small[['user_id', 'business_id', 'stars']], reader)

In [16]:
# Retrieve the smaller trainset.
trainset_small = data_small.build_full_trainset()

In [17]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset_small.build_anti_testset()
#predictions = loaded_algo.test(testset)
loaded_predictions = loaded_algo.test(testset)

In [18]:
# Function for the top-10 recommendations for a user
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [20]:
#Top 10 recommendations for the randomly chosen 10 user ids
top_n = get_top_n(loaded_predictions, n=10)
for i in range(len(custom_user_list)):
    
    usr_id = custom_user_list[i] 
    top_recommendations = top_n[usr_id]
    print(f'For the user id {usr_id}:')
    for rest in top_recommendations:
        print(f'Rest ID: {rest[0]:3}| Rating: {rest[1]:3}')
    print('------'*12)
        

For the user id bLbSNkLggFnqwNNzzq-Ijw:
Rest ID: 3pSUr_cdrphurO6m1HMP9A| Rating: 4.168396808600322
Rest ID: nKph91qATrPCbJ-QwZjDZw| Rating: 4.106289634554573
Rest ID: OMRYQihVjqqzjoNoQgyhgQ| Rating: 4.070132479173685
Rest ID: Ef5P6C2yHAv08FPif5Rdtg| Rating: 4.053395792888882
Rest ID: sVjL9DQ8hYW_pS-jprYLgg| Rating: 4.017729613410831
Rest ID: aS1qJzTGyluRT2tJ09Jbaw| Rating: 4.001651665811226
Rest ID: 296PZdxSrtH08EUwCsOKMw| Rating: 3.990121864528656
Rest ID: l3joBBpkq0ib11dKUpKMAw| Rating: 3.9874022498712947
Rest ID: SEMvJDVnDB4NoPW6tXXO_Q| Rating: 3.958284504633101
Rest ID: cZO6RmJcqRY9jGbYY71z-A| Rating: 3.9389509845958353
------------------------------------------------------------------------
For the user id B1829_hxXSEpDPEDJtYeIw:
Rest ID: vOMDU31gdylrzBhAKC9QbA| Rating: 4.392461230617754
Rest ID: B-9IuVXitKKwXj0fXUxczQ| Rating: 4.388845494036167
Rest ID: G4hjhtA_wQ-tSOGpgGlDjw| Rating: 4.381008218239597
Rest ID: cePE3rCuUOVSCCAHSjWxoQ| Rating: 4.363797002689774
Rest ID: cOqd9ou3uf

In [21]:
# Dumping the top 10 user recommendations in a json file
with open('rec.json', 'w') as fp:
    json.dump(top_n, fp)

## All the top predictions for a select-few have been saved in rec.json