In [1]:
#!pip install scikit-surprise

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, Reader
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv("./datasets/rating_final.csv")

data.head()

data.shape

(1161, 5)

In [2]:
data = pd.read_csv("./datasets/rating_final.csv")

In [3]:
data.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [4]:
data.shape

(1161, 5)

In [5]:
data.nunique()

userID            138
placeID           130
rating              3
food_rating         3
service_rating      3
dtype: int64

In [6]:
data.rating.unique()

array([2, 1, 0])

In [17]:
reader = Reader(line_format="user item rating", rating_scale = (0, 2))
#reader = Reader()
#restaurant_data = Dataset.load_from_df(data, reader)

In [20]:
restaurant_data = Dataset.load_from_df(data[['userID','placeID','rating']], reader)

In [21]:
print(restaurant_data)
print(type(restaurant_data))

<surprise.dataset.DatasetAutoFolds object at 0x7f5f3770a8d0>
<class 'surprise.dataset.DatasetAutoFolds'>


In [22]:
restaurant_data.raw_ratings[0:10]

[('U1077', 135085, 2.0, None),
 ('U1077', 135038, 2.0, None),
 ('U1077', 132825, 2.0, None),
 ('U1077', 135060, 1.0, None),
 ('U1068', 135104, 1.0, None),
 ('U1068', 132740, 0.0, None),
 ('U1068', 132663, 1.0, None),
 ('U1068', 132732, 0.0, None),
 ('U1068', 132630, 1.0, None),
 ('U1067', 132584, 2.0, None)]

In [23]:
similarity_parameters = {
    'name' : 'cosine',
    'user_based': True,
    'min_support' : 3
}

In [24]:
from surprise import KNNWithMeans

KNN_Algo = KNNWithMeans(k=3, sim_options = similarity_parameters)

In [25]:
from surprise.model_selection import cross_validate

cross_validate(KNN_Algo, 
               restaurant_data, 
               measures=['RMSE', 'MAE'], 
               cv=5, 
               verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7238  0.7015  0.6796  0.7031  0.7089  0.7034  0.0143  
MAE (testset)     0.5389  0.5168  0.4995  0.5242  0.5191  0.5197  0.0127  
Fit time          0.01    0.01    0.01    0.00    0.00    0.01    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.72380989, 0.70150443, 0.67961246, 0.70314946, 0.70890081]),
 'test_mae': array([0.53891921, 0.51682017, 0.4995451 , 0.52421271, 0.51908552]),
 'fit_time': (0.00899052619934082,
  0.0062868595123291016,
  0.005037546157836914,
  0.004825115203857422,
  0.004695415496826172),
 'test_time': (0.008207082748413086,
  0.005965232849121094,
  0.00599360466003418,
  0.005461215972900391,
  0.005173444747924805)}

In [26]:
# Use full data for training

trainset = restaurant_data.build_full_trainset()

KNN_Algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f5f376be510>

In [27]:
# Getting data points where predictions can be made
testset = trainset.build_anti_testset()

In [28]:
# Making predictions
predictions = KNN_Algo.test(testset)

In [29]:
# Verify few predictions
predictions[0:4]

[Prediction(uid='U1077', iid=135104, r_ui=1.1998277347114557, est=1.4, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='U1077', iid=132740, r_ui=1.1998277347114557, est=1.4, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='U1077', iid=132663, r_ui=1.1998277347114557, est=1.4, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='U1077', iid=132732, r_ui=1.1998277347114557, est=1.4, details={'actual_k': 0, 'was_impossible': False})]

In [30]:
# Fetching top 10 predictions for each user
from collections import defaultdict

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

top_n = get_top_n(predictions, n=10)
take(10, top_n.items())

[('U1077',
  [(135045, 2),
   (132861, 2),
   (135069, 1.8545454545454545),
   (135047, 1.8414473884803486),
   (135079, 1.8267155518855318),
   (135051, 1.6934186173293702),
   (135058, 1.6798990053251242),
   (135062, 1.674811100638013),
   (135053, 1.6307692307692307),
   (135081, 1.6160376805329344)]),
 ('U1068',
  [(132613, 1.1636921554131658),
   (132584, 0.8018777508372117),
   (132667, 0.710920012738908),
   (135085, 0.625),
   (135038, 0.625),
   (132825, 0.625),
   (135060, 0.625),
   (135088, 0.625),
   (132583, 0.625),
   (132665, 0.625)]),
 ('U1067',
  [(132613, 1.6800308183357886),
   (132667, 1.231090336825388),
   (135085, 1.0),
   (135038, 1.0),
   (132825, 1.0),
   (135060, 1.0),
   (135088, 1.0),
   (132583, 1.0),
   (132665, 1.0),
   (132668, 1.0)]),
 ('U1103',
  [(132594, 1.150778034515488),
   (135085, 1.125),
   (135038, 1.125),
   (132825, 1.125),
   (135060, 1.125),
   (135088, 1.125),
   (132583, 1.125),
   (132665, 1.125),
   (132668, 1.125),
   (132715, 1.12

In [31]:
# Printing top predictions
for uid, user_ratings in take(10,top_n.items()):
    print(uid, [iid for (iid, _) in user_ratings])

U1077 [135045, 132861, 135069, 135047, 135079, 135051, 135058, 135062, 135053, 135081]
U1068 [132613, 132584, 132667, 135085, 135038, 132825, 135060, 135088, 132583, 132665]
U1067 [132613, 132667, 135085, 135038, 132825, 135060, 135088, 132583, 132665, 132668]
U1103 [132594, 135085, 135038, 132825, 135060, 135088, 132583, 132665, 132668, 132715]
U1107 [135085, 135038, 132825, 135060, 135104, 132740, 132663, 132732, 132630, 132560]
U1044 [135001, 135085, 135038, 132825, 135060, 135104, 132740, 132663, 132732, 132630]
U1070 [135085, 135038, 132825, 135060, 135104, 132740, 132663, 132732, 132630, 132584]
U1031 [135085, 135038, 132825, 135060, 135104, 132740, 132732, 132630, 132584, 132733]
U1082 [132584, 135085, 135038, 132825, 135060, 135088, 132583, 132665, 132668, 132715]
U1023 [132564, 135085, 135038, 132825, 135060, 135104, 132663, 132732, 132630, 132584]


In [32]:
places = pd.read_csv("./datasets/geoplaces2.csv")

In [35]:
places = places.set_index('placeID')
places.head()

Unnamed: 0_level_0,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,?,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,78280,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
135106,22.149709,-100.976093,0101000020957F0000649D6F21634858C119AE9BF528A3...,El Rinc�n de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,78000,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
132667,23.752697,-99.163359,0101000020957F00005D67BCDDED8157C1222A2DC8D84D...,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,?,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
132613,23.752903,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E...,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none


In [37]:
# Printing top predictions
for uid, user_ratings in take(5,top_n.items()):
    print("For User",uid)
    for  (iid, _) in user_ratings:
        print(iid)
        ids = iid-1
        print(places.loc[iid,"name"])

For User U1077
135045
Restaurante la Gran Via
132861
Carls Jr
135069
Abondance Restaurante Bar
135047
Restaurante Casa de las Flores
135079
Koye Sushi
135051
Restaurante Versalles
135058
Restaurante Tiberius
135062
Restaurante El Cielo Potosino
135053
La Fontana Pizza Restaurante and Cafe
135081
El Club
For User U1068
132613
carnitas_mata
132584
Gorditas Dona Tota
132667
little pizza Emilio Portes Gil
135085
Tortas Locas Hipocampo
135038
Restaurant la Chalita
132825
puesto de tacos
135060
Restaurante Marisco Sam
135088
Cafeteria cenidet
132583
McDonalds Centro
132665
TACOS CORRECAMINOS
For User U1067
132613
carnitas_mata
132667
little pizza Emilio Portes Gil
135085
Tortas Locas Hipocampo
135038
Restaurant la Chalita
132825
puesto de tacos
135060
Restaurante Marisco Sam
135088
Cafeteria cenidet
132583
McDonalds Centro
132665
TACOS CORRECAMINOS
132668
TACOS EL GUERO
For User U1103
132594
tacos de barbacoa enfrente del Tec
135085
Tortas Locas Hipocampo
135038
Restaurant la Chalita
132825


In [38]:
from surprise.model_selection import GridSearchCV

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

jokes_gs = GridSearchCV(KNNWithMeans, 
                  param_grid, 
                  measures=["rmse", "mae"], 
                        cv=3)

jokes_gs.fit(restaurant_data)

print(jokes_gs.best_score["rmse"])
print(jokes_gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [39]:
from surprise import SVD
from surprise import Dataset,accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

# Load the in-built movielens-100k dataset (download it if needed).
#ml_data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
#trainset, testset = train_test_split(ml_data, test_size=.25)

# We'll use the famous SVD algorithm.
SVD_Algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
SVD_Algo.fit(trainset)
predictions = SVD_Algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.3164


0.31639843081516855

In [41]:
%%time
import random                                                              
                                                                           
# Load your full dataset.                                                  
#ml_data = Dataset.load_builtin('ml-100k')                                     
raw_ratings = restaurant_data.raw_ratings                                             
                                                                           
# shuffle ratings if you want                                              
random.shuffle(raw_ratings)                                                
                                                                           
# 90% trainset, 10% testset                                                
threshold = int(.9 * len(raw_ratings))                                     
trainset_raw_ratings = raw_ratings[:threshold]                             
test_raw_ratings = raw_ratings[threshold:]                                 
                                                                           
restaurant_data.raw_ratings = trainset_raw_ratings  # data is now your trainset                                                           
                                                                           
# Select your best algo with grid search. Verbosity is buggy, I'll fix it. 
print('GRID SEARCH BEGIN...')                                                    
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}

movie_gs = GridSearchCV(SVD, 
                        param_grid, 
                        measures=["rmse", "mae"], 
                        cv=3)

movie_gs.fit(restaurant_data)
print('GRID SEARCH END...')                                                    

GRID SEARCH BEGIN...
GRID SEARCH END...
CPU times: user 499 ms, sys: 0 ns, total: 499 ms
Wall time: 500 ms


In [43]:
ml_final = movie_gs.best_estimator['rmse']                                  
                                                                           
# retrain on the whole train set                                           
trainset = restaurant_data.build_full_trainset()                                      
ml_final.fit(trainset)                                                       
                                                                           
# now test on the trainset                                                 
testset = restaurant_data.construct_testset(trainset_raw_ratings)                     
predictions = ml_final.test(testset)                                           
print('Accuracy on the trainset:')                                         
accuracy.rmse(predictions)                                                 
                                                                           
# now test on the testset                                                  
testset = restaurant_data.construct_testset(test_raw_ratings)                         
predictions = ml_final.test(testset)                                           
print('Accuracy on the testset:')                                          
accuracy.rmse(predictions)

Accuracy on the trainset:
RMSE: 0.6045
Accuracy on the testset:
RMSE: 0.7167


0.7167323824678885

In [44]:
predictions[0:10]

[Prediction(uid='U1037', iid=132830, r_ui=2.0, est=1.0232639862964414, details={'was_impossible': False}),
 Prediction(uid='U1103', iid=132732, r_ui=0.0, est=1.0081680068496939, details={'was_impossible': False}),
 Prediction(uid='U1085', iid=132825, r_ui=2.0, est=1.1841977075148524, details={'was_impossible': False}),
 Prediction(uid='U1109', iid=135058, r_ui=2.0, est=1.2807392200801475, details={'was_impossible': False}),
 Prediction(uid='U1118', iid=134992, r_ui=0.0, est=1.0324341632740563, details={'was_impossible': False}),
 Prediction(uid='U1125', iid=135076, r_ui=1.0, est=1.1240896663944595, details={'was_impossible': False}),
 Prediction(uid='U1084', iid=132723, r_ui=1.0, est=1.61421210183861, details={'was_impossible': False}),
 Prediction(uid='U1104', iid=135062, r_ui=1.0, est=1.248258360750619, details={'was_impossible': False}),
 Prediction(uid='U1029', iid=135047, r_ui=1.0, est=1.0689887941001297, details={'was_impossible': False}),
 Prediction(uid='U1111', iid=135108, r_u

In [45]:
top_n = get_top_n(predictions, n=10)
take(10, top_n.items())

[('U1037', [(132830, 1.0232639862964414)]),
 ('U1103', [(132732, 1.0081680068496939)]),
 ('U1085', [(135055, 1.2859905256234676), (132825, 1.1841977075148524)]),
 ('U1109', [(135028, 1.482193744170713), (135058, 1.2807392200801475)]),
 ('U1118', [(134992, 1.0324341632740563)]),
 ('U1125', [(135032, 1.1770070473255694), (135076, 1.1240896663944595)]),
 ('U1084', [(132723, 1.61421210183861)]),
 ('U1104', [(132825, 1.269697709390583), (135062, 1.248258360750619)]),
 ('U1029', [(135047, 1.0689887941001297)]),
 ('U1111', [(135108, 1.1357055746053981), (135071, 1.054959889776967)])]

In [47]:
# Printing top predictions
for uid, user_ratings in take(5,top_n.items()):
    print("For User",uid)
    for  (iid, _) in user_ratings:
        print(iid)
        ids = iid-1
        print(places.loc[iid,"name"])

For User U1037
132830
Rincon Huasteco
For User U1103
132732
Taqueria EL amigo 
For User U1085
135055
la Cochinita Pibil Restaurante Yucateco
132825
puesto de tacos
For User U1109
135028
La Virreina
135058
Restaurante Tiberius
For User U1118
134992
Restaurant Teely
