In [1]:
!pip install surprise



In [2]:
import json
import time
import numpy as np
import pandas as pd
from surprise import AlgoBase
from surprise import NormalPredictor
from surprise import PredictionImpossible
from surprise import KNNBasic
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [3]:
start = time.time()
with open('MA_users.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    user_data = json.load(fin)
end = time.time()
duration_without_dr = end-start
print("Time taken to load user data: %d seconds" %duration_without_dr)


Reading MA_users.json
Time taken to load user data: 5 seconds


In [4]:
start = time.time()
with open('MA_restaurants.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    restaurant_data = json.load(fin)
end = time.time()
duration_without_dr = end-start
print("Time taken to load restaurant data: %d seconds" %duration_without_dr)

Reading MA_restaurants.json
Time taken to load restaurant data: 0 seconds


In [5]:
start = time.time()
with open('MA_reviews.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    review_data = json.load(fin)
end = time.time()
duration_without_dr = end-start
print("Time taken to load review data: %d seconds" %duration_without_dr)

Reading MA_reviews.json
Time taken to load review data: 12 seconds


In [6]:
print(str(len(user_data)) + ' user records loaded')
print(str(len(review_data)) + ' review records loaded')
print(str(len(restaurant_data)) + ' business records loaded')

125521 user records loaded
914710 review records loaded
10550 business records loaded


In [7]:
print(user_data[0].keys(),'\n')
print(review_data[0].keys(),'\n')
print(restaurant_data[0].keys(),'\n')

dict_keys(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos']) 

dict_keys(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']) 

dict_keys(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']) 



Sample User

In [8]:
for attr in user_data[0].keys():
    print('{0}: {1}'.format(str(attr), str(user_data[0][attr]) if attr != 'friends' else '[list of user ids]'))

user_id: q_QQ5kBBwlCcbL1s4NVK3g
name: Jane
review_count: 1220
yelping_since: 2005-03-14 20:26:35
useful: 15038
funny: 10030
cool: 11291
elite: 2006,2007,2008,2009,2010,2011,2012,2013,2014
friends: [list of user ids]
fans: 1357
average_stars: 3.85
compliment_hot: 1710
compliment_more: 163
compliment_profile: 190
compliment_cute: 361
compliment_list: 147
compliment_note: 1212
compliment_plain: 5691
compliment_cool: 2541
compliment_funny: 2541
compliment_writer: 815
compliment_photos: 323


Sample review

In [9]:
for attr in review_data[0].keys():
    print('{0}: {1}'.format(str(attr), str(review_data[0][attr])))

review_id: lWC-xP3rd6obsecCYsGZRg
user_id: ak0TdVmGKo4pwqdJSTLwWw
business_id: buF9druCkbuXLX526sGELQ
stars: 4.0
useful: 3
funny: 1
cool: 1
text: Apparently Prides Osteria had a rough summer as evidenced by the almost empty dining room at 6:30 on a Friday night. However new blood in the kitchen seems to have revitalized the food from other customers recent visits. Waitstaff was warm but unobtrusive. By 8 pm or so when we left the bar was full and the dining room was much more lively than it had been. Perhaps Beverly residents prefer a later seating. 

After reading the mixed reviews of late I was a little tentative over our choice but luckily there was nothing to worry about in the food department. We started with the fried dough, burrata and prosciutto which were all lovely. Then although they don't offer half portions of pasta we each ordered the entree size and split them. We chose the tagliatelle bolognese and a four cheese filled pasta in a creamy sauce with bacon, asparagus and g

Create a 2d arary of user IDs and buisness IDs, with ratings(stars) as frequencies

In [6]:
user_id_dict = dict()
for i,u in enumerate(user_data):
    user_id_dict[u['user_id']] = i

business_id_dict = dict()
for i,r in enumerate(restaurant_data):
    business_id_dict[r['business_id']] = i

rating_mat = np.zeros((len(user_data), len(restaurant_data)))
rating_dict = {'user_id': [],
               'business_id': [],
               'rating': []}

for review in review_data:
    user_idx = user_id_dict[review['user_id']]
    business_idx = business_id_dict[review['business_id']]
    rating_mat[user_idx][business_idx] = review['stars']
    
    rating_dict['user_id'].append(review['user_id'])
    rating_dict['business_id'].append(review['business_id'])
    rating_dict['rating'].append(review['stars'])

print(rating_mat.shape)
print('number of non-zero elements(good ratings): ' +str(np.count_nonzero(rating_mat)))

(125521, 10550)
number of non-zero elements(good ratings): 879189


In [7]:
df = pd.DataFrame(rating_dict)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)

In [16]:
sim_options = {'name': 'cosine',
               'user_based': False
               }
trainset = data.build_full_trainset()
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x24985f59100>

Predition for known review (sample review)

In [17]:
# get a prediction for specific users and items
uid = review_data[0]['user_id']
iid = review_data[0]['business_id']

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: ak0TdVmGKo4pwqdJSTLwWw item: buF9druCkbuXLX526sGELQ r_ui = 4.00   est = 4.40   {'actual_k': 5, 'was_impossible': False}


## Algorithm and Parameter Tuning

In [8]:
from surprise.model_selection import GridSearchCV

In [10]:
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False]
}
param_grid = {"sim_options": sim_options}

In [27]:
grid_search = GridSearchCV(KNNBasic, param_grid, measures = ["rmse", "mae"], cv=3)
grid_search.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [28]:
print(grid_search.best_score["rmse"])
print(grid_search.best_params["rmse"])

print(grid_search.best_score["mae"])
print(grid_search.best_params["mae"])

1.2798358150920215
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}
0.9857407519946314
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}


In [11]:
from surprise import KNNWithMeans

grid_search = GridSearchCV(KNNWithMeans, param_grid, measures = ["rmse", "mae"], cv=3)
grid_search.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [12]:
print(grid_search.best_score["rmse"])
print(grid_search.best_params["rmse"])

print(grid_search.best_score["mae"])
print(grid_search.best_params["mae"])

1.1990062079134562
{'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}
0.9268730123521696
{'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}


In [14]:
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [True]
}
param_grid = {"sim_options": sim_options}

grid_search = GridSearchCV(KNNBasic, param_grid, measures = ["rmse", "mae"], cv=3)
grid_search.fit(data)

Computing the msd similarity matrix...


MemoryError: Unable to allocate 86.4 GiB for an array with shape (107666, 107666) and data type float64