In [2]:
!pip install surprise



In [3]:
import json
import time
from collections import defaultdict

import numpy as np
import pandas as pd
from surprise import AlgoBase
from surprise import Dataset
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import NormalPredictor
from surprise import PredictionImpossible
from surprise import Reader
from surprise import SVD
from surprise import dump
from surprise.model_selection import cross_validate

In [4]:
# Parse the users JSON file into user_data and report the elapsed wall-clock time.
load_started = time.time()
with open('MA_users.json', encoding="utf8") as users_file:
    print('Reading', users_file.name)
    user_data = json.loads(users_file.read())
duration_without_dr = time.time() - load_started
print("Time taken to load user data: %d seconds" % duration_without_dr)


Reading MA_users.json
Time taken to load user data: 1 seconds


In [5]:
# Parse the restaurants JSON file into restaurant_data and report the elapsed wall-clock time.
load_started = time.time()
with open('MA_restaurants.json', encoding="utf8") as restaurants_file:
    print('Reading', restaurants_file.name)
    restaurant_data = json.loads(restaurants_file.read())
duration_without_dr = time.time() - load_started
print("Time taken to load restaurant data: %d seconds" % duration_without_dr)

Reading MA_restaurants.json
Time taken to load restaurant data: 0 seconds


In [6]:
# Parse the reviews JSON file into review_data and report the elapsed wall-clock time.
load_started = time.time()
with open('MA_reviews.json', encoding="utf8") as reviews_file:
    print('Reading', reviews_file.name)
    review_data = json.loads(reviews_file.read())
duration_without_dr = time.time() - load_started
print("Time taken to load review data: %d seconds" % duration_without_dr)

Reading MA_reviews.json
Time taken to load review data: 5 seconds


In [7]:
# Report how many records each of the three loaded collections holds.
for records, label in ((user_data, ' user records loaded'),
                       (review_data, ' review records loaded'),
                       (restaurant_data, ' business records loaded')):
    print(str(len(records)) + label)

125521 user records loaded
914710 review records loaded
10550 business records loaded


In [8]:
# Show the field names of the first record of each collection, one blank line apart.
for sample_record in (user_data[0], review_data[0], restaurant_data[0]):
    print(sample_record.keys(), '\n')

dict_keys(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos']) 

dict_keys(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']) 

dict_keys(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']) 



Sample User

In [55]:
# Map each user_id to the position of its record inside user_data.
user_id_dict = {user['user_id']: position for position, user in enumerate(user_data)}

def print_user_info(user_id):
    """Print every attribute of one user as 'attribute: value' lines.

    The record is located through user_id_dict and read from user_data.
    The 'friends' value is not printed; a fixed placeholder is shown instead.
    Raises KeyError when user_id is not a key of user_id_dict.
    """
    record = user_data[user_id_dict[user_id]]
    for attr, value in record.items():
        shown = '[list of user ids]' if attr == 'friends' else str(value)
        print('{0}: {1}'.format(str(attr), shown))


print_user_info('nl8gWLDo6U6MjqzbBmE_9A')

user_id: nl8gWLDo6U6MjqzbBmE_9A
name: Vivian
review_count: 1871
yelping_since: 2008-08-04 15:36:54
useful: 3624
funny: 1018
cool: 1477
elite: 2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,20,20
friends: [list of user ids]
fans: 195
average_stars: 3.57
compliment_hot: 55
compliment_more: 12
compliment_profile: 1
compliment_cute: 2
compliment_list: 10
compliment_note: 47
compliment_plain: 210
compliment_cool: 113
compliment_funny: 113
compliment_writer: 59
compliment_photos: 62


Sample review

In [10]:
# Print the first review, one 'attribute: value' line per field.
first_review = review_data[0]
for attr, value in first_review.items():
    print('{0}: {1}'.format(str(attr), str(value)))

review_id: lWC-xP3rd6obsecCYsGZRg
user_id: ak0TdVmGKo4pwqdJSTLwWw
business_id: buF9druCkbuXLX526sGELQ
stars: 4.0
useful: 3
funny: 1
cool: 1
text: Apparently Prides Osteria had a rough summer as evidenced by the almost empty dining room at 6:30 on a Friday night. However new blood in the kitchen seems to have revitalized the food from other customers recent visits. Waitstaff was warm but unobtrusive. By 8 pm or so when we left the bar was full and the dining room was much more lively than it had been. Perhaps Beverly residents prefer a later seating. 

After reading the mixed reviews of late I was a little tentative over our choice but luckily there was nothing to worry about in the food department. We started with the fried dough, burrata and prosciutto which were all lovely. Then although they don't offer half portions of pasta we each ordered the entree size and split them. We chose the tagliatelle bolognese and a four cheese filled pasta in a creamy sauce with bacon, asparagus and g

Sample business

In [56]:
# Map each business_id to the position of its record inside restaurant_data.
business_id_dict = {
    restaurant['business_id']: position
    for position, restaurant in enumerate(restaurant_data)
}

def print_business_info(business_id):
    """Print every attribute of one business as 'attribute: value' lines.

    The record is located through business_id_dict and read from restaurant_data.
    Raises KeyError when business_id is not a key of business_id_dict.
    """
    record = restaurant_data[business_id_dict[business_id]]
    for attr, value in record.items():
        print('{0}: {1}'.format(str(attr), str(value)))


print_business_info('zMpWhHh6Cp1BkFRM8cbPhA')

business_id: zMpWhHh6Cp1BkFRM8cbPhA
name: Tasting Counter
address: 14 Tyler St
city: Somerville
state: MA
postal_code: 02143
latitude: 42.3817829
longitude: -71.1056689
stars: 4.5
review_count: 190
is_open: 1
attributes: {'RestaurantsPriceRange2': '4', 'RestaurantsTakeOut': 'False', 'RestaurantsDelivery': 'False', 'BusinessAcceptsCreditCards': 'True', 'OutdoorSeating': 'False', 'RestaurantsReservations': 'True', 'Alcohol': "u'beer_and_wine'", 'WiFi': "u'no'", 'BusinessAcceptsBitcoin': 'False', 'NoiseLevel': "u'average'", 'HasTV': 'False', 'RestaurantsAttire': "u'dressy'", 'Caters': 'False', 'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': True, 'valet': False}", 'DogsAllowed': 'False', 'RestaurantsTableService': 'True', 'WheelchairAccessible': 'True', 'ByAppointmentOnly': 'True', 'RestaurantsGoodForGroups': 'False', 'GoodForKids': 'False', 'BikeParking': 'True', 'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': True, 'brunc

Create a 2D array indexed by user ID (rows) and business ID (columns), with the star rating of each review as the cell value

In [57]:
# Dense user x business matrix: row = position in user_data, column = position in
# restaurant_data, value = star rating (0 where no review exists).
# float32 is requested explicitly: the default float64 needs 8 bytes per cell, which
# for the 125521 x 10550 shape reported below is roughly 10.6 GB; float32 halves that.
rating_mat = np.zeros((len(user_data), len(restaurant_data)), dtype=np.float32)

# Column-oriented copy of the same ratings, later turned into a DataFrame.
rating_dict = {'user_id': [],
               'business_id': [],
               'rating': []}

for review in review_data:
    user_idx = user_id_dict[review['user_id']]
    business_idx = business_id_dict[review['business_id']]
    # A single tuple index writes the cell directly instead of first materialising
    # a row view. A repeated (user, business) pair overwrites the earlier value,
    # so the matrix keeps only the last rating seen for that pair.
    rating_mat[user_idx, business_idx] = review['stars']

    # rating_dict keeps every review, including repeated (user, business) pairs.
    rating_dict['user_id'].append(review['user_id'])
    rating_dict['business_id'].append(review['business_id'])
    rating_dict['rating'].append(review['stars'])

print(rating_mat.shape)
# Non-zero cells are the (user, business) pairs that have a rating at all.
print('number of non-zero elements(good ratings): ' + str(np.count_nonzero(rating_mat)))

(125521, 10550)
number of non-zero elements(good ratings): 879189


In [58]:
df = pd.DataFrame(rating_dict)

# keep only the users/businesses that have over 50 reviews
# necessary in order to build the anti testset
user_counts = df['user_id'].value_counts()
filter_user = user_counts[user_counts > 50].index.tolist()

business_counts = df['business_id'].value_counts()
filter_business = business_counts[business_counts > 50].index.tolist()

# a row survives only when both its user and its business passed the threshold
keep_rows = df['user_id'].isin(filter_user) & df['business_id'].isin(filter_business)
df_new = df[keep_rows]

print('The original data frame shape:\t{}'.format(df.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_new[['user_id', 'business_id', 'rating']], reader)

The original data frame shape:	(914710, 3)
The new data frame shape:	(215691, 3)


In [None]:
# clean up some memory
del rating_mat
del df

# Item-based (user_based=False) cosine similarity.
sim_options = {'name': 'cosine',
               'user_based': False}
trainset = data.build_full_trainset()
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

# note: this takes quite a while...
# The anti testset is built from the trainset and scored with the fitted model.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Save the predictions together with the model that produced them. Passing a fresh
# KNNWithMeans() here would store an unfitted model that cannot predict after
# loading, so the fitted `algo` is saved instead.
dump.dump('./predictions', predictions, algo=algo)
# dump.load returns a (predictions, algo) tuple:
# predictions, algo = dump.load('./predictions')

In [None]:
user_id = 'nl8gWLDo6U6MjqzbBmE_9A'
# NOTE(review): this id is shorter than the other ids shown in this notebook;
# it may be truncated - confirm it is a business present in the trainset.
business_id = 'CoZmZKv2lCYd'
print(algo.predict(uid=user_id, iid=business_id))
print(predictions[1])

n = 5

# Group the estimated ratings by user: uid -> [(business id, estimate), ...]
topn = defaultdict(list)
for uid, bid, _, est, _ in predictions:
    topn[uid].append((bid, est))

# Keep only the n highest estimates per user, best first. Replacing the value of an
# existing key while iterating is safe because the set of keys does not change.
for uid, user_estimates in topn.items():
    user_estimates.sort(key=lambda pair: pair[1], reverse=True)
    topn[uid] = user_estimates[:n]

# Look the user up explicitly: indexing the defaultdict directly would silently
# create an empty entry for an unknown user.
if user_id not in topn:
    raise KeyError(user_id)

# Only one user's row is needed, so build that Series directly rather than a
# DataFrame covering every user.
user_results = pd.Series(topn[user_id], name=user_id)
print('Top {0} recommendations for user {1}\n{2}'.format(n, user_id, user_results))

## Algorithm and Parameter Tuning

### Validation methods
The parameter grid search is evaluated with the error measures below:
#### Root Mean Square Error (RMSE)
* Measures standard deviation of errors in set of predictions
* Goal: minimize RMSE

#### Mean Absolute Error (MAE)
* Measures average magnitude of errors in set of predictions
* Goal: minimize MAE

In [37]:
import random
from surprise.model_selection import GridSearchCV
from surprise import accuracy


Load the full dataset

In [28]:
# Rebuild the ratings frame from rating_dict and wrap it in a Surprise dataset
# that uses a 1-5 rating scale.
rating_columns = ['user_id', 'business_id', 'rating']
df = pd.DataFrame(rating_dict)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[rating_columns], reader)

Get and shuffle ratings

In [29]:
# Seed the RNG before shuffling so the train/test split below is the same on every
# Restart & Run All; an unseeded shuffle yields a different split (and different
# scores) on each run.
SPLIT_SEED = 0
random.seed(SPLIT_SEED)

# The ratings list is shuffled in place.
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

Split Data (Train 80%-Test 20%)

In [40]:
# The first 80% of the shuffled ratings train the model, the rest is held out.
split_boundary = int(.8 * len(raw_ratings))
trainset_ratings, testset_ratings = raw_ratings[:split_boundary], raw_ratings[split_boundary:]

# from here on `data` only holds the training portion
data.raw_ratings = trainset_ratings

Select best params/algo with Grid search

In [21]:
# Search space: two similarity measures x three min_support values, item-based only.
sim_options = dict(
    name=["msd", "cosine"],
    min_support=[3, 4, 5],
    user_based=[False],
)
param_grid = dict(sim_options=sim_options)

In [31]:
trainset = data.build_full_trainset()

# 3-fold cross-validated grid search over param_grid, scored with RMSE and MAE.
grid_search = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=3,
)
grid_search.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [32]:
# For each measure, show the best score followed by the parameters that achieved it.
for measure in ("rmse", "mae"):
    print(grid_search.best_score[measure])
    print(grid_search.best_params[measure])

1.2882728325118868
{'sim_options': {'name': 'msd', 'min_support': 5, 'user_based': False}}
1.003291739981118
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}


In [33]:
# Take the estimator that the grid search associates with the best RMSE.
algo = grid_search.best_estimator['rmse']

In [34]:
# Retrain on the whole training set: `data` holds only the training ratings at
# this point, so the full trainset built from it excludes the held-out test ratings.
trainset = data.build_full_trainset()
# Kept as the last expression so the cell displays the fitted algorithm object.
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x25b8fa0e2e0>

Compute biased accuracy of training set

In [38]:
# Score the model on the same ratings it was fitted on (hence "biased").
train_as_testset = trainset.build_testset()
predictions = algo.test(train_as_testset)
print("Biased accuracy on training set: ")
accuracy.rmse(predictions)

Biased accuracy on training set: 
RMSE: 0.6127


0.612657228372552

Compute unbiased accuracy of the test set:

In [41]:
# Score the model on the held-out ratings (testset_ratings), which were not part
# of the trainset the model was fitted on.
testset = data.construct_testset(testset_ratings)
predictions = algo.test(testset)
# The label previously said "training set" although this evaluates the test set.
print("Unbiased accuracy on test set: ")
accuracy.rmse(predictions)

Unbiased accuracy on training set: 
RMSE: 1.2677


1.2677097435104183

### Repeat process with KNNWithMeans Algorithm

In [47]:
# Rebuild the dataset from rating_dict, then narrow it to the training ratings.
rating_columns = ['user_id', 'business_id', 'rating']
df = pd.DataFrame(rating_dict)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[rating_columns], reader)

# restrict the dataset to the training portion of the split
data.raw_ratings = trainset_ratings

In [48]:
from surprise import KNNWithMeans

# Same 3-fold grid search as before, this time with KNNWithMeans.
grid_search = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=3,
)
grid_search.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix.

In [51]:
# For each measure, show the best score followed by the parameters that achieved it.
for measure in ("rmse", "mae"):
    print(grid_search.best_score[measure])
    print(grid_search.best_params[measure])

1.2109716403698287
{'sim_options': {'name': 'cosine', 'min_support': 5, 'user_based': False}}
0.9425676047709648
{'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}


In [50]:
# Take the estimator that the grid search associates with the best RMSE.
algo = grid_search.best_estimator['rmse']

In [52]:
# Retrain on the whole training set: `data` holds only the training ratings at
# this point, so the full trainset built from it excludes the held-out test ratings.
trainset = data.build_full_trainset()
# Kept as the last expression so the cell displays the fitted algorithm object.
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x25b8fa01670>

Compute biased accuracy of training set

In [53]:
# Score the model on the same ratings it was fitted on (hence "biased").
train_as_testset = trainset.build_testset()
predictions = algo.test(train_as_testset)
print("Biased accuracy on training set: ")
accuracy.rmse(predictions)

Biased accuracy on training set: 
RMSE: 0.7779


0.7778719684014103

Compute unbiased accuracy of test set

In [54]:
# Score the model on the held-out ratings (testset_ratings), which were not part
# of the trainset the model was fitted on.
testset = data.construct_testset(testset_ratings)
predictions = algo.test(testset)
# The label previously said "training set" although this evaluates the test set.
print("Unbiased accuracy on test set: ")
accuracy.rmse(predictions)

Unbiased accuracy on training set: 
RMSE: 1.1891


1.189111605607265

### Parameter and Algorithm Tuning Results

The best (smallest) MAE and RMSE scores for collaborative filtering on the Yelp dataset were obtained with the following parameters:

* KNNWithMeans
* cosine similarity
* minimum support of 5 (best RMSE; the best MAE was obtained with a minimum support of 3)
* not user_based

(Note to team members: I noticed that someone was able to run the grid search with `user_based` set to True. If so, perhaps we should integrate it into the grid search above; please tell me whether it is more efficient than False. The only issue is that I won't be able to run it myself, so if False has a better score then perhaps we should hold off, but definitely use it in our final implementation.)

In [14]:
# Same search space as the item-based grid, but with user-based similarities.
sim_options = dict(
    name=["msd", "cosine"],
    min_support=[3, 4, 5],
    user_based=[True],
)
param_grid = dict(sim_options=sim_options)

# NOTE: the recorded run of this cell stopped with a MemoryError while computing the
# first similarity matrix (86.4 GiB for a 107666 x 107666 float64 array).
grid_search = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=3,
)
grid_search.fit(data)

Computing the msd similarity matrix...


MemoryError: Unable to allocate 86.4 GiB for an array with shape (107666, 107666) and data type float64