# Collaborative Filtering + Recommender

## Import Libraries and Preprocessed User, Restaurant, and Review Data 

In [1]:
!pip install surprise



In [296]:
import json
import time
from collections import defaultdict

import pandas as pd
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import Reader
from surprise import SVDpp
from surprise import accuracy

In [3]:
# Load the preprocessed user records and report how long the read took.
# perf_counter() is a monotonic, high-resolution clock, and the duration is
# printed with two decimals so a sub-second load is not reported as "0 seconds".
start = time.perf_counter()
with open('MA_users.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    user_data = json.load(fin)
end = time.perf_counter()
duration_without_dr = end-start
print("Time taken to load user data: %.2f seconds" %duration_without_dr)

Reading MA_users.json
Time taken to load user data: 0 seconds


In [4]:
# Load the preprocessed restaurant records and report how long the read took.
# perf_counter() is a monotonic, high-resolution clock, and the duration is
# printed with two decimals so a sub-second load is not reported as "0 seconds".
start = time.perf_counter()
with open('MA_restaurants.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    restaurant_data = json.load(fin)
end = time.perf_counter()
duration_without_dr = end-start
print("Time taken to load restaurant data: %.2f seconds" %duration_without_dr)

Reading MA_restaurants.json
Time taken to load restaurant data: 0 seconds


In [5]:
# Load the preprocessed review records and report how long the read took.
# perf_counter() is a monotonic, high-resolution clock, and the duration is
# printed with two decimals so short loads are not rounded down to whole seconds.
start = time.perf_counter()
with open('MA_reviews.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    review_data = json.load(fin)
end = time.perf_counter()
duration_without_dr = end-start
print("Time taken to load review data: %.2f seconds" %duration_without_dr)

Reading MA_reviews.json
Time taken to load review data: 2 seconds


In [6]:
# Report how many records each of the three JSON files contributed.
print('{} user records loaded'.format(len(user_data)))
print('{} review records loaded'.format(len(review_data)))
print('{} business records loaded'.format(len(restaurant_data)))

13046 user records loaded
465276 review records loaded
6249 business records loaded


## Algorithm setup
> Create business and user lookup dictionaries, plus helper functions, that make it easier to view the results of the recommender

In [237]:
# Map each restaurant's business_id to its position in restaurant_data so a
# record can be fetched by id without scanning the list.
business_id_dict = {
    restaurant['business_id']: position
    for position, restaurant in enumerate(restaurant_data)
}

def print_business_info(business_id):
    """Print every attribute of the restaurant with the given business_id as 'key: value' lines."""
    record = restaurant_data[business_id_dict[business_id]]
    for attr, value in record.items():
        print('{0}: {1}'.format(str(attr), str(value)))
        
def get_business_attr(business_id, attribute):
    """Return one attribute of the restaurant with the given business_id."""
    position = business_id_dict[business_id]
    restaurant = restaurant_data[position]
    return restaurant[attribute]

def get_business_reviews(business_id):
    """Print a table of every review of business_id.

    Each row shows the reviewer's name, the reviewer's user id, the reviewer's
    total review count, and the star rating given in that review.
    """
    row_format = '{:<15s}{:<15s}{:>20.0f}{:>25.1f}'
    print('Name:                 User ID:               User Total Review Count         Rating')
    matching_reviews = (entry for entry in review_data if entry['business_id'] == business_id)
    for entry in matching_reviews:
        reviewer_id = entry['user_id']
        print(row_format.format(
            get_user_attr(reviewer_id, 'name'),
            reviewer_id,
            get_user_attr(reviewer_id, 'review_count'),
            entry['stars'],
        ))

In [None]:
# Map each user's user_id to its position in user_data so a record can be
# fetched by id without scanning the list.
user_id_dict = {
    user['user_id']: position
    for position, user in enumerate(user_data)
}

def print_user_info(user_id):
    """Print every attribute of the user with the given user_id as 'key: value' lines.

    The 'friends' attribute is replaced by a fixed placeholder instead of its value.
    """
    profile = user_data[user_id_dict[user_id]]
    for attr, value in profile.items():
        if attr == 'friends':
            shown = '[list of user ids]'
        else:
            shown = str(value)
        print('{0}: {1}'.format(str(attr), shown))

def get_user_attr(user_id, attribute):
    """Return one attribute of the user with the given user_id."""
    position = user_id_dict[user_id]
    profile = user_data[position]
    return profile[attribute]

## Create the Review Dataframe
> Create the rating dictionary by extracting the user_id, business_id, and rating from each review <br>
Import the rating dictionary into a pandas dataframe for fitting the algorithm

In [7]:
# Pull the (user, business, stars) triple out of every review, one column at a time.
rating_dict = {
    'user_id': [entry['user_id'] for entry in review_data],
    'business_id': [entry['business_id'] for entry in review_data],
    'rating': [entry['stars'] for entry in review_data],
}

df = pd.DataFrame(rating_dict)

# Ratings are declared to lie on a 1-5 scale; the columns are handed to
# surprise in user, item, rating order.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)
print('The data frame shape:\t{}'.format(df.shape))

The data frame shape:	(465276, 3)


## Running SVD++ on the Review Dataframe
> Set up the SVD++ hyperparameters found in the downselect process <br>
Run SVD++ on the entire dataset (this takes around 15 minutes)

In [8]:
# Fit a KNNWithMeans model with cosine similarity and user_based=False on the
# full rating set. KNNWithMeans must be imported from surprise in the
# notebook's import cell; without that import this cell raises NameError on a
# fresh kernel.
# NOTE(review): the section heading describes SVD++ (and SVDpp is imported),
# but this cell fits KNNWithMeans - confirm which algorithm is intended.
sim_options = {'name': 'cosine',
               'user_based': False}
trainset = data.build_full_trainset()
print(trainset.n_users)
algo = KNNWithMeans(k=40, min_k=5, sim_options=sim_options)

# note: this takes quite a while...
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during the long fit.
start = time.perf_counter()
algo.fit(trainset)
end = time.perf_counter()
duration_without_dr = end-start
print("Time taken to fit data: %d seconds" %duration_without_dr)

Time taken to fit data: 565 seconds


## Selecting a Sample User to View Rating Predictions and the Top Recommended Restaurants
> Get a user_id from the review dataframe

In [293]:
# List the 900 users with the most reviews in the rating frame, most active first.
print('User_ID               # of Reviews\n')
print(str(df['user_id'].value_counts().head(900)))

User_ID               # of Reviews

nl8gWLDo6U6MjqzbBmE_9A    1081
rcU7ysY41qGppbw4pQgjqg     618
ggl6fl-PM5O1WrdReL0l4A     618
1Y0zsJSfWLkfDylH0X1yNQ     588
t903_es-gp3abvdrIQutQA     485
                          ... 
c83EoH7q84EumkKAqzKkNQ      84
J3pXjuQLVOMBFjqsviy2pw      84
bG_DOb7mltWDfbJS1xmsPQ      84
0EFQQB40D24-WuIFH6-rpQ      84
kOmv6GPqLm-SkUC5nZj07g      84
Name: user_id, Length: 900, dtype: int64


## Get All Reviews Made by the Sample User

In [384]:
# Pick the sample user and keep only the rows of the rating frame written by them.
user_id = 'ggl6fl-PM5O1WrdReL0l4A' # sample user
df_new_user = df[df['user_id'].eq(user_id)]
print(df_new_user)

                       user_id             business_id  rating
452     ggl6fl-PM5O1WrdReL0l4A  Z6zAW5KYUbOqlv1pa63STw     4.0
1087    ggl6fl-PM5O1WrdReL0l4A  M0fRUDEo9mbnZFXhvshlFw     4.0
2263    ggl6fl-PM5O1WrdReL0l4A  wLpF_Qxw4d48-KR2ltTD_A     4.0
2268    ggl6fl-PM5O1WrdReL0l4A  oz882XuZCxajKo64Opgq_Q     3.0
2796    ggl6fl-PM5O1WrdReL0l4A  yKvlRknp1Feues1rG-aJlg     4.0
...                        ...                     ...     ...
460923  ggl6fl-PM5O1WrdReL0l4A  RNqUyblZzXvg0a2fBjJQQg     4.0
461602  ggl6fl-PM5O1WrdReL0l4A  jIxhHWqxKd5ZWBIEmsFxWQ     5.0
461960  ggl6fl-PM5O1WrdReL0l4A  mlYVUORelx-_gFihRA1s6g     1.0
462326  ggl6fl-PM5O1WrdReL0l4A  y2w6rFaO0XEiG5mFfOsiFA     4.0
464471  ggl6fl-PM5O1WrdReL0l4A  rnwrQjhb7KcV6uoElNTH5w     5.0

[618 rows x 3 columns]


## Call the Predict Method for Each of the Restaurants the Sample User has Reviewed
> r_ui is the actual review score the sample user gave to the restaurant <br>
est is the estimated review score given by the collaborative filtering algorithm

In [385]:
# Predict a rating for every restaurant the sample user has reviewed, passing
# the rating from the user's first matching review as the true value (r_ui).
pred_list = []
for business_id in business_id_dict:
    user_ratings = df_new_user.loc[df_new_user['business_id'] == business_id, 'rating']
    if user_ratings.empty:
        # The sample user never reviewed this restaurant.
        continue
    pred_list.append(
        algo.predict(user_id, business_id, r_ui=user_ratings.iloc[0], verbose=True)
    )

impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: seC9cfU7It8mbczK442HAw r_ui = 3.00   est = 3.73   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: 8zLHresim_7dPvkYSuBVzA r_ui = 5.00   est = 4.31   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: XZfs0Ct5Rjl-wunzb6M6ag r_ui = 5.00   est = 3.37   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: kX2fm-REksJrXAviP0vK4g r_ui = 5.00   est = 4.32   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: Y6YtAXCglKWQB9ghwmzWkw r_ui = 3.00   est = 3.98   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: 4MClvr12OXBNvGu8h1yGpA r_ui = 3.00   est = 4.24   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: p8ohzzGvGRCHnJKnyO7exA r_ui = 3.00   est = 3.80   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: 54Flq7X4Ugfj02_IzIoIKw r_ui = 3.00   est = 3.73   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: WsYQF28GdiStUJuLSecIaA r_ui = 5.00   est = 4.17   {'was_i

## Accuracy metrics
> Calculate the RMSE between the predicted rating and the actual rating given by the sample user

In [388]:
# Compute the RMSE over the predictions currently held in pred_list.
# The trailing semicolon keeps the notebook from echoing the returned value.
accuracy.rmse(pred_list);

RMSE: 0.8001


> Classify reviews with 3 or more stars as Recommended/Positive and reviews with fewer than 3 stars as Not Recommended/Negative <br>
Calculate the Precision, Recall, and f1 scores

In [386]:
def precision_recall_f1_at_k(predictions, threshold=3):
    """Compute precision, recall and F1 for a list of rating predictions.

    A prediction counts as relevant when its true rating is >= threshold and
    as recommended when its estimated rating is >= threshold.

    Args:
        predictions: iterable of prediction tuples whose element [2] is the
            true rating (r_ui) and element [3] is the estimated rating (est).
            Predictions whose true rating is None are skipped because they
            cannot be scored.
        threshold: rating at or above which a review counts as positive.

    Returns:
        (precision, recall, f1). A score is 0 when its denominator is 0.
    """
    # Score the predictions passed in, not a notebook-level global, and drop
    # entries that carry no true rating.
    rated = [(pred[2], pred[3]) for pred in predictions if pred[2] is not None]

    n_rel = sum((true_r >= threshold) for true_r, _ in rated)
    n_rec_k = sum((est >= threshold) for _, est in rated)
    n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for true_r, est in rated)

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    # Guard the harmonic mean: precision + recall is 0 when nothing is both
    # relevant and recommended.
    f1 = 2*(precision*recall)/(precision+recall) if (precision+recall) != 0 else 0

    return precision, recall, f1

> Precision: Ability of the algorithm to not recommend a restaurant that the user has given a negative review for <br>
Recall: Ability of the algorithm to recommend a restaurant that the user has given a positive review for <br>
f1 score: Harmonic mean of the precision and recall metrics

In [387]:
# Score the sample user's predictions; the returned ratios are shown as
# percentages with two decimals.
user_precision, user_recall, user_f1 = precision_recall_f1_at_k(pred_list)
print('Precision Score: {0:0.2f}'.format(100 * user_precision))
print('Recall Score: {0:0.2f}'.format(100 * user_recall))
print('f1 Score: {0:0.2f}'.format(100 * user_f1))

Precision Score: 93.48
Recall Score: 94.68
f1 Score: 94.07


## Predict Review Scores for all the Restaurants in the Dataset

In [389]:
# Predict a rating for every restaurant in the dataset. Where the sample user
# has reviewed the restaurant, the rating from their first matching review is
# passed along as the true value (r_ui); otherwise no true value is given.
pred_list = []
for business_id in business_id_dict:
    user_ratings = df_new_user.loc[df_new_user['business_id'] == business_id, 'rating']
    if user_ratings.empty:
        pred = algo.predict(user_id, business_id, verbose=True)
    else:
        pred = algo.predict(user_id, business_id, r_ui=user_ratings.iloc[0], verbose=True)
    pred_list.append(pred)

impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: P5lR-AdqxUC944GJ0Sfd_Q r_ui = None   est = 3.76   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: zWPEAPYJjCHE87snhpTEiA r_ui = None   est = 3.67   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: L1yCGs1G1KM-LLIrLIWAPw r_ui = 3.00   est = 2.90   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: 1qO0DUMv1ZtRtPH2VD8x_Q r_ui = 5.00   est = 4.48   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: b0kpd0u7o8HBlrciMbGMqQ r_ui = None   est = 2.99   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: lYfE6rYm-GiZY0zLZ8S1Gw r_ui = None   est = 4.18   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: h0JnvqMcK_G7CeBijxaVAA r_ui = None   est = 3.67   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: YLBFKc5nL1KBPQ3c334ang r_ui = None   est = 3.67   {'was_impossible': False}
user: ggl6fl-PM5O1WrdReL0l4A item: xtum8zS3BO9E1qcUOpi96w r_ui = None   est = 3.67   {'was_i

## Set Up the Recommender Algorithm
> Sort the predicted review scores from high to low <br>
Grab the top 25 review scores and print out the restaurants they belong to

In [390]:
def find_topn(predictions, n, user_id):
    """Print and return the n highest-rated predictions for one user.

    Args:
        predictions: iterable of 5-tuples (uid, business_id, r_ui, est, details).
        n: maximum number of recommendations to keep.
        user_id: the user whose recommendations are wanted.

    Returns:
        (user_results, top_restaurant_id), where user_results is a pandas
        Series named user_id, indexed 0..k-1, holding (business_id, est)
        tuples sorted by est from high to low, and top_restaurant_id is the
        business_id of the first entry. k may be smaller than n when fewer
        predictions are available.

    Raises:
        KeyError: if there is no recommendation to return for user_id.
    """
    # Only the requested user's predictions matter; ignoring other users also
    # avoids building a frame from per-user lists of different lengths.
    ranked = [(bid, est) for uid, bid, _, est, _ in predictions if uid == user_id]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    ranked = ranked[:n]

    if not ranked:
        raise KeyError('no predictions available for user {0}'.format(user_id))

    user_results = pd.Series(ranked, name=user_id)
    top_restaurant_id = ranked[0][0]

    # Report the number of rows actually shown, which can be smaller than n.
    print('Top {0} recommendations for user {1}:\n'.format(len(ranked), get_user_attr(user_id, 'name')))
    print('   Restaurant:                                  Predicted Rating:              Restaurant ID:\n')
    # Iterate over the rows that exist instead of range(n), which fails when
    # fewer than n predictions are available.
    for rank, (bid, est) in enumerate(ranked, start=1):
        print('{:<3d}{:<50s}{:1.2f}{:>40s}'.format(rank, get_business_attr(bid, 'name'), est, bid))

    return user_results, top_restaurant_id

In [391]:
# Rank the predictions in pred_list for the sample user, keep the 25 with the
# highest estimated rating, and remember the id of the top-ranked restaurant.
user_topn, top_restaurant_id = find_topn(pred_list, 25, user_id)

Top 25 recommendations for user Michael:

   Restaurant:                                  Predicted Rating:              Restaurant ID:

1  Harry's Cheese and Cold Cuts                      4.86                  w7JR1y-TCXmWdFKDLW83eQ
2  Polcari's Coffee                                  4.79                  y5lCCZ4YZk4jLSmBaATOmw
3  Recreo Coffee & Roasterie                         4.75                  ZjNw6MQ9EyS16COI465xbQ
4  Tony's Market                                     4.72                  9Ye0-c0YduHxUk707mftZw
5  Tutto Italiano                                    4.70                  che285DuI-eo98PriDIerg
6  Iggy's Bread                                      4.68                  R7BLdmInOLwKh6bpfuB-Pg
7  Tasting Counter                                   4.68                  zMpWhHh6Cp1BkFRM8cbPhA
8  The Table at Season to Taste                      4.67                  WkjIWu8rIpAAUhnqsuik9w
9  Ciao! Pizza & Pasta                               4.66                  yQL8

## Business information on the top recommended restaurant

In [392]:
# Print every stored attribute of the top recommended restaurant.
print_business_info(top_restaurant_id)

business_id: w7JR1y-TCXmWdFKDLW83eQ
name: Harry's Cheese and Cold Cuts
address: 98 Blackstone St
city: Boston
state: MA
postal_code: 02109
latitude: 42.3615179
longitude: -71.0564517
stars: 5.0
review_count: 57
is_open: 1
attributes: {'RestaurantsTakeOut': 'True', 'BusinessAcceptsCreditCards': 'True', 'GoodForKids': 'True', 'OutdoorSeating': 'False', 'Alcohol': "u'none'", 'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': True, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}", 'RestaurantsGoodForGroups': 'False', 'RestaurantsAttire': "'casual'", 'RestaurantsReservations': 'False', 'BusinessParking': "{'garage': False, 'street': True, 'validated': True, 'lot': False, 'valet': False}", 'NoiseLevel': "u'average'", 'Caters': 'True', 'WiFi': "u'no'", 'RestaurantsDelivery': 'False', 'RestaurantsPriceRange2': '1', 'BikeParking': 'False', 'HasTV': 'True'}
categories: Cheese Shops, Food, Meat Shops, Delis, Restaurants, Specialt

## Review breakdown of the top recommended restaurant

In [393]:
# Print one row per review of the top recommended restaurant (reviewer name, user id, reviewer's review count, rating).
get_business_reviews(top_restaurant_id)

Name:                 User ID:               User Total Review Count         Rating
Tanya          ILZtzt-3oEmLu8c1rfeJuw                 201                      5.0
Steven         0Zq95YruU7IvCsvzFOSSzA                 170                      4.0
Jacquelyn      F0_FBGWlJS7z_qTDm9dJSA                 281                      5.0
boom           bje1P4XGW9lv79iuSKBgiw                  98                      5.0
Lauretta       W_EQFi2fXpBtuRm510cCpA                 474                      5.0
Vivian         nl8gWLDo6U6MjqzbBmE_9A                1871                      5.0
Sarah          5LT0UfheP3c-_xKqmZDgxQ                  51                      4.0
Dorothy        W-SyRg4KEOwcRRA_hj3Txw                 467                      5.0
Max            p2NLkCmcPp1-Mr8x9TXXcQ                  73                      5.0
Chewie         -GnSkFtT3axpOdkSxmx7Zw                  49                      5.0
Benjamin       4v30Vm27ViLTv4pRQNCIPA                  29                      5.0
Ste

## More Information on the Sample User

In [394]:
# Print the sample user's stored attributes; the friends list is shown as a placeholder.
print_user_info(user_id)

user_id: ggl6fl-PM5O1WrdReL0l4A
name: Michael
review_count: 1434
yelping_since: 2008-01-28 07:34:15
useful: 3376
funny: 1329
cool: 1658
elite: 2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,20,20
friends: [list of user ids]
fans: 120
average_stars: 3.85
compliment_hot: 51
compliment_more: 12
compliment_profile: 9
compliment_cute: 1
compliment_list: 1
compliment_note: 49
compliment_plain: 122
compliment_cool: 145
compliment_funny: 145
compliment_writer: 55
compliment_photos: 32
