# Yelp On the Go! 
### Collaborative Filtering Recommender (SVD++)


## Install and Import Library Dependencies  

In [None]:
!pip install surprise

In [None]:
import json
import time
import pandas as pd
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from collections import defaultdict
from surprise import accuracy

## Load Preprocessed User, Restaurant, and Review Data 

In [None]:
# Parse the preprocessed user records into memory and report the wall-clock
# time the load took (whole seconds, truncated by the %d conversion).
start = time.time()
with open('MA_users.json', mode='r', encoding="utf8") as fin:
    print('Reading', fin.name)
    # json.load(fp) is defined as json.loads(fp.read()); spelled out here.
    user_data = json.loads(fin.read())
end = time.time()
duration_without_dr = end - start
print("Time taken to load user data: %d seconds" % (duration_without_dr,))

In [None]:
# Parse the preprocessed restaurant records into memory and report the
# wall-clock time the load took (whole seconds, truncated by %d).
start = time.time()
with open('MA_restaurants.json', mode='r', encoding="utf8") as fin:
    print('Reading', fin.name)
    # json.load(fp) is defined as json.loads(fp.read()); spelled out here.
    restaurant_data = json.loads(fin.read())
end = time.time()
duration_without_dr = end - start
print("Time taken to load restaurant data: %d seconds" % (duration_without_dr,))

In [None]:
# Parse the preprocessed review records into memory and report the
# wall-clock time the load took (whole seconds, truncated by %d).
start = time.time()
with open('MA_reviews.json', mode='r', encoding="utf8") as fin:
    print('Reading', fin.name)
    # json.load(fp) is defined as json.loads(fp.read()); spelled out here.
    review_data = json.loads(fin.read())
end = time.time()
duration_without_dr = end - start
print("Time taken to load review data: %d seconds" % (duration_without_dr,))

In [None]:
# Sanity check: show how many records each of the three loaded collections holds.
print('{0}{1}'.format(len(user_data), ' user records loaded'))
print('{0}{1}'.format(len(review_data), ' review records loaded'))
print('{0}{1}'.format(len(restaurant_data), ' business records loaded'))

## Algorithm Setup
> Create business and user lookup dictionaries that make it easier to view the results of the recommender

In [None]:
# Map each business_id to its position in restaurant_data so a record can be
# fetched by ID without scanning the list. If an ID repeats, the last
# position wins.
business_id_dict = {
    restaurant['business_id']: position
    for position, restaurant in enumerate(restaurant_data)
}

def print_business_info(business_id):
    """Print every attribute of one restaurant as ``name: value`` lines.

    The restaurant is located through ``business_id_dict``; an unknown
    ``business_id`` raises ``KeyError``.
    """
    record = restaurant_data[business_id_dict[business_id]]
    for field, value in record.items():
        print('{0}: {1}'.format(str(field), str(value)))
        
def get_business_attr(business_id, attribute):
    """Return a single attribute of the restaurant identified by ``business_id``.

    Raises ``KeyError`` if the ID is not in ``business_id_dict`` or the
    record has no such attribute.
    """
    position = business_id_dict[business_id]
    record = restaurant_data[position]
    return record[attribute]

def get_business_reviews(business_id):
    """Print one table row per review written for the given restaurant.

    Each row shows the reviewer's name, their user ID, their total review
    count, and the star rating they gave. Every call scans the whole of
    ``review_data``.
    """
    print('Name:                 User ID:               User Total Review Count         Rating')
    row_template = '{:<15s}{:<15s}{:>20.0f}{:>25.1f}'
    matching_reviews = (entry for entry in review_data if entry['business_id'] == business_id)
    for entry in matching_reviews:
        reviewer_id = entry['user_id']
        print(row_template.format(
            get_user_attr(reviewer_id, 'name'),
            reviewer_id,
            get_user_attr(reviewer_id, 'review_count'),
            entry['stars'],
        ))

In [None]:
# Map each user_id to its position in user_data so a profile can be fetched
# by ID without scanning the list. If an ID repeats, the last position wins.
user_id_dict = {
    profile['user_id']: position
    for position, profile in enumerate(user_data)
}

def print_user_info(user_id):
    """Print every attribute of one user as ``name: value`` lines.

    The ``friends`` attribute is replaced by a fixed placeholder instead of
    being printed in full. An unknown ``user_id`` raises ``KeyError``.
    """
    profile = user_data[user_id_dict[user_id]]
    for field, value in profile.items():
        if field == 'friends':
            shown = '[list of user ids]'
        else:
            shown = str(value)
        print('{0}: {1}'.format(str(field), shown))

def get_user_attr(user_id, attribute):
    """Return a single attribute of the user identified by ``user_id``.

    Raises ``KeyError`` if the ID is not in ``user_id_dict`` or the profile
    has no such attribute.
    """
    position = user_id_dict[user_id]
    profile = user_data[position]
    return profile[attribute]

## Create the Review Dataframe
> Create the rating dictionary by extracting the user_id, business_id, and rating from each review <br>
Import the rating dictionary into a pandas dataframe for fitting the algorithm

In [None]:
# Pull the (user, business, stars) triple out of every review, one column
# at a time; the 'stars' field becomes the 'rating' column.
rating_dict = {
    'user_id': [entry['user_id'] for entry in review_data],
    'business_id': [entry['business_id'] for entry in review_data],
    'rating': [entry['stars'] for entry in review_data],
}

df = pd.DataFrame(rating_dict)

# Declare the 1-5 rating scale and hand the frame to Surprise with its
# columns in user, item, rating order.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['user_id', 'business_id', 'rating']
data = Dataset.load_from_df(df[rating_columns], reader)
print('The data frame shape:\t{}'.format(df.shape))

## Running SVD++ on the Review Dataframe
> Set up the SVD++ hyperparameters found in the downselect process <br>
Run SVD++ on the entire dataset (this takes around 15 minutes)

In [None]:
# Configure SVD++ with the chosen hyperparameters and train on every
# available rating (no hold-out split).
algo = SVDpp(n_epochs=30, lr_all=0.0025)
trainset = data.build_full_trainset()

# note: this takes quite a while...
start = time.time()
algo.fit(trainset)
end = time.time()
duration_without_dr = end - start
print("Time taken to fit data: %d seconds" % (duration_without_dr,))

## Sample User Test and Validation
Select a sample user to view rating predictions and the top recommended restaurants

> Get a user_id from the review dataframe <br>
The following code block lists the 900 most active user_ids with their review counts, most reviews first <br>

In [None]:
# List reviewers by how many reviews they wrote (value_counts sorts in
# descending order) and keep the first 900 entries by position.
print('User_ID               # of Reviews\n')
print('{}'.format(df['user_id'].value_counts().iloc[:900]))

In [None]:
########     User Input     ########
########     User Input     ########
########     User Input     ########
# ID of the user to inspect. The cells below read this global to filter the
# ratings frame, request predictions, and print the user's profile.
user_id = 'ggl6fl-PM5O1WrdReL0l4A' # sample user
########     User Input     ########
########     User Input     ########
########     User Input     ########

### Sample User Reviews
Get All Reviews Made by the Sample User

In [None]:
# Keep only the rows of the ratings frame written by the selected user.
df_new_user = df[df['user_id'].eq(user_id)]
print(df_new_user)

### Predict Sample User's Restaurant Review Scores for Restaurants Already Reviewed

Call the Predict Method for Each of the Restaurants the Sample User has Reviewed
> r_ui is the actual review score the sample user gave to the restaurant <br>
est is the estimated review score given by the collaborative filtering algorithm

In [None]:
# Predict a score for every restaurant the sample user has already reviewed,
# walking the restaurants in business_id_dict order.
pred_list = []
for business_id in business_id_dict:
    reviewed_here = df_new_user['business_id'] == business_id
    if not reviewed_here.any():
        continue  # the user never rated this restaurant
    # If the user rated the restaurant more than once, use the first rating.
    r_ui = df_new_user.loc[reviewed_here, 'rating'].iloc[0]
    pred = algo.predict(user_id, business_id, r_ui=r_ui, verbose=True)
    pred_list.append(pred)

## Accuracy Metrics
### Root Mean Square Error (RMSE)

> Calculate the RMSE between the predicted rating and the actual rating given by the sample user

In [None]:
# Root-mean-square error over the predictions currently held in pred_list.
# Each prediction is expected to carry the user's actual rating (r_ui)
# alongside the model's estimate (est).
accuracy.rmse(pred_list)

> Classify reviews with 3 or more stars as Recommended/Positive and reviews with less than 3 stars as Not Recommended/Negative <br>
Calculate the Precision, Recall, and f1 scores

### Precision, Recall, and F1 Score

In [None]:
def precision_recall_f1_at_k(predictions, threshold=3):
    """Compute precision, recall and F1 for a collection of predictions.

    A rating counts as "relevant" when the true rating is at or above
    ``threshold`` and as "recommended" when the estimated rating is at or
    above ``threshold``.

    Args:
        predictions: Iterable of prediction tuples laid out as
            ``(uid, iid, r_ui, est, details)``; only positions 2 (true
            rating) and 3 (estimated rating) are read. Predictions whose
            true rating is ``None`` are skipped because they cannot be
            scored.
        threshold: Minimum rating treated as positive. Defaults to 3.

    Returns:
        A ``(precision, recall, f1)`` tuple of fractions in ``[0, 1]``.
        Any metric whose denominator would be zero is reported as 0.
    """
    # Use the argument, not a module-level list, so that the function scores
    # exactly what the caller passes in.
    scored = [(pred[2], pred[3]) for pred in predictions if pred[2] is not None]

    n_rel = sum((true_r >= threshold) for true_r, _ in scored)
    n_rec_k = sum((est >= threshold) for _, est in scored)
    n_rel_and_rec_k = sum(
        ((true_r >= threshold) and (est >= threshold)) for true_r, est in scored
    )

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    # F1 is undefined when both precision and recall are zero; report 0
    # instead of raising ZeroDivisionError.
    denominator = precision + recall
    f1 = 2*(precision*recall)/denominator if denominator != 0 else 0

    return precision, recall, f1

> Precision: Ability of the algorithm to not recommend a restaurant that the user has given a negative review for <br>
Recall: Ability of the algorithm to recommend a restaurant that the user has given a positive review for <br>
f1 score: Harmonic mean of the precision and recall metrics

In [None]:
# Score the predictions in pred_list and print each metric as a percentage
# with two decimal places.
user_precision, user_recall, user_f1 = precision_recall_f1_at_k(pred_list)
for label_template, fraction in (
    ('Precision Score: {0:0.2f}', user_precision),
    ('Recall Score: {0:0.2f}', user_recall),
    ('f1 Score: {0:0.2f}', user_f1),
):
    print(label_template.format(fraction*100))

### Make Recommendations to Sample User
Predictions across all the restaurants in the dataset (even those not visited by the user)

In [None]:
# Predict a score for every restaurant in business_id_dict. When the sample
# user has already rated the restaurant, pass that rating along as r_ui.
pred_list = []
for business_id in business_id_dict:
    reviewed_here = df_new_user['business_id'] == business_id
    if reviewed_here.any():
        # If the user rated the restaurant more than once, use the first rating.
        r_ui = df_new_user.loc[reviewed_here, 'rating'].iloc[0]
        pred = algo.predict(user_id, business_id, r_ui=r_ui, verbose=True)
    else:
        pred = algo.predict(user_id, business_id, verbose=True)
    pred_list.append(pred)

## Recommender Algorithm Set up
> Sort the predicted review scores from high to low <br>
Grab the top 25 review scores and print out the restaurants they belong to

In [None]:
def find_topn(predictions, n, user_id):
    """Print and return the ``n`` highest-rated predictions for one user.

    Args:
        predictions: Iterable of 5-tuples ``(uid, bid, r_ui, est, details)``;
            only the user ID, business ID and estimated rating are used.
        n: Maximum number of recommendations to keep. If the user has fewer
            predictions than ``n``, all of them are listed.
        user_id: User whose recommendations are wanted.

    Returns:
        ``(user_results, top_restaurant_id)``. ``user_results`` is a pandas
        Series named ``user_id``, indexed ``0..k-1``, holding
        ``(business_id, estimate)`` tuples from highest to lowest estimate.
        ``top_restaurant_id`` is the business ID of the first entry.

    Raises:
        KeyError: If ``predictions`` holds no entry for ``user_id``.
    """
    # Only the requested user is reported, so skip everyone else's
    # predictions. sorted() is stable, so equal estimates keep their
    # original relative order.
    ranked = sorted(
        ((bid, est) for uid, bid, _, est, _ in predictions if uid == user_id),
        key=lambda pair: pair[1],
        reverse=True,
    )[:n]
    if not ranked:
        raise KeyError(user_id)

    # dtype=object keeps each (business_id, estimate) tuple as one element.
    user_results = pd.Series(ranked, name=user_id, dtype=object)
    top_restaurant_id = ranked[0][0]

    print('Top {0} recommendations for user {1}:\n'.format(n, get_user_attr(user_id, 'name')))
    print('   Restaurant:                                  Predicted Rating:              Restaurant ID:\n')
    # Iterate over what is actually available rather than range(n), which
    # raised KeyError when fewer than n predictions existed.
    for rank, (bid, est) in enumerate(ranked, start=1):
        print('{:<3d}{:<50s}{:1.2f}{:>40s}'.format(rank, get_business_attr(bid, 'name'), est, bid))

    return user_results, top_restaurant_id

## Final Recommendations for Sample User

### Top 25 Recommendations for Sample User

In [None]:
# Rank the predictions in pred_list for the sample user, print the 25 best,
# and keep the ID of the top-ranked restaurant for the cells that follow.
user_topn, top_restaurant_id = find_topn(pred_list, 25, user_id)

## Top Recommended Restaurant
### General business information

In [None]:
# Dump every stored attribute of the top-ranked restaurant.
print_business_info(top_restaurant_id)

### Restaurant Reviews

In [None]:
# List who reviewed the top-ranked restaurant and the rating each gave.
get_business_reviews(top_restaurant_id)

## Sample User Profile

In [None]:
# Print the sample user's stored profile (the friends list is abbreviated).
print_user_info(user_id)