## Read Data

In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/'My Drive'/'SearchAndDiscoveryData'

/content/drive/My Drive/SearchAndDiscoveryData


## Imports

In [3]:
#!/usr/bin/env python
!pip install surprise
!pip install lenskit
import pandas as pd
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold, PredefinedKFold, GridSearchCV
from surprise import Reader
from lenskit.crossfold import partition_users, SampleFrac
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict



## Read and filter data

In [4]:
df = pd.read_csv('df_yelp_200K.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.head()

Unnamed: 0,review_id,user_id,business_id,review_stars,useful_x,funny_x,cool_x,text_x,date,Unnamed: 0_x,name_x,review_count_x,yelping_since,useful_y,funny_y,cool_y,elite,friends,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,Unnamed: 0_y,name_y,address,city,state,postal_code,latitude,longitude,stars,review_count_y,is_open,attributes,categories,hours
0,G1rbhEyMUVnFIq1kAT8jCg,nIJD_7ZXHq-FX8byPMOkMQ,J8GuDrqeLH1xMpSpeEixYA,2.0,2.0,1.0,0.0,I don't get all the hype. This place was about...,2014-05-25 15:43:01,28831,Nicole,36,2008-09-19 23:34:49,46,24,13,,"B5vnnBub9sscTix_tPAwUw, FKFWX9kiyTvJY8_P9j_Rmw...",0,2.89,0,0,0,1,0,3,2,0,0,0,0,207023,Biscuits,1235 N Gilbert Rd,Gilbert,AZ,85234,33.372971,-111.78946,4.0,291,1,"{'Caters': 'False', 'GoodForKids': 'True', 'Bi...","Restaurants, Breakfast & Brunch","{'Monday': '7:0-13:0', 'Tuesday': '7:0-13:0', ..."
1,OtH9uCUr5e2Kz12F1LboXA,pu_AQig2fw40PshvtgONPQ,J8GuDrqeLH1xMpSpeEixYA,2.0,7.0,2.0,2.0,"Yes, the biscuits are delicious enough to repr...",2013-07-22 00:00:20,10457,Stephanie,471,2010-02-13 01:05:36,1433,517,591,2012201320142015201620172018,"SCo1UBoeN3bhRMkSYuiX1A, rA97zeqOrUYuEM69n0Xn4w...",46,3.66,26,8,2,4,0,22,55,58,58,28,6,207023,Biscuits,1235 N Gilbert Rd,Gilbert,AZ,85234,33.372971,-111.78946,4.0,291,1,"{'Caters': 'False', 'GoodForKids': 'True', 'Bi...","Restaurants, Breakfast & Brunch","{'Monday': '7:0-13:0', 'Tuesday': '7:0-13:0', ..."
2,UcQhHkWR8YfIOWeLzjflNA,xUhKi3p2BRTEbTHtJz-Hhg,J8GuDrqeLH1xMpSpeEixYA,4.0,0.0,1.0,0.0,I'm... Kind of confused. This Biscuits isn't p...,2017-10-13 20:04:10,49232,S.Ellen,59,2011-10-08 00:06:58,113,25,10,,"FHAVmMKWq6sJoiiUXxHGsA, TSpAeuGiXrzTXpSdk35CBw...",3,3.56,0,0,0,0,0,0,3,0,0,0,0,207023,Biscuits,1235 N Gilbert Rd,Gilbert,AZ,85234,33.372971,-111.78946,4.0,291,1,"{'Caters': 'False', 'GoodForKids': 'True', 'Bi...","Restaurants, Breakfast & Brunch","{'Monday': '7:0-13:0', 'Tuesday': '7:0-13:0', ..."
3,IJj3by6GI9jV8d5wI46Taw,Vv1jaDcK9fdaqGwODB0AcA,J8GuDrqeLH1xMpSpeEixYA,3.0,0.0,0.0,0.0,Decent southern food. Great Biscuits!! I woul...,2013-08-07 01:47:22,24472,Bruce,227,2010-12-26 17:06:24,150,35,42,2018,"klIN1493a7XioyRXqZHBaQ, thaI8-KSPoMXQe3lOrUA6g...",2,3.52,0,1,0,0,0,3,8,4,4,4,0,207023,Biscuits,1235 N Gilbert Rd,Gilbert,AZ,85234,33.372971,-111.78946,4.0,291,1,"{'Caters': 'False', 'GoodForKids': 'True', 'Bi...","Restaurants, Breakfast & Brunch","{'Monday': '7:0-13:0', 'Tuesday': '7:0-13:0', ..."
4,ynCKeNNCL76ws3W3AtbAKQ,ZefHQwjj3b7mxi46oKik-g,J8GuDrqeLH1xMpSpeEixYA,3.0,0.0,0.0,0.0,I woke up this morning dreaming of biscuits an...,2016-08-06 16:07:00,88838,Patrick,100,2015-03-12 21:53:16,23,15,11,,"SArE5Yd9m6oeCvr6AUOsRw, jjIzxPqFMLsUccPG2Fuvog...",0,4.08,0,0,0,0,0,1,0,0,0,0,0,207023,Biscuits,1235 N Gilbert Rd,Gilbert,AZ,85234,33.372971,-111.78946,4.0,291,1,"{'Caters': 'False', 'GoodForKids': 'True', 'Bi...","Restaurants, Breakfast & Brunch","{'Monday': '7:0-13:0', 'Tuesday': '7:0-13:0', ..."


In [6]:
cats = ['Restaurant', 'Restaurants', 'Cafes', 'Bakeries', 'Bars', 'Desserts', 'Coffee & Tea', 'Juice Bars & Smoothies', 'Food Trucks', 'Caterers', 'Coffee Roasteries', 'Delicatessen', 'Deli']
df_food = df[df['categories'].str.contains('|'.join(cats), regex=True, na=False)]

In [7]:
df_consolidated = df_food[['review_id','user_id','business_id','name_y','name_x','review_stars']]
df_consolidated.columns = ['review_id','user','item','Restaurant Name', 'User Name', 'rating']
df_consolidated

Unnamed: 0,review_id,user,item,Restaurant Name,User Name,rating
0,G1rbhEyMUVnFIq1kAT8jCg,nIJD_7ZXHq-FX8byPMOkMQ,J8GuDrqeLH1xMpSpeEixYA,Biscuits,Nicole,2.0
1,OtH9uCUr5e2Kz12F1LboXA,pu_AQig2fw40PshvtgONPQ,J8GuDrqeLH1xMpSpeEixYA,Biscuits,Stephanie,2.0
2,UcQhHkWR8YfIOWeLzjflNA,xUhKi3p2BRTEbTHtJz-Hhg,J8GuDrqeLH1xMpSpeEixYA,Biscuits,S.Ellen,4.0
3,IJj3by6GI9jV8d5wI46Taw,Vv1jaDcK9fdaqGwODB0AcA,J8GuDrqeLH1xMpSpeEixYA,Biscuits,Bruce,3.0
4,ynCKeNNCL76ws3W3AtbAKQ,ZefHQwjj3b7mxi46oKik-g,J8GuDrqeLH1xMpSpeEixYA,Biscuits,Patrick,3.0
...,...,...,...,...,...,...
231249,_1YbYZ2GJZlKrDu6EFVtAw,LLIIiAToOMcywsSXQ_GUFA,PHcUGebard-dEWiCsj1n3A,More Than Subs,Jeff,4.0
231250,1DJ7xjpWtiLMM-LZPn1RpA,UESlXDRD_p0xNRwsX_5DLA,PHcUGebard-dEWiCsj1n3A,More Than Subs,Kevin,5.0
231251,hVU5SlPgCFcsvAQpJeGSNA,BiZGZ1Eqgk6dYu4HqXVZPQ,dWAeacCm6mpR_fQ5qb8ohA,Super Carniceria Los Alamos,Valeria,3.0
231252,4B4L9maUCxOFq_7tdXE9dw,fIXypw8NNwkuZVQ6TfzPnA,dWAeacCm6mpR_fQ5qb8ohA,Super Carniceria Los Alamos,Gabriel,5.0


## Method to calculate user-based precision and recall

In [8]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

## Variable declaration

In [9]:
N_SPLITS = 5
FRAC_SPLIT = 0.2
seed = 111
train_test_files = []

## K-fold cross validation

In [10]:
i=1
data = df_consolidated[['user', 'item', 'rating']]

for train_val, test in partition_users(data, N_SPLITS, SampleFrac(FRAC_SPLIT)):
    train_val_file = "train_val_"+str(i)+".csv"
    train_val.to_csv(train_val_file, index=False, header=False)
    test_file = "test_"+str(i)+".csv"
    test.to_csv(test_file, index=False, header=False)
    train_test_files.append((train_val_file, test_file))
    i+=1

In [10]:
for i in range(1, N_SPLITS+1):
    train_val_file = "train_val_"+str(i)+".csv"
    test_file = "test_"+str(i)+".csv"
    train_test_files.append((train_val_file, test_file))

## Grid Search for hyperparameter optimization on SVD

In [11]:
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_folds(train_test_files, reader=reader)
pkf = PredefinedKFold()
param_grid = {'n_epochs': [5, 10, 20, 30], 'lr_all': [0.001, 0.005, 0.01, 0.1],
              'reg_all': [0.02, 0.2, 2], 'random_state': [seed]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=pkf)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.0609247453854116
{'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2, 'random_state': 111}


## Store results to CSV file

In [12]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results = results_df[['mean_test_rmse', 'std_test_rmse', 'mean_test_mae', 'std_test_mae', 'param_n_epochs', 'param_lr_all', 'param_reg_all']]
results.to_csv("SVD_results.csv")

## Grid Search for hyperparameter optimization on SVD++

In [13]:
# reader = Reader(line_format='user item rating', sep=',')
# data = Dataset.load_from_folds(train_test_files, reader=reader)
# pkf = PredefinedKFold()
# param_grid = {'n_epochs': [5, 10, 20, 30], 'lr_all': [0.001, 0.005, 0.01, 0.1],
#               'reg_all': [0.02, 0.2, 2], 'random_state': [seed]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=pkf)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.0610234810135561
{'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2, 'random_state': 111}


## Store results to CSV file

In [14]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results = results_df[['mean_test_rmse', 'std_test_rmse', 'mean_test_mae', 'std_test_mae', 'param_n_epochs', 'param_lr_all', 'param_reg_all']]
results.to_csv("SVDpp_results.csv")

## SVD with tuned hyperparameters

In [15]:
prec_to_ave = []
rec_to_ave = []
rmse_to_ave = []
mae_to_ave = []

reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_folds(train_test_files, reader=reader)
pkf = PredefinedKFold()
algo = SVD(n_epochs=30, reg_all=0.2, random_state=seed)
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Precision and recall can then be averaged over all users
    prec_to_ave.append(sum(prec for prec in precisions.values()) / len(precisions))
    rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))
    rmse_to_ave.append(accuracy.rmse(predictions))
    mae_to_ave.append(accuracy.mae(predictions))

RMSE: 1.0659
MAE:  0.8389
RMSE: 1.0778
MAE:  0.8541
RMSE: 1.0571
MAE:  0.8382
RMSE: 1.0539
MAE:  0.8341
RMSE: 1.0499
MAE:  0.8232


## Metrics

In [16]:
precision_average = sum(prec_to_ave)/len(prec_to_ave)
recall_average = sum(rec_to_ave)/len(prec_to_ave)
rmse_average = sum(rmse_to_ave)/len(rmse_to_ave)
mae_average = sum(mae_to_ave)/len(mae_to_ave)

print("Precision and Recall averages are {0} and {1}, respectively".format(precision_average, recall_average))
print("RMSE and MAE averages are {0} and {1}, respectively".format(rmse_average, mae_average))

Precision and Recall averages are 0.9284994304259391 and 0.625111040117248, respectively
RMSE and MAE averages are 1.0609247453854116 and 0.8377018748562264, respectively


## SVD++ with tuned hyperparameters

In [11]:
prec_to_ave = []
rec_to_ave = []
rmse_to_ave = []
mae_to_ave = []
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_folds(train_test_files, reader=reader)
pkf = PredefinedKFold()
algo = SVDpp(n_epochs=30, reg_all=0.2, random_state=seed)
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Precision and recall can then be averaged over all users
    prec_to_ave.append(sum(prec for prec in precisions.values()) / len(precisions))
    rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))
    rmse_to_ave.append(accuracy.rmse(predictions))
    mae_to_ave.append(accuracy.mae(predictions))

RMSE: 1.0677
MAE:  0.8377
RMSE: 1.0782
MAE:  0.8525
RMSE: 1.0609
MAE:  0.8397
RMSE: 1.0563
MAE:  0.8329
RMSE: 1.0532
MAE:  0.8244


## Metrics

In [12]:
precision_average = sum(prec_to_ave)/len(prec_to_ave)
recall_average = sum(rec_to_ave)/len(prec_to_ave)
rmse_average = sum(rmse_to_ave)/len(rmse_to_ave)
mae_average = sum(mae_to_ave)/len(mae_to_ave)

print("Precision and Recall averages are {0} and {1}, respectively".format(precision_average, recall_average))
print("RMSE and MAE averages are {0} and {1}, respectively".format(rmse_average, mae_average))

Precision and Recall averages are 0.926950745541626 and 0.6287460302504606, respectively
RMSE and MAE averages are 1.0632564542910088 and 0.8374465187809921, respectively


In [32]:
def make_binary_tpr_fpr(predictions, threshold=3.5):
    pred_df = pd.DataFrame(predictions)
    pred_df['r_ui'].where(pred_df['r_ui']>threshold, 1, inplace=True)
    pred_df['r_ui'].where(pred_df['r_ui']<=threshold, 0, inplace=True)
    
    pred_df['est'].where(pred_df['est']>threshold, 1, inplace=True)
    pred_df['est'].where(pred_df['est']<=threshold, 0, inplace=True)

    return pred_df['r_ui'], pred_df['est'],

true_r, est = make_binary_tpr_fpr(predictions)