In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/'My Drive'/'SearchAndDiscoveryData'

/content/drive/My Drive/SearchAndDiscoveryData


In [3]:
#!/usr/bin/env python
!pip install surprise
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold
from surprise import Reader
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict



In [4]:
df_business = pd.read_json('yelp_academic_dataset_business.json',lines=True)
df_user = pd.read_json('yelp_academic_dataset_user.json',lines=True)
df_review = pd.read_json('yelp_academic_dataset_review.json',lines=True)

In [5]:
cats = ['Restaurant', 'Restaurants', 'Cafes', 'Bakeries', 'Bars', 'Desserts', 'Coffee & Tea', 'Juice Bars & Smoothies', 'Food Trucks', 'Caterers', 'Coffee Roasteries', 'Delicatessen', 'Deli']
df_business = df_business[df_business['categories'].isin(cats)]

In [6]:
df_merged = df_review.merge(df_user, how='inner', left_on = ['user_id'], right_on = ['user_id'])
df_merged = df_merged.merge(df_business, how='inner', left_on = ['business_id'], right_on = ['business_id'])

In [7]:
df_consolidated = df_merged[['review_id','user_id','business_id','name_y','name_x','stars_x']]
df_consolidated.columns = ['review_id','user_id','business_id','Restaurant Name', 'User Name', 'Rating']
df_consolidated

Unnamed: 0,review_id,user_id,business_id,Restaurant Name,User Name,Rating
0,gCn1oTcRPjg0nTNQWlFOgQ,ugr4oqMQxmLBIr9TCSePzQ,d5jY-uA0Zh33oFhENHuEcg,Saigon Kitchen,Joe,5
1,yUu8_Sz2ijF7jhntNTuWzQ,mthqL1PEfyseQ8PLuRs-kA,d5jY-uA0Zh33oFhENHuEcg,Saigon Kitchen,Jane,5
2,0Y-D-F-uk4whpo9bIh7JIA,as0LeBvM5S9MXNrNT1lSbw,d5jY-uA0Zh33oFhENHuEcg,Saigon Kitchen,Jenny,4
3,e8QsEfacsJNHdj_2iS9k6g,lygwv7tz0bPYi1xuXV4DaA,DlCpHyRiobzsbCKKGrQYZA,Restaurant Mangiamo,Jean-claude,5
4,zt3Tu6ISOCPLyRlMHrpkRQ,oEl84Dvg2RNtHs_twyh5SA,hG-okRD-mjpZQheWT9gwBg,Ollies Omelette House,Juan,1
5,pR-tKzS-rMI3NXrRPywCgg,-wV6SMXOVymsA6C6ztlVxw,t5SXjiQlp-0ctHzYES2Juw,Goodlake,Rob,5
6,TgqFFxv8JMwfKbKfE0WzxA,-wV6SMXOVymsA6C6ztlVxw,t5SXjiQlp-0ctHzYES2Juw,Goodlake,Rob,5
7,wQPD5gUDs2Gg_9z5JN9S2A,VgZuko0P1Pf0In9DOKBhpA,U8ypUjl42xwqh5ecQsbptw,Lake Todd Fish Camp,Roseanne,5
8,3R1R1ju6YWLIh3xoxwjqSg,_ixvEpV363vm5cUW52FK7g,U8ypUjl42xwqh5ecQsbptw,Lake Todd Fish Camp,Larry,4
9,RAGwVB3L2wGXSLVWp4yd0A,oWWNBmi5NM39QHU4q03CJA,MaiS2GYrtOGQNOdQAjsXBg,Resto Nouveau Systeme,Jordan,5


In [8]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_consolidated[['user_id', 'business_id', 'Rating']], reader)

In [9]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [10]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [11]:
kf = KFold(n_splits=5)
algo = SVD()

prec_to_ave = []
rec_to_ave = []

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    
    prec_to_ave.append(sum(prec for prec in precisions.values()) / len(precisions))
    rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))

In [12]:
def make_binary_tpr_fpr(predictions, threshold=3.5):
    pred_df = pd.DataFrame(predictions)
    pred_df['r_ui'].where(pred_df['r_ui']>threshold, 1, inplace=True)
    pred_df['r_ui'].where(pred_df['r_ui']<=threshold, 0, inplace=True)
    
    pred_df['est'].where(pred_df['est']>threshold, 1, inplace=True)
    pred_df['est'].where(pred_df['est']<=threshold, 0, inplace=True)

    return pred_df['r_ui'], pred_df['est'], 

true_r, est = make_binary_tpr_fpr(predictions)

In [13]:
precision_average = sum(prec_to_ave)/len(prec_to_ave)
recall_average = sum(rec_to_ave)/len(prec_to_ave)

print("Precision and Recall averages are {0} and {1}, respectively".format(precision_average, recall_average))

Precision and Recall averages are 0.8333333333333333 and 1.0, respectively
