In [1]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold
from surprise import Reader
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [2]:
df_business = pd.read_json('/home/rohan/Desktop/NYU/SearchAndDiscovery/Project/yelp/yelp_academic_dataset_business.json',lines=True)
df_user = pd.read_json('/home/rohan/Desktop/NYU/SearchAndDiscovery/Project/yelp/yelp_academic_dataset_user.json',lines=True)
df_review = pd.read_json('/home/rohan/Desktop/NYU/SearchAndDiscovery/Project/yelp/yelp_academic_dataset_review.json',lines=True)

In [3]:
cats = ['Restaurant', 'Restaurants', 'Cafes', 'Bakeries', 'Bars', 'Desserts', 'Coffee & Tea', 'Juice Bars & Smoothies', 'Food Trucks', 'Caterers', 'Coffee Roasteries', 'Delicatessen', 'Deli']
df_business = df_business[df_business['categories'].isin(cats)]

In [6]:
df_merged = df_review.merge(df_user, how='inner', left_on = ['user_id'], right_on = ['user_id'])
df_merged = df_merged.merge(df_business, how='inner', left_on = ['business_id'], right_on = ['business_id'])

In [19]:
df_consolidated = df_merged[['review_id','user_id','business_id','name_y','name_x','stars_x']]
df_consolidated.columns = ['review_id','user_id','business_id','Restaurant Name', 'User Name', 'Rating']
df_consolidated

Unnamed: 0,review_id,user_id,business_id,Restaurant Name,User Name,Rating
0,gCn1oTcRPjg0nTNQWlFOgQ,ugr4oqMQxmLBIr9TCSePzQ,d5jY-uA0Zh33oFhENHuEcg,Saigon Kitchen,Joe,5
1,yUu8_Sz2ijF7jhntNTuWzQ,mthqL1PEfyseQ8PLuRs-kA,d5jY-uA0Zh33oFhENHuEcg,Saigon Kitchen,Jane,5
2,0Y-D-F-uk4whpo9bIh7JIA,as0LeBvM5S9MXNrNT1lSbw,d5jY-uA0Zh33oFhENHuEcg,Saigon Kitchen,Jenny,4
3,e8QsEfacsJNHdj_2iS9k6g,lygwv7tz0bPYi1xuXV4DaA,DlCpHyRiobzsbCKKGrQYZA,Restaurant Mangiamo,Jean-claude,5
4,zt3Tu6ISOCPLyRlMHrpkRQ,oEl84Dvg2RNtHs_twyh5SA,hG-okRD-mjpZQheWT9gwBg,Ollies Omelette House,Juan,1
5,pR-tKzS-rMI3NXrRPywCgg,-wV6SMXOVymsA6C6ztlVxw,t5SXjiQlp-0ctHzYES2Juw,Goodlake,Rob,5
6,TgqFFxv8JMwfKbKfE0WzxA,-wV6SMXOVymsA6C6ztlVxw,t5SXjiQlp-0ctHzYES2Juw,Goodlake,Rob,5
7,wQPD5gUDs2Gg_9z5JN9S2A,VgZuko0P1Pf0In9DOKBhpA,U8ypUjl42xwqh5ecQsbptw,Lake Todd Fish Camp,Roseanne,5
8,3R1R1ju6YWLIh3xoxwjqSg,_ixvEpV363vm5cUW52FK7g,U8ypUjl42xwqh5ecQsbptw,Lake Todd Fish Camp,Larry,4
9,RAGwVB3L2wGXSLVWp4yd0A,oWWNBmi5NM39QHU4q03CJA,MaiS2GYrtOGQNOdQAjsXBg,Resto Nouveau Systeme,Jordan,5


In [21]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_consolidated[['user_id', 'business_id', 'Rating']], reader)

# Basic KNN

In [22]:
kf = KFold(n_splits=3)
algo = KNNBasic()
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_pred = predictions

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2809
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9743
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.7854


# KNN with cosine similarity

In [23]:
# Fitting with splits 5
kf = KFold(n_splits=5)
sim_options = {'name':'cosine'}
algo = KNNWithMeans(sim_options = sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse= rmse
        best_pred = predictions
print(best_rmse)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.7073
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8165
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.5831
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.5831
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9000
0.5830951894845299


# Minkowski distance with p=2 (Euclidean)

In [25]:
# Fitting with splits 5
kf = KFold(n_splits=5)
sim_options = {'name':'msd'}
algo = KNNWithMeans(sim_options = sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse= rmse
        best_pred = predictions
print(best_rmse)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0000
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.4699
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.5000
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.5831
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9000
0.5830951894845299


# Pearson coefficient

In [26]:
# Fitting with splits 5
kf = KFold(n_splits=5)
sim_options = {'name':'pearson'}
algo = KNNWithMeans(sim_options = sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse= rmse
        best_pred = predictions
print(best_rmse)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.1814
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0000
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.5000
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.6971
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9000
0.18144368465060587
