In [34]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD,NormalPredictor,KNNBasic,CoClustering
from surprise import dump
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV
import surprise as sp
import pandas as pd

In [35]:
df_review = pd.read_csv("review_clean.csv")
df = df_review[['user_id','business_id','stars']]

##only keep users who have reviewed more than two restaurants
filter_users = df['user_id'].value_counts() > 2
filter_users = filter_users[filter_users].index.tolist()
df_new = df[(df['user_id'].isin(filter_users))]

##merge with business df to get the business name
df_b = pd.read_csv("business_clean.csv")
df_b_selected = df_b[['business_id','name']]
df_with_name = df_new.merge(df_b_selected, on='business_id')

In [36]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_new, reader)

## Comparing performance of different methods

In [37]:
result = []

for algorithm in [SVD(), NormalPredictor(), KNNBasic(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    result.append(tmp)
    
pd.DataFrame(result).set_index('Algorithm').sort_values('test_rmse')   

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.07031,0.121615,0.004679
KNNBasic,1.207693,0.015763,0.015894
CoClustering,1.322392,0.092453,0.003898
NormalPredictor,1.531878,0.00207,0.004512


**Method with lowest RMSE is SVD**

## Tuning parameter for SVD

In [38]:
param_grid = {'biased': [True, False], 'lr_all': [0.001, 0.005, 0.01, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
results = gs.cv_results
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df[results_df.mean_test_rmse == results_df.mean_test_rmse.min()]

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_biased,param_lr_all
1,1.126285,1.07883,1.070571,1.091895,0.02455,1,0.900161,0.869127,0.86578,0.878356,0.015479,2,0.113146,0.001958,0.004714,0.000209,"{'biased': True, 'lr_all': 0.005}",True,0.005


In [39]:
param_grid = {'n_epochs': [5, 10, 20, 50], 'n_factors': [1,2,5,10], 'biased': [True], 'lr_all': [0.005]}
gs3 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs3.fit(data)
results3 = gs3.cv_results
results_df3 = pd.DataFrame.from_dict(gs3.cv_results)
results_df3[results_df3.mean_test_rmse == results_df3.mean_test_rmse.min()]

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all
11,1.092934,1.089916,1.078734,1.087195,0.006108,1,0.875794,0.879371,0.869935,0.875033,...,5,0.028747,0.000242,0.004402,5.7e-05,"{'n_epochs': 20, 'n_factors': 10, 'biased': Tr...",20,10,True,0.005


In [40]:
##Model with smallest RMSE
model = SVD(n_epochs = 50,n_factors= 10, biased = True, lr_all = 0.005)

In [41]:
##split the data to training dataset and testing dataset
trainset, testset = train_test_split(data, test_size=0.25)

##test the performance of our model
predictions = model.fit(trainset).test(testset)
sp.accuracy.rmse(predictions)

RMSE: 1.0861


1.0860907936066886

## Checking the detail of prediction

In [42]:
def get_Iu(uid):
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
prediction_good_performance = df.sort_values(by='err')[:10]
prediction_bad_performance = df.sort_values(by='err')[-10:]

In [43]:
prediction_good_performance

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
707,FY_Nz4u3H3B0vODkL_1V4Q,z90h5I91-mFd6CO7uoIy1w,5.0,5.0,{'was_impossible': False},10,12,0.0
65,FY_Nz4u3H3B0vODkL_1V4Q,z90h5I91-mFd6CO7uoIy1w,5.0,5.0,{'was_impossible': False},10,12,0.0
303,nJVS18mJIB1hCHq1YsFbUQ,PSp0P_3zWIQabA5HAIJBMQ,4.0,4.000027,{'was_impossible': False},2,50,2.7e-05
370,1sD6RragJ6NChRi8JovuFA,MyuVJzBb0WYWUU2bOBBDrg,4.0,4.001536,{'was_impossible': False},1,20,0.001536
593,N8ab_C-2nf_xLugVtc9h4g,LhoV1oaRXOm84v8rltyJtw,4.0,4.001999,{'was_impossible': False},2,9,0.001999
516,d1NWwQnhfRrFZQu-8_XD_g,GsXAJVIwVakxaXkCgt4Spg,4.0,4.010044,{'was_impossible': False},3,2,0.010044
267,ztvRQSJ2Be-7TtAYHuMu_w,Q9zAmNeaRjhKJeuijNccjQ,4.0,3.988179,{'was_impossible': False},3,8,0.011821
266,QQ1ZAkJ9SpUumPA-STtVsQ,6yBizH8RnIYXk6vboLk3PA,4.0,3.987162,{'was_impossible': False},2,42,0.012838
345,7WNZAno9lIk3t7QTZjHWbw,J44x_m383C2GWtzj6xG1HA,4.0,3.985185,{'was_impossible': False},4,6,0.014815
231,kJIfzETjVaYSImkg7t3xfQ,NFb4zqgY-P2A_ISUnJQsnw,4.0,3.983908,{'was_impossible': False},1,6,0.016092


In [44]:
prediction_bad_performance

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
792,Ai7ZSET9dbL-h1qUdcyC5Q,C_KrTR_YCKrF_zT88DwJAg,1.0,3.840258,{'was_impossible': False},2,7,2.840258
558,IieVqc6tXSmq7g_rt7PrXg,G1_eRH_fu7VKu87hZFF4jw,1.0,3.868433,{'was_impossible': False},2,1,2.868433
477,I0cGEV1tyCEzNOTrnqrGvg,LsNOYkvWIbpROu-1Xkv8Vg,1.0,3.894763,{'was_impossible': False},1,2,2.894763
361,9NPKooC5kN7qmxiS7B_X5w,t5-_w9Z0rv1xsQl-npVpgQ,1.0,3.937571,{'was_impossible': False},3,7,2.937571
671,WjnbU7FdHgwewazjIHyamQ,aY8pA_-9U97ZAC9bq5BeiQ,1.0,3.943999,{'was_impossible': False},2,6,2.943999
721,IYc1_oJEC7GtpLsHLNHNug,C8rBW19M-8aNnVJyNHxuPw,5.0,1.925397,{'was_impossible': False},2,9,3.074603
723,QxkY9qWDNQbsxSpZbvyPVg,rudH7t7S9VfRdCAeDGBuFQ,1.0,4.138641,{'was_impossible': False},2,0,3.138641
120,XDHxyObeeprcyH5MIY-3gg,2K6HukapaRK8oZnwLHCw5w,1.0,4.26234,{'was_impossible': False},2,10,3.26234
127,d1ct4dfTtfzxJMgScAdElA,6xOoA6XvQw4tIM-lmZPjcw,1.0,4.372,{'was_impossible': False},2,3,3.372
678,n8toAEoAKPLSVfvONetgsg,gKnkxVBk1Tstu5QnBjk86w,1.0,4.378356,{'was_impossible': False},2,1,3.378356


In [83]:
df_with_name[df_with_name.name.isin(["Gonzo At Bar XV","Pokeworks", "Pomodoro", "Cold Beer",'Arepazo Tapas & Wine'])].business_id.unique()

array(['acaHnArsLgDMeeCgvp7cVQ', 'Ie3LnWkmTyUf6zL7tOVP2g',
       'VN4xcfjHWmVCJmKyRw5mww', 'EZFrejOgce-CdeJIit9JAg',
       '7_q1IsPoR25clSoigtGmeg'], dtype=object)

In [96]:
favor.iloc[0, 1] = 'acaHnArsLgDMeeCgvp7cVQ'
favor.iloc[1, 1] = 'Ie3LnWkmTyUf6zL7tOVP2g'
favor.iloc[2, 1] = 'VN4xcfjHWmVCJmKyRw5mww'
favor.iloc[3, 1] = 'EZFrejOgce-CdeJIit9JAg'
favor.iloc[4, 1] = '7_q1IsPoR25clSoigtGmeg'
favor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,uid,iid,rui,est,details
792,test,acaHnArsLgDMeeCgvp7cVQ,5.0,4.913825,{'was_impossible': False}
193,test,Ie3LnWkmTyUf6zL7tOVP2g,5.0,4.91848,{'was_impossible': False}
90,test,VN4xcfjHWmVCJmKyRw5mww,5.0,5.0,{'was_impossible': False}
686,test,EZFrejOgce-CdeJIit9JAg,5.0,5.0,{'was_impossible': False}
1220,test,7_q1IsPoR25clSoigtGmeg,5.0,5.0,{'was_impossible': False}


## Generate a sample user and recommend five restaurant based on his past ratings

**The sample user we create like Athenian Bar & Grill, Starbucks, Applebee's Grill + Bar and Dunkin**

In [95]:
df_b_selected_1 = df_b_selected[(df_b_selected['name'].isin(['Athenian Bar & Grill','Starbucks','''Applebee's Grill + Bar''', 'Dunkin']))]
e = df_b_selected.business_id.unique()
dataframe=pd.DataFrame(e, columns=['a']) 
dataframe['user_id'] = 'test'
dataframe['stars'] = 5
dataframe.at[1, 'stars'] = 2
dataframe.at[3, 'stars'] = 2
dataframe = dataframe.rename(columns={'a': 'business_id'})
df_test = dataframe[dataframe.business_id.isin(['HQ05pwqEn6zx4zIcAvNZzQ','mQYG4rBs_BAd8DkbgCZTtw','Tf09q-_F2CVLLGARmxVFZw','tUQOptBIGrpCi8f6l9QEbw'])]
dataframe
df_new_test = df_new.append(df_test)
df_new_train = df_new.append(dataframe)
data2 = Dataset.load_from_df(df_new_test, reader)
data3 = Dataset.load_from_df(df_new_train, reader)
trainset, testset = train_test_split(data2, test_size=0.25)
trainset2, testset2 = train_test_split(data3, test_size=0.25)
algo = SVD(n_epochs = 50,n_factors= 10, biased = True, lr_all = 0.005)
predictions = algo.fit(trainset).test(testset2)
predictions2_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
my_pred = predictions2_df[predictions2_df.uid == 'test'].sort_values(by = ['est'])
favor = my_pred.tail(5)

## Prediction Result:

In [97]:
print(df_with_name[df_with_name.business_id.isin(favor.iid)].name.unique())

['Arepazo Tapas & Wine' 'Gonzo At Bar XV' 'Pokeworks' 'Cold Beer'
 'Pomodoro']


In [102]:
df_review[df_review.user_id == 'd1ct4dfTtfzxJMgScAdElA'].text[30189]

df_with_name[df_with_name.business_id == "gKnkxVBk1Tstu5QnBjk86w"]

Unnamed: 0,user_id,business_id,stars,name
3423,y3lr3awsJ6BUyPfr6rachg,gKnkxVBk1Tstu5QnBjk86w,5.0,ReelHouse Boston
3424,n8toAEoAKPLSVfvONetgsg,gKnkxVBk1Tstu5QnBjk86w,1.0,ReelHouse Boston
