## Read Data

In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/'My Drive'/'SearchAndDiscoveryData'

/content/drive/My Drive/SearchAndDiscoveryData


## Imports

In [3]:
#!/usr/bin/env python
!pip install surprise
!pip install lenskit
import pandas as pd
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold, PredefinedKFold, GridSearchCV
from surprise import Reader
from lenskit.crossfold import partition_users, SampleFrac
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict



## Read and filter data

### The data has columns for the number of friends of a user, and whether a restaurant is good for groups.

### Hypothesis: Restaurants that are rated good for groups will be visited by users with number of friends > 5.

### Procedure:
1. We carry out user-based train-test splitting.
2. We pre-filter the train set based on ```RestaurantsGoodForGroups==True``` to only include restaurants which are rated to be good for groups (might have larger seating).
3. Train SVD and SVD++ models on this filtered training set - carry out hyperparameter optimization using grid search.
4. Filter the test set to only include users with number of ``` friends > 5 ```
5. Report MAE, RMSE, Precision and Recall on the filtered test subset.

In [4]:
df = pd.read_csv('contextual_data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,review_id,user_id,Friends,Restaurant Name,Rating,GoodForKids,BikeParking,RestaurantsPriceRange2,WiFi,RestaurantsGoodForGroups,RestaurantsAttire,RestaurantsTakeOut,Alcohol,OutdoorSeating,NoiseLevel,HasTV,romantic,intimate,classy,upscale,casual
0,0,J8GuDrqeLH1xMpSpeEixYA,G1rbhEyMUVnFIq1kAT8jCg,nIJD_7ZXHq-FX8byPMOkMQ,25,Biscuits,2.0,True,True,1,no,True,casual,True,,False,average,False,False,False,False,False,True
1,1,J8GuDrqeLH1xMpSpeEixYA,OtH9uCUr5e2Kz12F1LboXA,pu_AQig2fw40PshvtgONPQ,425,Biscuits,2.0,True,True,1,no,True,casual,True,,False,average,False,False,False,False,False,True
2,2,J8GuDrqeLH1xMpSpeEixYA,UcQhHkWR8YfIOWeLzjflNA,xUhKi3p2BRTEbTHtJz-Hhg,14,Biscuits,4.0,True,True,1,no,True,casual,True,,False,average,False,False,False,False,False,True
3,3,J8GuDrqeLH1xMpSpeEixYA,IJj3by6GI9jV8d5wI46Taw,Vv1jaDcK9fdaqGwODB0AcA,20,Biscuits,3.0,True,True,1,no,True,casual,True,,False,average,False,False,False,False,False,True
4,4,J8GuDrqeLH1xMpSpeEixYA,ynCKeNNCL76ws3W3AtbAKQ,ZefHQwjj3b7mxi46oKik-g,330,Biscuits,3.0,True,True,1,no,True,casual,True,,False,average,False,False,False,False,False,True


In [6]:
df_consolidated = df[['review_id','user_id','business_id','Restaurant Name','Rating', 'Friends', 'RestaurantsGoodForGroups']]
df_consolidated.columns = ['review_id','user','item','Restaurant Name', 'rating', 'Friends', 'RestaurantsGoodForGroups']
print(df_consolidated.shape)
# df_consolidated = df_consolidated[df_consolidated['RestaurantsGoodForGroups']==True]
# print(df_consolidated.shape)
df_consolidated

(213378, 7)


Unnamed: 0,review_id,user,item,Restaurant Name,rating,Friends,RestaurantsGoodForGroups
0,G1rbhEyMUVnFIq1kAT8jCg,nIJD_7ZXHq-FX8byPMOkMQ,J8GuDrqeLH1xMpSpeEixYA,Biscuits,2.0,25,True
1,OtH9uCUr5e2Kz12F1LboXA,pu_AQig2fw40PshvtgONPQ,J8GuDrqeLH1xMpSpeEixYA,Biscuits,2.0,425,True
2,UcQhHkWR8YfIOWeLzjflNA,xUhKi3p2BRTEbTHtJz-Hhg,J8GuDrqeLH1xMpSpeEixYA,Biscuits,4.0,14,True
3,IJj3by6GI9jV8d5wI46Taw,Vv1jaDcK9fdaqGwODB0AcA,J8GuDrqeLH1xMpSpeEixYA,Biscuits,3.0,20,True
4,ynCKeNNCL76ws3W3AtbAKQ,ZefHQwjj3b7mxi46oKik-g,J8GuDrqeLH1xMpSpeEixYA,Biscuits,3.0,330,True
...,...,...,...,...,...,...,...
213373,BQ7dFc-3vGsqZOvm9WHxCg,pdUDahfWzMY8Y91lmxxvOQ,chM2ywVOhmrKSI2kopKB3A,Wendy's,3.0,708,
213374,hfQ-I810mjdU0wwwUZdRWw,PigmfOVw58sLqdUDLZXT-A,chM2ywVOhmrKSI2kopKB3A,Wendy's,1.0,1,
213375,0g8hTWBptv0POpXUtyADRA,M6MyQzKjOBYZVwaZzMl5FQ,91lHqPZKsySetYu8xWo2GQ,Kouzina,5.0,27,
213376,_kuCov_gzBFV34Oe7d9sCw,jP7KB6uLQ6WeCFYfJYDjKw,91lHqPZKsySetYu8xWo2GQ,Kouzina,5.0,1,


## Method to calculate user-based precision and recall

In [7]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

## Variable declaration

In [8]:
N_SPLITS = 5
FRAC_SPLIT = 0.2
seed = 111
train_test_files = []

## K-fold cross validation

In [16]:
i=1
data = df_consolidated

for train_val, test in partition_users(data, N_SPLITS, SampleFrac(FRAC_SPLIT)):
    train_val_file = "train_val_"+str(i)+".csv"
    train_val = train_val[train_val['RestaurantsGoodForGroups']==True]
    train_val = train_val[['user', 'item', 'rating']]
    train_val.to_csv(train_val_file, index=False, header=False)
    test_file = "test_"+str(i)+".csv"
    test = test[test["Friends"] >= 5]
    test = test[['user', 'item', 'rating']]
    test.to_csv(test_file, index=False, header=False)
    train_test_files.append((train_val_file, test_file))
    i+=1

In [None]:
for i in range(1, N_SPLITS+1):
    train_val_file = "train_val_"+str(i)+".csv"
    test_file = "test_"+str(i)+".csv"
    train_test_files.append((train_val_file, test_file))

## Grid Search for hyperparameter optimization on SVD

In [17]:
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_folds(train_test_files, reader=reader)
pkf = PredefinedKFold()
param_grid = {'n_epochs': [5, 10, 20, 30], 'lr_all': [0.001, 0.005, 0.01, 0.1],
              'reg_all': [0.02, 0.2, 2], 'random_state': [seed]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=pkf)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.0770548032491352
{'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2, 'random_state': 111}


## Store results to CSV file

In [18]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results = results_df[['mean_test_rmse', 'std_test_rmse', 'mean_test_mae', 'std_test_mae', 'param_n_epochs', 'param_lr_all', 'param_reg_all']]
results.to_csv("SVD_filtered_results.csv")

## Grid Search for hyperparameter optimization on SVD++

In [19]:
# reader = Reader(line_format='user item rating', sep=',')
# data = Dataset.load_from_folds(train_test_files, reader=reader)
# pkf = PredefinedKFold()
# param_grid = {'n_epochs': [5, 10, 20, 30], 'lr_all': [0.001, 0.005, 0.01, 0.1],
#               'reg_all': [0.02, 0.2, 2], 'random_state': [seed]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=pkf)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.0754327658908416
{'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2, 'random_state': 111}


## Store results to CSV file

In [20]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results = results_df[['mean_test_rmse', 'std_test_rmse', 'mean_test_mae', 'std_test_mae', 'param_n_epochs', 'param_lr_all', 'param_reg_all']]
results.to_csv("SVDpp_filtered_results.csv")

## SVD with tuned hyperparameters

In [21]:
prec_to_ave = []
rec_to_ave = []
rmse_to_ave = []
mae_to_ave = []

reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_folds(train_test_files, reader=reader)
pkf = PredefinedKFold()
algo = SVD(n_epochs=30, reg_all=0.2, random_state=seed)
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Precision and recall can then be averaged over all users
    prec_to_ave.append(sum(prec for prec in precisions.values()) / len(precisions))
    rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))
    rmse_to_ave.append(accuracy.rmse(predictions))
    mae_to_ave.append(accuracy.mae(predictions))

RMSE: 1.1103
MAE:  0.8634
RMSE: 1.0358
MAE:  0.8081
RMSE: 1.0609
MAE:  0.8268
RMSE: 1.1033
MAE:  0.8663
RMSE: 1.0749
MAE:  0.8359
RMSE: 1.1103
MAE:  0.8634
RMSE: 1.0358
MAE:  0.8081
RMSE: 1.0609
MAE:  0.8268
RMSE: 1.1033
MAE:  0.8663
RMSE: 1.0749
MAE:  0.8359


## Metrics

In [22]:
precision_average = sum(prec_to_ave)/len(prec_to_ave)
recall_average = sum(rec_to_ave)/len(prec_to_ave)
rmse_average = sum(rmse_to_ave)/len(rmse_to_ave)
mae_average = sum(mae_to_ave)/len(mae_to_ave)

print("Precision and Recall averages are {0} and {1}, respectively".format(precision_average, recall_average))
print("RMSE and MAE averages are {0} and {1}, respectively".format(rmse_average, mae_average))

Precision and Recall averages are 0.9299364874705051 and 0.5718002366149364, respectively
RMSE and MAE averages are 1.0770548032491352 and 0.8400912628202726, respectively


## SVD++ with tuned hyperparameters

In [23]:
prec_to_ave = []
rec_to_ave = []
rmse_to_ave = []
mae_to_ave = []
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_folds(train_test_files, reader=reader)
pkf = PredefinedKFold()
algo = SVDpp(n_epochs=30, reg_all=0.2, random_state=seed)
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Precision and recall can then be averaged over all users
    prec_to_ave.append(sum(prec for prec in precisions.values()) / len(precisions))
    rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))
    rmse_to_ave.append(accuracy.rmse(predictions))
    mae_to_ave.append(accuracy.mae(predictions))

RMSE: 1.1124
MAE:  0.8643
RMSE: 1.0376
MAE:  0.8084
RMSE: 1.0633
MAE:  0.8274
RMSE: 1.1022
MAE:  0.8653
RMSE: 1.0786
MAE:  0.8387
RMSE: 1.1124
MAE:  0.8643
RMSE: 1.0376
MAE:  0.8084
RMSE: 1.0633
MAE:  0.8274
RMSE: 1.1022
MAE:  0.8653
RMSE: 1.0786
MAE:  0.8387


## Metrics

In [24]:
precision_average = sum(prec_to_ave)/len(prec_to_ave)
recall_average = sum(rec_to_ave)/len(prec_to_ave)
rmse_average = sum(rmse_to_ave)/len(rmse_to_ave)
mae_average = sum(mae_to_ave)/len(mae_to_ave)

print("Precision and Recall averages are {0} and {1}, respectively".format(precision_average, recall_average))
print("RMSE and MAE averages are {0} and {1}, respectively".format(rmse_average, mae_average))

Precision and Recall averages are 0.9202020999468304 and 0.5877478952522146, respectively
RMSE and MAE averages are 1.0788141789637655 and 0.8408337642685382, respectively


In [None]:
def make_binary_tpr_fpr(predictions, threshold=3.5):
    pred_df = pd.DataFrame(predictions)
    pred_df['r_ui'].where(pred_df['r_ui']>threshold, 1, inplace=True)
    pred_df['r_ui'].where(pred_df['r_ui']<=threshold, 0, inplace=True)
    
    pred_df['est'].where(pred_df['est']>threshold, 1, inplace=True)
    pred_df['est'].where(pred_df['est']<=threshold, 0, inplace=True)

    return pred_df['r_ui'], pred_df['est'],

true_r, est = make_binary_tpr_fpr(predictions)