In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

In [15]:
df_cuisine = pd.read_csv('./inputs/chefmozcuisine.csv')
df_cuisine.head()

Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


In [16]:
df_parking = pd.read_csv('./inputs/chefmozparking.csv')
df_parking.head()
df_parking['parking_lot'].unique()

array(['public', 'none', 'yes', 'valet parking', 'fee', 'street',
       'validated parking'], dtype=object)

In [17]:
df_ratings = pd.read_csv('./inputs/rating_final.csv')

In [18]:
df_geoplaces = pd.read_csv('./inputs/geoplaces2.csv')
df_geoplaces = df_geoplaces[['placeID', 'name', 'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price', 'Rambience']]
df_geoplaces.head()
print(df_geoplaces['alcohol'].unique())
print(df_geoplaces['smoking_area'].unique())
print(df_geoplaces['dress_code'].unique())
print(df_geoplaces['accessibility'].unique())
print(df_geoplaces['price'].unique())
print(df_geoplaces['Rambience'].unique())

['No_Alcohol_Served' 'Wine-Beer' 'Full_Bar']
['none' 'only at bar' 'permitted' 'section' 'not permitted']
['informal' 'casual' 'formal']
['no_accessibility' 'completely' 'partially']
['medium' 'low' 'high']
['familiar' 'quiet']


In [19]:
df_ratings = pd.read_csv('./inputs/rating_final.csv')
df_ratings['overall_rating'] = (df_ratings['rating'] + df_ratings['food_rating'] + df_ratings['service_rating'])
df_ratings.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating
0,U1077,135085,2,2,2,6
1,U1077,135038,2,2,1,5
2,U1077,132825,2,2,2,6
3,U1077,135060,1,2,2,5
4,U1068,135104,1,1,2,4


In [20]:
df = pd.merge(df_ratings, df_geoplaces, on='placeID')
df = pd.merge(df, df_cuisine, on='placeID')
df = pd.merge(df, df_parking, on='placeID')
df.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,name,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,U1077,135085,2,2,2,6,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
1,U1108,135085,1,2,1,4,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
2,U1081,135085,1,2,1,4,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
3,U1056,135085,2,2,2,6,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
4,U1134,135085,2,1,2,5,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public


In [21]:
user_ids = df['userID'].tolist()
user_ids = list(map(lambda id : int(id[1:])-1001, user_ids))

In [22]:
df['userID'] = pd.DataFrame(user_ids) 

In [23]:
place_ids = df['placeID'].tolist()
place_ids.sort()
place_ids = list(map(lambda id : (id-132560), place_ids))

new_place_ids = []
i = -1
seen = []

for place_id in place_ids:
    if place_id not in seen:
        seen.append(place_id)
        i = i+1
    new_place_ids.append(i)

In [24]:
df['placeID'] = pd.DataFrame(new_place_ids) 

In [25]:
# Parking Encoding
df['parking_lot'] = df['parking_lot'].map({'public':2, 'none':0, 'yes':1, 'valet parking': 1, 'fee':1, 'street':1, 'validated parking':1})
df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,name,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,76,0,2,2,2,6,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
1,107,0,1,2,1,4,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
2,80,0,1,2,1,4,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
3,55,0,2,2,2,6,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
4,133,1,2,1,2,5,Tortas Locas Hipocampo,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,60,93,2,2,2,6,tacos los volcanes,No_Alcohol_Served,none,informal,completely,low,quiet,American,0
1039,24,94,1,0,0,1,tacos los volcanes,No_Alcohol_Served,none,informal,completely,low,quiet,American,0
1040,96,94,2,1,1,4,tacos los volcanes,No_Alcohol_Served,none,informal,completely,low,quiet,American,0
1041,95,94,1,2,2,5,tacos los volcanes,No_Alcohol_Served,none,informal,completely,low,quiet,American,0


In [26]:
# Geo Encoding Based on Categories
df['alcohol'] = df['alcohol'].map({'No_Alcohol_Served':0, 'Wine-Beer':1, 'Full_Bar':2})
df['smoking_area'] = df['smoking_area'].map({'none':0, 'only at bar':1, 'permitted':1, 'section':0, 'not permitted':0})
df['dress_code'] = df['dress_code'].map({'informal':0, 'casual':1, 'formal':2})
df['accessibility'] = df['accessibility'].map({'no_accessibility':0, 'completely':1, 'partially':2})
df['price'] = df['price'].map({'medium':0, 'low':1, 'high':2})
df['Rambience'] = df['Rambience'].map({'familiar':0, 'quiet':1})
df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,name,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,76,0,2,2,2,6,Tortas Locas Hipocampo,0,0,0,0,0,0,Fast_Food,2
1,107,0,1,2,1,4,Tortas Locas Hipocampo,0,0,0,0,0,0,Fast_Food,2
2,80,0,1,2,1,4,Tortas Locas Hipocampo,0,0,0,0,0,0,Fast_Food,2
3,55,0,2,2,2,6,Tortas Locas Hipocampo,0,0,0,0,0,0,Fast_Food,2
4,133,1,2,1,2,5,Tortas Locas Hipocampo,0,0,0,0,0,0,Fast_Food,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,60,93,2,2,2,6,tacos los volcanes,0,0,0,1,1,1,American,0
1039,24,94,1,0,0,1,tacos los volcanes,0,0,0,1,1,1,American,0
1040,96,94,2,1,1,4,tacos los volcanes,0,0,0,1,1,1,American,0
1041,95,94,1,2,2,5,tacos los volcanes,0,0,0,1,1,1,American,0


In [27]:
# Rcuisine encoding
df['Rcuisine'] = df['Rcuisine'].map(
    {
        'Spanish':0,
        'Italian':1,
        'Latin_American':2,
        'Mexican': 3,
        'Fast_Food':4,
        'Burgers':5,
        'Dessert-Ice_Cream':6,
        'Hot_Dogs':7,
        'Steaks':8,
        'Asian':9,
        'International':10,
        'Mongolian':11,
        'Vegetarian':12,
        'Brazilian':13,
        'Cafe-Coffee_Shop':14,
        'Cafeteria':15,
        'Contemporary':16,
        'Deli-Sandwiches':17,
        'Diner':18,
        'Japanese':19,
        'Sushi':20,
        'Seafood':21,
        'Chinese':22,
        'Bar':23,
        'Bar_Pub_Brewery':24,
        'Pizzeria':25,
        'Mediterranean':26,
        'American':27,
        'Family':28,
        'Caribbean':29,
        'African':30,
        'Breakfast-Brunch':31,
        'Regional':32,
        'Afghan':33,
        'Bakery':34,
        'Game':35,
        'Armenian':36,
        'Vietnamese':37,
        'Korean':38,
        'Thai':39,
        'Barbecue':40,
        'Polish':41,
        'Dutch-Belgian':42,
        'French':43,
        'German':44,
        'Southwestern':45,
        'Persian':46,
        'Ethiopian':47,
        'Juice':48,
        'Soup':49,
        'Continental-European':50,
        'Greek':51,
        'Southern':52,
        'Eastern_European':53,
        'California':54,
        'Bagels':55,
        'Turkish':56,
        'Organic-Healthy':57,
        'Fine_Dining':58
    }
)
df.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,name,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,76,0,2,2,2,6,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2
1,107,0,1,2,1,4,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2
2,80,0,1,2,1,4,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2
3,55,0,2,2,2,6,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2
4,133,1,2,1,2,5,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2


In [28]:
# generate encoded string
df['encoded_values'] = (df['overall_rating'].map(str)+df['alcohol'].map(str)).map(int)

df['encoded_values'] = (
        df['overall_rating'].map(str) + 
        '.' +
        df['alcohol'].map(str) + 
        df['smoking_area'].map(str) + 
        df['dress_code'].map(str) + 
        df['accessibility'].map(str) + 
        df['price'].map(str) + 
        df['Rambience'].map(str) + 
        df['Rcuisine'].map(str) + 
        df['parking_lot'].map(str)
    ).astype(np.float64)

df.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,name,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot,encoded_values
0,76,0,2,2,2,6,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2,6.0
1,107,0,1,2,1,4,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2,4.0
2,80,0,1,2,1,4,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2,4.0
3,55,0,2,2,2,6,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2,6.0
4,133,1,2,1,2,5,Tortas Locas Hipocampo,0,0,0,0,0,0,4,2,5.0


In [29]:
df = df[['userID', 'placeID', 'encoded_values', 'name']]
df

Unnamed: 0,userID,placeID,encoded_values,name
0,76,0,6.000000,Tortas Locas Hipocampo
1,107,0,4.000000,Tortas Locas Hipocampo
2,80,0,4.000000,Tortas Locas Hipocampo
3,55,0,6.000000,Tortas Locas Hipocampo
4,133,1,5.000000,Tortas Locas Hipocampo
...,...,...,...,...
1038,60,93,6.000111,tacos los volcanes
1039,24,94,1.000111,tacos los volcanes
1040,96,94,4.000111,tacos los volcanes
1041,95,94,5.000111,tacos los volcanes


In [30]:
n_users = df.userID.nunique()
n_items = df.placeID.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Restaurants: '+str(n_items))

Num. of Users: 138
Num of Restaurants: 95


In [31]:
train_data, test_data = train_test_split(df, test_size=0.25)

In [32]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1], line[2]] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1], line[2]] = line[3]

In [33]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [34]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [35]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [36]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [37]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.937338425798904
Item-based CF RMSE: 3.990994400614101
