In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

In [42]:
df_cuisine = pd.read_csv('./inputs/chefmozcuisine.csv')
df_cuisine.head()

Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


In [35]:
df_parking = pd.read_csv('./inputs/chefmozparking.csv')
df_parking.head()
df_parking['parking_lot'].unique()

array(['public', 'none', 'yes', 'valet parking', 'fee', 'street',
       'validated parking'], dtype=object)

In [36]:
df_geoplaces = pd.read_csv('./inputs/geoplaces2.csv')
df_geoplaces = df_geoplaces[['placeID', 'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price', 'Rambience']]
df_geoplaces.head()
print(df_geoplaces['alcohol'].unique())
print(df_geoplaces['smoking_area'].unique())
print(df_geoplaces['dress_code'].unique())
print(df_geoplaces['accessibility'].unique())
print(df_geoplaces['price'].unique())
print(df_geoplaces['Rambience'].unique())

['No_Alcohol_Served' 'Wine-Beer' 'Full_Bar']
['none' 'only at bar' 'permitted' 'section' 'not permitted']
['informal' 'casual' 'formal']
['no_accessibility' 'completely' 'partially']
['medium' 'low' 'high']
['familiar' 'quiet']


In [69]:
df_ratings = pd.read_csv('./inputs/rating_final.csv')
df_ratings['overall_rating'] = (df_ratings['rating'] + df_ratings['food_rating'] + df_ratings['service_rating'])
df_ratings.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating
0,U1077,135085,2,2,2,6
1,U1077,135038,2,2,1,5
2,U1077,132825,2,2,2,6
3,U1077,135060,1,2,2,5
4,U1068,135104,1,1,2,4


In [70]:
df = pd.merge(df_ratings, df_geoplaces, on='placeID')
df = pd.merge(df, df_cuisine, on='placeID')
df = pd.merge(df, df_parking, on='placeID')
df.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,U1077,135085,2,2,2,6,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
1,U1108,135085,1,2,1,4,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
2,U1081,135085,1,2,1,4,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
3,U1056,135085,2,2,2,6,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public
4,U1134,135085,2,1,2,5,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,public


In [71]:
# Parking Encoding
df['parking_lot'] = df['parking_lot'].map({'public':2, 'none':0, 'yes':1, 'valet parking': 1, 'fee':1, 'street':1, 'validated parking':1})
df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,U1077,135085,2,2,2,6,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
1,U1108,135085,1,2,1,4,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
2,U1081,135085,1,2,1,4,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
3,U1056,135085,2,2,2,6,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
4,U1134,135085,2,1,2,5,No_Alcohol_Served,not permitted,informal,no_accessibility,medium,familiar,Fast_Food,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,U1061,132958,2,2,2,6,No_Alcohol_Served,none,informal,completely,low,quiet,American,0
1039,U1025,132958,1,0,0,1,No_Alcohol_Served,none,informal,completely,low,quiet,American,0
1040,U1097,132958,2,1,1,4,No_Alcohol_Served,none,informal,completely,low,quiet,American,0
1041,U1096,132958,1,2,2,5,No_Alcohol_Served,none,informal,completely,low,quiet,American,0


In [72]:
# Geo Encoding Based on Categories
df['alcohol'] = df['alcohol'].map({'No_Alcohol_Served':0, 'Wine-Beer':1, 'Full_Bar':2})
df['smoking_area'] = df['smoking_area'].map({'none':0, 'only at bar':1, 'permitted':1, 'section':0, 'not permitted':0})
df['dress_code'] = df['dress_code'].map({'informal':0, 'casual':1, 'formal':2})
df['accessibility'] = df['accessibility'].map({'no_accessibility':0, 'completely':1, 'partially':2})
df['price'] = df['price'].map({'medium':0, 'low':1, 'high':2})
df['Rambience'] = df['Rambience'].map({'familiar':0, 'quiet':1})
df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,U1077,135085,2,2,2,6,0,0,0,0,0,0,Fast_Food,2
1,U1108,135085,1,2,1,4,0,0,0,0,0,0,Fast_Food,2
2,U1081,135085,1,2,1,4,0,0,0,0,0,0,Fast_Food,2
3,U1056,135085,2,2,2,6,0,0,0,0,0,0,Fast_Food,2
4,U1134,135085,2,1,2,5,0,0,0,0,0,0,Fast_Food,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,U1061,132958,2,2,2,6,0,0,0,1,1,1,American,0
1039,U1025,132958,1,0,0,1,0,0,0,1,1,1,American,0
1040,U1097,132958,2,1,1,4,0,0,0,1,1,1,American,0
1041,U1096,132958,1,2,2,5,0,0,0,1,1,1,American,0


In [73]:
# Rcuisine encoding
df['Rcuisine'] = df['Rcuisine'].map(
    {
        'Spanish':0,
        'Italian':1,
        'Latin_American':2,
        'Mexican': 3,
        'Fast_Food':4,
        'Burgers':5,
        'Dessert-Ice_Cream':6,
        'Hot_Dogs':7,
        'Steaks':8,
        'Asian':9,
        'International':10,
        'Mongolian':11,
        'Vegetarian':12,
        'Brazilian':13,
        'Cafe-Coffee_Shop':14,
        'Cafeteria':15,
        'Contemporary':16,
        'Deli-Sandwiches':17,
        'Diner':18,
        'Japanese':19,
        'Sushi':20,
        'Seafood':21,
        'Chinese':22,
        'Bar':23,
        'Bar_Pub_Brewery':24,
        'Pizzeria':25,
        'Mediterranean':26,
        'American':27,
        'Family':28,
        'Caribbean':29,
        'African':30,
        'Breakfast-Brunch':31,
        'Regional':32,
        'Afghan':33,
        'Bakery':34,
        'Game':35,
        'Armenian':36,
        'Vietnamese':37,
        'Korean':38,
        'Thai':39,
        'Barbecue':40,
        'Polish':41,
        'Dutch-Belgian':42,
        'French':43,
        'German':44,
        'Southwestern':45,
        'Persian':46,
        'Ethiopian':47,
        'Juice':48,
        'Soup':49,
        'Continental-European':50,
        'Greek':51,
        'Southern':52,
        'Eastern_European':53,
        'California':54,
        'Bagels':55,
        'Turkish':56,
        'Organic-Healthy':57,
        'Fine_Dining':58
    }
)
df.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot
0,U1077,135085,2,2,2,6,0,0,0,0,0,0,4,2
1,U1108,135085,1,2,1,4,0,0,0,0,0,0,4,2
2,U1081,135085,1,2,1,4,0,0,0,0,0,0,4,2
3,U1056,135085,2,2,2,6,0,0,0,0,0,0,4,2
4,U1134,135085,2,1,2,5,0,0,0,0,0,0,4,2


In [75]:
# generate encoded string
df['encoded_values'] = (df['overall_rating'].map(str)+df['alcohol'].map(str)).map(int)



df['encoded_values'] = (
        df['overall_rating'].map(str) + 
        df['alcohol'].map(str) + 
        df['smoking_area'].map(str) + 
        df['dress_code'].map(str) + 
        df['accessibility'].map(str) + 
        df['price'].map(str) + 
        df['Rambience'].map(str) + 
        df['Rcuisine'].map(str) + 
        df['parking_lot'].map(str)
    ).map(int)
df.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,overall_rating,alcohol,smoking_area,dress_code,accessibility,price,Rambience,Rcuisine,parking_lot,encoded_values
0,U1077,135085,2,2,2,6,0,0,0,0,0,0,4,2,600000042
1,U1108,135085,1,2,1,4,0,0,0,0,0,0,4,2,400000042
2,U1081,135085,1,2,1,4,0,0,0,0,0,0,4,2,400000042
3,U1056,135085,2,2,2,6,0,0,0,0,0,0,4,2,600000042
4,U1134,135085,2,1,2,5,0,0,0,0,0,0,4,2,500000042


In [11]:
train_data, test_data = train_test_split(df, test_size=0.25)
restmatrix_train_data = train_data.pivot_table(index='userID',columns='name',values='overall_rating')
restmatrix_test_data = test_data.pivot_table(index='userID',columns='name',values='overall_rating')

restmatrix = df.pivot_table(index='userID',columns='name',values='overall_rating')
restmatrix

name,Abondance Restaurante Bar,Cabana Huasteca,Cafe Chaires,Cafeteria cenidet,Cafeteria y Restaurant El Pacifico,Carls Jr,Carreton de Flautas y Migadas,Cenaduria El Rincón de Tlaquepaque,Chaires,Dominos Pizza,...,puesto de gorditas,puesto de tacos,rockabilly,shi ro ie,tacos abi,tacos de barbacoa enfrente del Tec,tacos de la estacion,tacos los volcanes,tortas hawai,vips
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,,1.666667,,,,,,,,
U1002,,,,,,,,,,,...,,1.333333,,,,,,,,
U1003,,2.000000,,,,,,,,,...,,1.666667,1.666667,,,,,,,
U1004,,,,,2.000000,,,,,,...,,,,,,,,2.0,,
U1005,,,,,1.666667,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U1134,,1.333333,0.0,,,,,,,,...,,1.333333,,,,,,,,
U1135,,,,,0.000000,,,,,,...,,0.000000,,,,,,,,
U1136,,,,,,,,,,,...,,,,,,,,2.0,,
U1137,,,,,,,,,,,...,,1.666667,,,,,,,,


In [17]:
ratings.sort_values('number of ratings', ascending=False).head(10)

Unnamed: 0_level_0,overall_rating,number of ratings
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Tortas Locas Hipocampo,1.324074,36
puesto de tacos,1.1875,32
Cafeteria y Restaurant El Pacifico,1.107143,28
La Cantina Restaurante,1.2,25
Gorditas Doa Gloria,1.0,25
Restaurant la Chalita,1.111111,24
Restaurante Marisco Sam,1.151515,22
Restaurante El Cielo Potosino,1.190476,21
Restaurant Oriental Express,1.216667,20
La Posada del Virrey,1.407407,18


In [18]:
train_data_RMSE = train_data.pivot_table(index='userID',columns='name',values='overall_rating')
test_data_RMSE = test_data.pivot_table(index='userID',columns='name',values='overall_rating')
train_data_RMSE.fillna(0, inplace=True)
test_data_RMSE.fillna(0, inplace=True)

train_data_RMSE = train_data_RMSE.values
test_data_RMSE = test_data_RMSE.values

print(train_data_RMSE)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 2. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
print(test_data_RMSE)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [20]:
item_similarity = pairwise_distances(train_data_RMSE.T, metric='cosine')

In [21]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [23]:
item_prediction = predict(train_data_RMSE, item_similarity, type='item')

In [24]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [25]:
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_RMSE)))

Item-based CF RMSE: 1.4589627235441878
