In [120]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

In [121]:
df_ratings = pd.read_csv('./inputs/rating_final.csv')

In [122]:
df_geoplaces = pd.read_csv('./inputs/geoplaces2.csv')

In [123]:
df_ratings['overall_rating'] = (df_ratings['rating'] + df_ratings['food_rating'] + df_ratings['service_rating'])/3

In [124]:
df = pd.merge(df_ratings, df_geoplaces, on='placeID')

In [125]:
user_ids = df['userID'].tolist()
user_ids = list(map(lambda id : int(id[1:])-1001, user_ids))

In [126]:
df['userID'] = pd.DataFrame(user_ids) 

In [127]:
place_ids = df['placeID'].tolist()
place_ids.sort()
place_ids = list(map(lambda id : (id-132560), place_ids))

new_place_ids = []
i = -1
seen = []

for place_id in place_ids:
    if place_id not in seen:
        seen.append(place_id)
        i = i+1
    new_place_ids.append(i)

In [128]:
df['placeID'] = pd.DataFrame(new_place_ids) 

In [129]:
df = df[['userID', 'placeID', 'overall_rating', 'name']]
df

Unnamed: 0,userID,placeID,overall_rating,name
0,76,0,2.000000,Tortas Locas Hipocampo
1,107,0,1.333333,Tortas Locas Hipocampo
2,80,0,1.333333,Tortas Locas Hipocampo
3,55,0,2.000000,Tortas Locas Hipocampo
4,133,1,1.666667,Tortas Locas Hipocampo
...,...,...,...,...
1156,60,128,2.000000,tacos los volcanes
1157,24,129,0.333333,tacos los volcanes
1158,96,129,1.333333,tacos los volcanes
1159,95,129,1.666667,tacos los volcanes


In [130]:
n_users = df.userID.nunique()
n_items = df.placeID.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Restaurants: '+str(n_items))

Num. of Users: 138
Num of Restaurants: 130


In [131]:
train_data, test_data = train_test_split(df, test_size=0.25)

In [132]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1], line[2]] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1], line[2]] = line[3]

In [133]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

[[0.         1.         1.         ... 1.         1.         1.        ]
 [1.         0.         0.85760626 ... 0.74857736 0.77706004 0.73454316]
 [1.         0.85760626 0.         ... 0.91671292 0.91819492 1.        ]
 ...
 [1.         0.74857736 0.91671292 ... 0.         0.85555791 1.        ]
 [1.         0.77706004 0.91819492 ... 0.85555791 0.         0.80936909]
 [1.         0.73454316 1.         ... 1.         0.80936909 0.        ]]


In [134]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [135]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [136]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [137]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 1.4503255046819437
Item-based CF RMSE: 1.4637344428576775
