# Collaborative Filtering

Here we look at a basic collaborative filtering model using Yelp academic dataset.

In [45]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cross_validation import train_test_split
from scipy import sparse
print('Libraries imported.')

Libraries imported.


In [2]:
# reading j_son data
df_business = pd.read_json(r'C:\Users\Bhavya\Desktop\242\dataset\business.json', lines = True)
df_user = pd.read_json(r'C:\Users\Bhavya\Desktop\242\dataset\user.json', lines = True)
df_review = pd.read_json(r'C:\Users\Bhavya\Desktop\242\dataset\review.json', lines = True)
print('Data imported.')

Data imported.


In [3]:
# filtering restaurant businesses in Arizona
df_bus = df_business.copy()
df_bus = df_bus[df_bus['is_open'] == 1]
df_bus = df_bus[df_bus['state'] == 'AZ']
df_bus = df_bus.reset_index()
temp = []
for i in range(len(df_bus)):
    if('Restaurants' in df_bus['categories'][i]):
        temp.append(1)
    else:
        temp.append(0)
df_bus['isres'] = temp
df_bus = df_bus[df_bus['isres'] == 1]
df_bus = df_bus.drop(['attributes', 'address', 'neighborhood', 'postal_code', 'is_open', 'city', 'state', 'index', 'categories', 'isres'], axis = 1)
df_bus.head()

Unnamed: 0,business_id,hours,latitude,longitude,name,review_count,stars
9,rDMptJYWtnMhpQu_rRXHng,{},33.60707,-112.064382,McDonald's,10,1.0
10,1WBkAuQg81kokZIPMpn9Zg,"{'Monday': '11:00-22:00', 'Tuesday': '11:00-22...",33.60731,-112.063404,Charr An American Burger Bar,232,3.0
13,iPa__LOhse-hobC2Xmp-Kw,"{'Monday': '5:00-23:00', 'Tuesday': '5:00-23:0...",33.508765,-112.04624,McDonald's,34,3.0
16,kKx8iCJkomVQBdWHnmmOiA,{},33.394877,-111.600194,Little Caesars Pizza,4,2.5
17,YhV93k9uiMdr3FlV4FHjwA,"{'Monday': '8:00-17:00', 'Tuesday': '8:00-17:0...",33.449967,-112.070222,Caviness Studio,4,5.0


In [5]:
# filtering reviews related to businesses in Arizona
df_rev = df_review.copy()
df_rev = df_rev.drop(['cool', 'date', 'funny', 'review_id', 'text', 'useful'], axis = 1)
df_mat = df_rev.merge(df_bus, on = 'business_id', how = 'inner')
df_mat = df_mat.rename(columns = {'stars_x':'stars_u', 'stars_y':'stars_b'})
df_mat.head()

Unnamed: 0,business_id,stars_u,user_id,hours,latitude,longitude,name,review_count,stars_b
0,JlNeaOymdVbE6_bubqjohg,1,ssuXFjkH4neiBgwv-oN4IA,"{'Monday': '6:00-22:00', 'Tuesday': '6:00-22:0...",33.320441,-111.991401,Papa Chevo's Taco Shop,103,3.5
1,JlNeaOymdVbE6_bubqjohg,5,BPKpLbR9NuWFAR9SUWpZOw,"{'Monday': '6:00-22:00', 'Tuesday': '6:00-22:0...",33.320441,-111.991401,Papa Chevo's Taco Shop,103,3.5
2,JlNeaOymdVbE6_bubqjohg,5,sg5q7rz2_7PfaN-6JnLb5g,"{'Monday': '6:00-22:00', 'Tuesday': '6:00-22:0...",33.320441,-111.991401,Papa Chevo's Taco Shop,103,3.5
3,JlNeaOymdVbE6_bubqjohg,4,I-W_at9CPQox-t0xGveymw,"{'Monday': '6:00-22:00', 'Tuesday': '6:00-22:0...",33.320441,-111.991401,Papa Chevo's Taco Shop,103,3.5
4,JlNeaOymdVbE6_bubqjohg,1,jFTfEhwPuBsWXydhcEbqAQ,"{'Monday': '6:00-22:00', 'Tuesday': '6:00-22:0...",33.320441,-111.991401,Papa Chevo's Taco Shop,103,3.5


In [37]:
# filtering top 1000 users
df_mat2 = pd.merge(df_mat, df_user[['user_id', 'review_count']], on = 'user_id', how = 'inner')
df_mat2 = df_mat2.sort_values(['review_count'], ascending = 0)
desc_uid = df_mat2['user_id'].unique()
top_uid = desc_uid[:1000]
top_user_id = pd.DataFrame(top_uid, columns = ['user_id'])
df_mat3 = pd.merge(df_mat2, top_user_id[['user_id']], on = 'user_id', how = 'inner')
df_mat3  = df_mat3[['business_id', 'stars_u', 'user_id']]
df_mat3.head()

Unnamed: 0,business_id,stars_u,user_id
0,E25-2ssHwsajyGtsllMXaA,4,hWDybu_KvYLSdEFzGrniTw
1,GHs7grPeyVPwmiVKhhfMvg,4,hWDybu_KvYLSdEFzGrniTw
2,dIA4aUlrRHthZDH5oqwuvg,4,hWDybu_KvYLSdEFzGrniTw
3,QuybD_bJcAB2CHcpTfREYg,4,hWDybu_KvYLSdEFzGrniTw
4,vK1_qKEG7zHvjiTOsN0CRg,5,hWDybu_KvYLSdEFzGrniTw


In [38]:
# creating the training-testing data split
train_data, test_data = train_test_split(df_mat3, test_size = 0.2)
print('Training data rows: ', len(train_data))
print('Testing data rows: ', len(test_data))

Training data rows:  14449
Testing data rows:  3613


In [62]:
# creating sparse matrix for user and businesses
train_data_matrix = train_data.pivot(index='user_id', columns='business_id', values='stars_u')
train_data_matrix.head()

business_id,-01XupAWZEXbdNbxNg5mEg,-0Sgh0QlUKVsWosCWJzGqQ,-0WegMt6Cy966qlDKhu6jA,-0tgMGl7D9B10YjSN2ujLA,-1UMR00eXtwaeh59pEiDjA,-1VaIJza42Hjev6ukacCNg,-4TMQnQJW1yd6NqGRDvAeA,-4g68Hwm892_KPUuW5g1_Q,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,...,zrTGcb83AsfyVTMrsCa65A,zra20XPGVL9P3i5hMoKjig,zrdQ0X9yT3p8Ho99evHB2A,ztP466jMUMtqLwwHqXbk9w,ztcdHj6EpUD7EemcntSsrg,ztk25JYxDZ9_9e2v6uHJAA,zuVvDYJkKAbXQTTBauAqJQ,zwNLJ2VglfEvGu7DDZjJ4g,zwmps5SXn30g-f5wqg_r9A,zzwicjPC9g246MK2M1ZFBA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0IiMAZI2SsQ7VmyzJjokQ,,,,,,,,,,,...,,,,,,,,,,
-9da1xk7zgnnfO1uTVYGkA,,,,,,,,,,,...,,,,,,,,,,
-B-QEUESGWHPE_889WJaeg,,,,,,,,,,,...,,,,,,,,,,
-FZBTkAZEXoP7CYvRV2ZwQ,,,,,,,,,,,...,,,,,,,,,,
-IgKkE8JvYNWeGu8ze4P8Q,,,,,,,,,,,...,,,,,,,,,,


In [72]:
# calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
train_data_matrix_filled = train_data_matrix.fillna(0)
train_data_matrix_sparse = sparse.csr_matrix(train_data_matrix_filled)
user_similarity = cosine_similarity(train_data_matrix_sparse)
print(user_similarity)

[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]]


In [104]:
# defining the prediction function
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = (train_data.pivot(index='user_id', columns='business_id', values='stars_u')).mean(axis = 1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + (similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T)
    return pred
print('Function defined.')

Function defined.


In [109]:
mean_user_rating = (train_data.pivot(index='user_id', columns='business_id', values='stars_u')).mean(axis = 1)
ratings_diff = (train_data_matrix_sparse - mean_user_rating[:, np.newaxis]) 
user_prediction = mean_user_rating[:, np.newaxis] + (user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T)
user_prediction

matrix([[-0.08417665, -0.08262416, -0.08262416, ..., -0.085651  ,
         -0.05241284, -0.08908921],
        [-1.10053241, -1.10053241, -1.10053241, ..., -1.10053241,
         -1.10053241, -1.10053241],
        [ 1.03820361,  1.03820361,  1.03820361, ...,  1.03820361,
          1.07171368,  1.04564787],
        ..., 
        [-0.19273269, -0.26556395, -0.26556395, ..., -0.26556395,
         -0.26556395, -0.24040498],
        [-0.11455325, -0.11116403, -0.11116403, ..., -0.11455325,
         -0.11455325, -0.11455325],
        [ 0.29513128,  0.31704263,  0.31704263, ...,  0.30268703,
          0.31769848,  0.29136005]])

In [108]:
mean_user_rating = (train_data.pivot(index='user_id', columns='business_id', values='stars_u')).mean(axis = 1)
ratings_diff = (train_data_matrix_sparse - mean_user_rating[:, np.newaxis]) 
mean_user_rating[:, np.newaxis] + (user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T)

[[-0.08417665 -0.08262416 -0.08262416 ..., -0.085651   -0.05241284
  -0.08908921]
 [-1.10053241 -1.10053241 -1.10053241 ..., -1.10053241 -1.10053241
  -1.10053241]
 [ 1.03820361  1.03820361  1.03820361 ...,  1.03820361  1.07171368
   1.04564787]
 ..., 
 [-0.19273269 -0.26556395 -0.26556395 ..., -0.26556395 -0.26556395
  -0.24040498]
 [-0.11455325 -0.11116403 -0.11116403 ..., -0.11455325 -0.11455325
  -0.11455325]
 [ 0.29513128  0.31704263  0.31704263 ...,  0.30268703  0.31769848
   0.29136005]]


In [111]:
user_prediction = predict(train_data_matrix_sparse, user_similarity, type='user')
user_prediction

matrix([[-0.08417665, -0.08262416, -0.08262416, ..., -0.085651  ,
         -0.05241284, -0.08908921],
        [-1.10053241, -1.10053241, -1.10053241, ..., -1.10053241,
         -1.10053241, -1.10053241],
        [ 1.03820361,  1.03820361,  1.03820361, ...,  1.03820361,
          1.07171368,  1.04564787],
        ..., 
        [-0.19273269, -0.26556395, -0.26556395, ..., -0.26556395,
         -0.26556395, -0.24040498],
        [-0.11455325, -0.11116403, -0.11116403, ..., -0.11455325,
         -0.11455325, -0.11455325],
        [ 0.29513128,  0.31704263,  0.31704263, ...,  0.30268703,
          0.31769848,  0.29136005]])

In [113]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
print('Function defined.')

Function defined.


In [116]:
test_data_matrix = test_data.pivot(index='user_id', columns='business_id', values='stars_u')
train_data_matrix_filled = train_data_matrix.fillna(0)
train_data_matrix_sparse = sparse.csr_matrix(train_data_matrix_filled)
print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix_sparse)))

User-based CF RMSE: 3.3786823204141676
