In [1]:
!pip install surprise



In [44]:
import json
import time
import pandas as pd
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from collections import defaultdict

In [3]:
start = time.time()
with open('MA_users.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    user_data = json.load(fin)
end = time.time()
duration_without_dr = end-start
print("Time taken to load user data: %d seconds" %duration_without_dr)

Reading MA_users.json
Time taken to load user data: 0 seconds


In [4]:
start = time.time()
with open('MA_restaurants.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    restaurant_data = json.load(fin)
end = time.time()
duration_without_dr = end-start
print("Time taken to load restaurant data: %d seconds" %duration_without_dr)

Reading MA_restaurants.json
Time taken to load restaurant data: 0 seconds


In [5]:
start = time.time()
with open('MA_reviews.json', encoding="utf8") as fin:
    print('Reading',fin.name)
    review_data = json.load(fin)
end = time.time()
duration_without_dr = end-start
print("Time taken to load review data: %d seconds" %duration_without_dr)

Reading MA_reviews.json
Time taken to load review data: 2 seconds


In [6]:
print(str(len(user_data)) + ' user records loaded')
print(str(len(review_data)) + ' review records loaded')
print(str(len(restaurant_data)) + ' business records loaded')

13046 user records loaded
465276 review records loaded
6249 business records loaded


In [None]:
business_id_dict = dict()
for i,r in enumerate(restaurant_data):
    business_id_dict[r['business_id']] = i

def print_business_info(business_id):
    for attr in restaurant_data[business_id_dict[business_id]].keys():
        print('{0}: {1}'.format(str(attr), str(restaurant_data[business_id_dict[business_id]][attr])))
        
def get_business_attr(business_id, attribute):
    return restaurant_data[business_id_dict[business_id]][attribute]

In [None]:
user_id_dict = dict()
for i,u in enumerate(user_data):
    user_id_dict[u['user_id']] = i

def print_user_info(user_id):
    for attr in user_data[user_id_dict[user_id]].keys():
        print('{0}: {1}'.format(str(attr), str(user_data[user_id_dict[user_id]][attr]) if attr != 'friends' else '[list of user ids]'))

def get_user_attr(user_id, attribute):
    return user_data[user_id_dict[user_id]][attribute]

In [7]:
rating_dict = {'user_id': [],
               'business_id': [],
               'rating': []}

for review in review_data:
    rating_dict['user_id'].append(review['user_id'])
    rating_dict['business_id'].append(review['business_id'])
    rating_dict['rating'].append(review['stars'])

df = pd.DataFrame(rating_dict)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)
print('The data frame shape:\t{}'.format(df.shape))

The data frame shape:	(465276, 3)


In [8]:
trainset = data.build_full_trainset()
algo = SVDpp(n_epochs=30, lr_all=0.0025)

# note: this takes quite a while...
start = time.time()
algo.fit(trainset)
end = time.time()
duration_without_dr = end-start
print("Time taken to fit data: %d seconds" %duration_without_dr)

Time taken to fit data: 565 seconds


In [113]:
print(df['user_id'].value_counts()[0:100])

nl8gWLDo6U6MjqzbBmE_9A    1081
rcU7ysY41qGppbw4pQgjqg     618
ggl6fl-PM5O1WrdReL0l4A     618
1Y0zsJSfWLkfDylH0X1yNQ     588
t903_es-gp3abvdrIQutQA     485
                          ... 
DHTAcCJ1YXQVOE0F8c2_dA     222
p8yQsVA51dzkc9cecDpvrw     220
JM_JfYRtTIBc42zHSRgsrA     220
q_m6qpGrwH_JUZjVJmEsBw     220
uB_aCUa8XvsyUZM_1UxYfg     219
Name: user_id, Length: 100, dtype: int64


In [114]:
#user_id = 'nl8gWLDo6U6MjqzbBmE_9A' # sample user
user_id = 'JM_JfYRtTIBc42zHSRgsrA' # sample user
df_new_user = df.loc[df['user_id'] == user_id]
print(df_new_user)

                       user_id             business_id  rating
574     JM_JfYRtTIBc42zHSRgsrA  vV5ciKb5sDKVmF8mAoz1iA     3.0
9322    JM_JfYRtTIBc42zHSRgsrA  4UjU7F_EX3lgUtkzN8Bbrw     3.0
12821   JM_JfYRtTIBc42zHSRgsrA  zmZ3HkVCeZPBefJJxzdJ7A     5.0
14998   JM_JfYRtTIBc42zHSRgsrA  5gJ54ZorEYdQK7TAD7Yz7g     4.0
17252   JM_JfYRtTIBc42zHSRgsrA  6dB2_doBzgx3SGkgyMA96g     5.0
...                        ...                     ...     ...
462293  JM_JfYRtTIBc42zHSRgsrA  qS_sydoMwEMuBU0ByOM-JA     4.0
462491  JM_JfYRtTIBc42zHSRgsrA  H6YtJyXYPTyU7EQ_jHs-7Q     4.0
463731  JM_JfYRtTIBc42zHSRgsrA  bwNwEWulr97oCJ6ux9VVoA     4.0
463940  JM_JfYRtTIBc42zHSRgsrA  W-yWnbKJr1e7nGPipPLxgw     3.0
465075  JM_JfYRtTIBc42zHSRgsrA  SSOx0BLXiVfmJWCY5XJDrw     1.0

[220 rows x 3 columns]


In [105]:
business_id_dict = dict()
for i,r in enumerate(restaurant_data):
    business_id_dict[r['business_id']] = i

def print_business_info(business_id):
    for attr in restaurant_data[business_id_dict[business_id]].keys():
        print('{0}: {1}'.format(str(attr), str(restaurant_data[business_id_dict[business_id]][attr])))
        
def get_business_attr(business_id, attribute):
    return restaurant_data[business_id_dict[business_id]][attribute]

In [115]:
pred_list = list()
for business_id in business_id_dict:
    if (df_new_user['business_id'] == business_id).any():
        r_ui = df_new_user.loc[df_new_user['business_id'] == business_id, 'rating'].iloc[0]
        pred = algo.predict(user_id, business_id, r_ui = r_ui, verbose=True)
        pred_list.append(pred)

impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: PSp0P_3zWIQabA5HAIJBMQ r_ui = 4.00   est = 3.74   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: yNYyM-xdZDh_rITFDMp2tg r_ui = 4.00   est = 3.23   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: tfdIB3AlviYKVYguaT1S8g r_ui = 4.00   est = 3.88   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: cMSdyE8pLdb_XIcwr9eDTw r_ui = 4.00   est = 2.34   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: fgoSWwrL0JY2Qr-rqCcQMw r_ui = 3.00   est = 3.71   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: 5eYs6Kz72F6ICd_kZ6mhxA r_ui = 4.00   est = 3.43   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: VnuD2cojPTWd3nIHQjnL8w r_ui = 4.00   est = 4.06   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: KFVgWwwGgepVeR6tDs3yfQ r_ui = 5.00   est = 3.50   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: rqzXyfMKcvTkLbX2kJeolg r_ui = 3.00   est = 3.52   {'was_i

In [116]:
from surprise import accuracy
accuracy.rmse(pred_list)

RMSE: 0.9383


0.9382838793783648

In [117]:
pred_list = list()
for business_id in business_id_dict:
    if (df_new_user['business_id'] == business_id).any():
        r_ui = df_new_user.loc[df_new_user['business_id'] == business_id, 'rating'].iloc[0]
        pred = algo.predict(user_id, business_id, r_ui = r_ui, verbose=True)
    else:
        pred = algo.predict(user_id, business_id, verbose=True)
    pred_list.append(pred)

impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: P5lR-AdqxUC944GJ0Sfd_Q r_ui = None   est = 3.56   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: zWPEAPYJjCHE87snhpTEiA r_ui = None   est = 3.54   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: L1yCGs1G1KM-LLIrLIWAPw r_ui = None   est = 2.78   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: 1qO0DUMv1ZtRtPH2VD8x_Q r_ui = None   est = 4.22   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: b0kpd0u7o8HBlrciMbGMqQ r_ui = None   est = 2.83   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: lYfE6rYm-GiZY0zLZ8S1Gw r_ui = None   est = 3.97   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: h0JnvqMcK_G7CeBijxaVAA r_ui = None   est = 3.54   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: YLBFKc5nL1KBPQ3c334ang r_ui = None   est = 3.54   {'was_impossible': False}
user: JM_JfYRtTIBc42zHSRgsrA item: xtum8zS3BO9E1qcUOpi96w r_ui = None   est = 3.54   {'was_i

In [90]:
def find_topn(predictions, n, user_id):
    topn = defaultdict(list)
    for uid, bid, _, est, _ in predictions:
        topn[uid].append((bid, est))

    for uid, est in topn.items():
        est.sort(key=lambda x: x[1], reverse=True)
        topn[uid] = est[:n]

    user_results = pd.DataFrame.from_dict(topn).transpose().loc[user_id]
    
    print('Top {0} recommendations for user {1}:\n'.format(n, get_user_attr(user_id, 'name')))
    print('   Restaurant:                                  Predicted Rating:              Restaurant ID:\n')
    for i in range(n):
        print('{:<3d}{:<50s}{:1.2f}{:>40s}'.format(i+1, get_business_attr(user_results.loc[i][0], 'name'), user_results.loc[i][1], user_results.loc[i][0]))
        
    return user_results

In [118]:
user_topn = find_topn(pred_list, 25, user_id)

Top 25 recommendations for user Young:

   Restaurant:                                  Predicted Rating:              Restaurant ID:

1  Harry's Cheese and Cold Cuts                      4.96                  w7JR1y-TCXmWdFKDLW83eQ
2  Tasting Counter                                   4.63                  zMpWhHh6Cp1BkFRM8cbPhA
3  Firenze Trattoria                                 4.62                  7jF3jDKji39bzLYVb3R3xA
4  Polcari's Coffee                                  4.60                  y5lCCZ4YZk4jLSmBaATOmw
5  Karl's Sausage Kitchen & European Market          4.58                  MYuqzv-jQLBUP0vIppAkGw
6  Recreo Coffee & Roasterie                         4.57                  ZjNw6MQ9EyS16COI465xbQ
7  Bella Ravioli                                     4.57                  uV2P0OlVMtesxZ4aIxmuGg
8  Roadworthy                                        4.56                  rGs0vEyN3puNd9RiIsPeKQ
9  O Ya                                              4.56                  87f7kR

In [52]:
print_business_info('w7JR1y-TCXmWdFKDLW83eQ')

business_id: w7JR1y-TCXmWdFKDLW83eQ
name: Harry's Cheese and Cold Cuts
address: 98 Blackstone St
city: Boston
state: MA
postal_code: 02109
latitude: 42.3615179
longitude: -71.0564517
stars: 5.0
review_count: 57
is_open: 1
attributes: {'RestaurantsTakeOut': 'True', 'BusinessAcceptsCreditCards': 'True', 'GoodForKids': 'True', 'OutdoorSeating': 'False', 'Alcohol': "u'none'", 'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': True, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}", 'RestaurantsGoodForGroups': 'False', 'RestaurantsAttire': "'casual'", 'RestaurantsReservations': 'False', 'BusinessParking': "{'garage': False, 'street': True, 'validated': True, 'lot': False, 'valet': False}", 'NoiseLevel': "u'average'", 'Caters': 'True', 'WiFi': "u'no'", 'RestaurantsDelivery': 'False', 'RestaurantsPriceRange2': '1', 'BikeParking': 'False', 'HasTV': 'True'}
categories: Cheese Shops, Food, Meat Shops, Delis, Restaurants, Specialt

In [132]:
business_id = 'w7JR1y-TCXmWdFKDLW83eQ'
for review in review_data:
    if review['business_id'] == business_id:
        print('{:<15s}{:<15s}{:>10.1f}'.format(get_user_attr(review['user_id'], 'name'),review['user_id'], review['stars']))

Tanya          ILZtzt-3oEmLu8c1rfeJuw       5.0
Steven         0Zq95YruU7IvCsvzFOSSzA       4.0
Jacquelyn      F0_FBGWlJS7z_qTDm9dJSA       5.0
boom           bje1P4XGW9lv79iuSKBgiw       5.0
Lauretta       W_EQFi2fXpBtuRm510cCpA       5.0
Vivian         nl8gWLDo6U6MjqzbBmE_9A       5.0
Sarah          5LT0UfheP3c-_xKqmZDgxQ       4.0
Dorothy        W-SyRg4KEOwcRRA_hj3Txw       5.0
Max            p2NLkCmcPp1-Mr8x9TXXcQ       5.0
Chewie         -GnSkFtT3axpOdkSxmx7Zw       5.0
Benjamin       4v30Vm27ViLTv4pRQNCIPA       5.0
Steve          AhsN6hwl6YOMoxx4tZkTEw       5.0
Jenny          5BpJEC8YVi80ecWT5CtlfQ       4.0
Anna           N6eoEXDb4Yrwjkkopz6j-A       5.0
Nancy          _JATjpl2QlrIDJ7Y-3gmiA       5.0
Tim            Gp6AHxjQZLNmF8TfxQOJIQ       5.0
Christi        HM5hkVTNRVqqqFHhjbtZwQ       5.0
Virginia       nr35WjC-YI0dFcsnW33Szw       5.0
Rebecca        7PA7DYCcNPo79lyqS2J2fQ       5.0
Nancy          5GYBDtLPStI2D0OcVNFyWA       5.0
Eliza          1pzjkHhSg_gn1F829FzIlw   