# 3. Collaborative Filtering - User Based

In [2]:
# imports

import pandas as pd
import numpy as np
import json
import gzip

from collections import defaultdict

In [3]:
# define file paths

# gzip-compressed, one-JSON-object-per-line metadata; scanned by get_business_name via parse()
path_meta = 'original_data/meta-Utah.json.gz'
# not read anywhere in the visible part of this notebook
path_pivot_subset = 'data/subset.parquet'
# reviews subset, loaded into `reviews`
path_reviews_subset = 'data/subset_reviews.parquet'
# user-to-user similarity table, loaded into `user_sim`
path_user_sim = 'data/subset_user_sim.parquet'
# not read anywhere in the visible part of this notebook
path_item_sim = 'data/subset_item_sim.parquet'

In [4]:
# import data

user_sim = pd.read_parquet(path_user_sim)

In [5]:
# define a function for reading the data using a generator

def parse(path):
  """Lazily yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The object returned by ``json.loads`` for each non-blank line, in file order.
  """
  # the context manager closes the handle when the generator is exhausted or closed
  with gzip.open(path, 'r') as g:
    for l in g:
      # skip blank lines (e.g. a trailing newline) instead of failing in json.loads
      if not l.strip():
        continue
      yield json.loads(l)

In [6]:
# import reviews subset

reviews = pd.read_parquet(path_reviews_subset)
reviews

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,108599429350262421326,Sarah Jensen,1510264647735,5,My fiance and I had an awesome stress-free exp...,,{'text': 'Thank you for your kind review Sarah...,0x87528767d0ec0e4d:0x7a2f1637a6fb6925
1,116427980967433332299,Amanda Tapp,1501002398116,5,"Extremely easy to work with auto loans, great ...",,{'text': 'We are glad to hear you had such a w...,0x87528767d0ec0e4d:0x7a2f1637a6fb6925
2,101683919557400338793,Christopher Barnes Williams,1590602894837,5,This place has been a life-saver especially co...,,,0x87528b34a4f8738f:0x79d61042dfa75e8
3,100518858506638839555,Lisa Reed,1472671353948,1,,,,0x8752f569a04d64bf:0x2b7e577cda98e81a
4,104908426908994871059,Christopher Danson,1607790321820,5,,,,0x8753052ed25d2ba3:0xf2ab0981e10a01e0
...,...,...,...,...,...,...,...,...
304987,103310333040616187375,Teague Baldwin,1559614074811,4,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
304988,104684222172138652619,Cyrus Biddle,1607737974010,5,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
304989,104142949011044790908,Maximum Decibel,1555810229017,5,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
304990,112713899303234238909,Eric Griffeth,1532966241127,4,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9


In [7]:
# define a function to get all the reviews for a user, sorted by rating in descending order

def get_user_rated_sorted(user_id, df):
    """Return one row per place the user reviewed, highest rating first.

    Args:
        user_id: Value matched against the ``user_id`` column of ``df``.
        df: Reviews frame with ``user_id``, ``gmap_id`` and ``rating`` columns.

    Returns:
        DataFrame with columns ``gmap_id`` and ``rating`` sorted by rating in
        descending order. If the user reviewed a place more than once, only
        the rating from the last such row of ``df`` is kept.
    """
    own_reviews = df[df['user_id'] == user_id]

    # keying by gmap_id collapses repeat reviews; a later rating overwrites an earlier one
    rating_by_place = {}
    for place, score in zip(own_reviews['gmap_id'], own_reviews['rating']):
        rating_by_place[place] = score

    rated = pd.DataFrame(list(rating_by_place.items()), columns=['gmap_id', 'rating'])
    return rated.sort_values(by='rating', ascending=False)

In [10]:
# test the function

get_user_rated_sorted('100518858506638839555', reviews)

Unnamed: 0,gmap_id,rating
46,0x8752880044f13fdd:0x908842aecab80a3f,5
14,0x87528bcc9cb4e93d:0xc3339f491d50180,5
25,0x8752890421f75c1b:0x395cf8820d09cf84,5
1,0x87528bcd22dedaf1:0x46fabacc23efe1cb,5
22,0x87528bedbcd1febf:0x258ad68cc64a1d81,5
21,0x8752f4e9dd9d9d5f:0x5d6f6cc2c8bebb8d,5
20,0x87528df1707f2019:0x7d5773c9b9fbbd0d,5
19,0x8752f479e557b67d:0x9e117cd5f64f9c6,5
30,0x875261a7924c019b:0x4a64591038379f9f,5
34,0x874d9a896798ded9:0xdc9ca4adc01529e3,5


In [11]:
user_rated_sorted = get_user_rated_sorted('100518858506638839555', reviews)
user_rated_sorted[user_rated_sorted['rating'] == 5]

Unnamed: 0,gmap_id,rating
46,0x8752880044f13fdd:0x908842aecab80a3f,5
14,0x87528bcc9cb4e93d:0xc3339f491d50180,5
25,0x8752890421f75c1b:0x395cf8820d09cf84,5
1,0x87528bcd22dedaf1:0x46fabacc23efe1cb,5
22,0x87528bedbcd1febf:0x258ad68cc64a1d81,5
21,0x8752f4e9dd9d9d5f:0x5d6f6cc2c8bebb8d,5
20,0x87528df1707f2019:0x7d5773c9b9fbbd0d,5
19,0x8752f479e557b67d:0x9e117cd5f64f9c6,5
30,0x875261a7924c019b:0x4a64591038379f9f,5
34,0x874d9a896798ded9:0xdc9ca4adc01529e3,5


In [11]:
# define a function to get a user's favorite places, favorites defined as being rated 4 or higher

def get_favorites(user_id, df):
  """Return the places a user rated 4 or higher.

  Args:
    user_id: Passed through to ``get_user_rated_sorted``.
    df: Reviews frame passed through to ``get_user_rated_sorted``.

  Returns:
    The rows of ``get_user_rated_sorted(user_id, df)`` whose ``rating`` is
    at least 4, in the order that helper returned them.
  """
  rated = get_user_rated_sorted(user_id, df)
  is_favorite = rated['rating'] >= 4
  return rated[is_favorite]

In [12]:
# test the function

favorite_test = get_favorites('111182595077674366891', reviews)
favorite_test

Unnamed: 0,gmap_id,rating
0,0x87528767d0ec0e4d:0x7a2f1637a6fb6925,5
1,0x8752882b5600ef39:0xafc90216a48750d7,5
2,0x875261c2a67f1f33:0x8ab8521e85dd979a,5
3,0x87528811d14c8bb9:0xcb24b62ebab773ea,5
6,0x87528939f49bff31:0x1009ab6f29e1017,5
8,0x875271779e7d9723:0xef883558ef266180,5
9,0x87528996ff8339b7:0xb6415f8a4e68ff83,5
11,0x875289e5cc7d4795:0x900f4f5050c7c034,5
12,0x87528be2cf552fd5:0x7d96ac3655de524c,5


# User-based collaborative filtering

In [13]:
# get the id of a sample user

sample_user_id = user_sim.index[5]
sample_user_id

'104493008685483322565'

In [14]:
# get the list of what the user already rated

user_rated_sorted = get_user_rated_sorted(sample_user_id, reviews)
user_rated_sorted

Unnamed: 0,gmap_id,rating
0,0x874d9a747023aa33:0xfb444113571545f6,5
7,0x874c5637aaf130c5:0xe733a952d639b1a4,5
3,0x874c560d07525a23:0xc0fe89001532fd58,4
2,0x874da428ec91b6b1:0x74ee74a084324ac7,3
4,0x874d99ff9a68ad97:0x6ada80cef3ad9652,3
9,0x874d81c8523acd55:0xd1e7e765d5814c83,3
10,0x874d9a886277d0e9:0xb226d474250321ed,3
11,0x874c57cbc47bb6a9:0x80abefc98320f2,3
1,0x874c449594ee07b5:0xd21edebfb3f479be,2
6,0x874c5613a0269b85:0xc8677b68f7a42b6f,2


In [15]:
# check the similarities for the user, sorted highest to lowest
# NOTE(review): [1:] drops the first sorted entry by position; that removes the user
# itself only if the self-similarity sorts first — presumably it is the row maximum, confirm

user_sim.loc[sample_user_id].sort_values(ascending=False)[1:]

user_id
116478652045049798984    0.136953
108768235902401499429    0.123987
104670981307047656082    0.116438
109008758103455272143    0.113415
110284743875157329968    0.110114
                           ...   
103421770970720991306    0.000000
112393479470320243350    0.000000
108160460172023739763    0.000000
114176429842267229610    0.000000
106908485809410410550    0.000000
Name: 104493008685483322565, Length: 9999, dtype: float64

In [16]:
# get the top 50 most similar users
# NOTE(review): [1:51] skips the first sorted entry by position on the assumption that
# it is the user's own self-similarity — confirm the diagonal is the row maximum

similar_users_50 = user_sim.loc[sample_user_id].sort_values(ascending=False)[1:51]
similar_users_50

user_id
116478652045049798984    0.136953
108768235902401499429    0.123987
104670981307047656082    0.116438
109008758103455272143    0.113415
110284743875157329968    0.110114
107530899936865281469    0.105737
115631904981557445215    0.105207
107881759739661996400    0.097652
108295377169434969234    0.092623
111370517541866922928    0.091936
111957960434916977558    0.089527
105092840039796819242    0.087120
101511787189615949215    0.086244
110586835406821639964    0.086244
110709141404228602156    0.086244
110777758948519095024    0.084246
115261671370155136620    0.081203
110393615673627905235    0.078768
105414192843800369073    0.078730
102898829375839629881    0.078730
104113532272919289632    0.078525
106161759051698132099    0.077922
103350748971365574350    0.075641
112449931566812916372    0.075409
114044919200143061932    0.075280
111505123935098941018    0.074227
116742710871951500251    0.073382
118176996259037760415    0.072476
111239025653431008963    0.072171
107926

In [17]:
# use get_favorites function to get the favorites for the top 50 most similar users

favs_of_similar_users = defaultdict(float)

# build the lookup of already-rated places once, not once per favorite row
already_rated = set(user_rated_sorted['gmap_id'])

# add up the ratings every not-yet-rated place received from the similar users
for key in similar_users_50.keys():
  favs = get_favorites(key, reviews)
  for gmap_id, rating in zip(favs['gmap_id'], favs['rating']):
    if gmap_id not in already_rated:
      favs_of_similar_users[gmap_id] += rating

favs_of_similar_users

defaultdict(float,
            {'0x874da46c2d59a36f:0x3144b10f0347dfd6': 9.0,
             '0x874c565ca36d4f97:0xf14a624b8cac5c74': 5.0,
             '0x8752f4a45dce3fdf:0x9ca6efc531d1d2': 5.0,
             '0x874c560da8726de1:0x6e88bb1d1df17872': 5.0,
             '0x874d9af48a2b19d3:0x97782bb3acfa81ff': 5.0,
             '0x874dbd20d54e8019:0x1b50998742d6c947': 5.0,
             '0x874da42f5d939b47:0x3cfc33d94bb6f185': 5.0,
             '0x874dbd26c52a5373:0x92a59bd71c3a9808': 5.0,
             '0x874d85752c517c29:0x9474f3c438f89bb9': 5.0,
             '0x874da426c141d2c1:0x34c2b432322bbc16': 5.0,
             '0x875285bdefa2e18d:0x1375d1dc52be696e': 5.0,
             '0x874c563cde78cefd:0x8da55d760665814e': 15.0,
             '0x874c57c4365fb433:0xc340956a6001b285': 10.0,
             '0x875285969fae1369:0xb045a85fd70a6be4': 5.0,
             '0x874d9af3587cfeb9:0x19131fdae495c3c3': 5.0,
             '0x874dbd11488dc4f1:0xf1a09df9bded9656': 14.0,
             '0x874c560da0dfd8a5:0x8

In [18]:
# sort by the total rating

# one row per candidate place, ordered by the summed rating (highest first)
favs_of_similar_users_df = pd.DataFrame(
    list(favs_of_similar_users.items()),
    columns=['gmap_id','total_rating'],
).sort_values(by='total_rating', ascending=False)
favs_of_similar_users_df

Unnamed: 0,gmap_id,total_rating
213,0x874d9a8c0968fac1:0x505d78120ac24a9a,22.0
55,0x874d9b1e244d21dd:0x3123b745ca3813e9,19.0
209,0x874da6a05021db97:0x91f755330d287e7c,15.0
11,0x874c563cde78cefd:0x8da55d760665814e,15.0
53,0x874d900c2fcee02b:0xc03a990c8b6a97b1,15.0
...,...,...
436,0x874d9a74c5994013:0xb14890a871ab53e6,4.0
441,0x874d907c52d51001:0xa92ad406221ba785,4.0
443,0x875280301e0c5363:0xcbc9e8f3e5803b5,4.0
444,0x874d846a5fb9a4e9:0x70b323c511a3feef,4.0


# Define helper functions

In [29]:
# function to get all the reviews for a user, sorted by rating in descending order

def get_user_rated_sorted(user_id, df):
    """Collect a user's ratings, one row per place, ordered best-rated first.

    Args:
        user_id: Value matched against the ``user_id`` column of ``df``.
        df: Reviews frame with ``user_id``, ``gmap_id`` and ``rating`` columns.

    Returns:
        DataFrame with columns ``gmap_id`` and ``rating`` sorted by rating in
        descending order. Repeat reviews of the same place collapse to the
        rating from the last matching row of ``df``.
    """
    mine = df[df['user_id'] == user_id]

    # a dict keyed by gmap_id keeps a single (latest) rating per place
    latest_rating = {
        place: score for place, score in zip(mine['gmap_id'], mine['rating'])
    }

    table = pd.DataFrame(list(latest_rating.items()), columns=['gmap_id', 'rating'])
    return table.sort_values(by='rating', ascending=False)

# define a function to get a user's favorite places, favorites defined as being rated 4 or higher

def get_favorites(user_id, df):
  """Select the places a user rated 4 or higher.

  Args:
    user_id: Passed through to ``get_user_rated_sorted``.
    df: Reviews frame passed through to ``get_user_rated_sorted``.

  Returns:
    The rows of ``get_user_rated_sorted(user_id, df)`` with ``rating`` of at
    least 4, keeping that helper's row order.
  """
  rated = get_user_rated_sorted(user_id, df)
  return rated[rated['rating'].ge(4)]

# create a function to return the business name using the gmap_id

def get_business_name(gmap_id):
  """Look up a business name in the metadata file by its gmap_id.

  Every call re-reads the file at ``path_meta`` from the beginning through
  ``parse`` and stops at the first record whose ``gmap_id`` matches.

  Args:
    gmap_id: Identifier compared against each record's ``gmap_id`` field.

  Returns:
    The ``name`` of the first matching record, or ``None`` when no record
    in the file matches.
  """
  meta_generator = parse(path_meta)
  try:
    for place in meta_generator:
      if place.get('gmap_id') == gmap_id:
        return place['name']
    # no match: return None explicitly instead of reading a never-assigned local
    return None
  finally:
    # finalize the generator so an early return does not leave it suspended
    meta_generator.close()

# define a function to get n most popular businesses, popular determined as 1) the most # of reviews and 2) highest average review

def get_popular(n, df):
  """Return the n places with the most reviews, ties broken by average rating.

  Args:
    n: Number of rows to return.
    df: Reviews frame with ``gmap_id`` and ``rating`` columns.

  Returns:
    DataFrame indexed by ``gmap_id`` with columns ``count`` and ``mean``
    (number of ratings and their average), sorted by both in descending
    order and cut to the first ``n`` rows.
  """
  rating_stats = df.groupby('gmap_id')['rating'].agg(['count','mean'])
  ranked = rating_stats.sort_values(by=['count','mean'], ascending=False)
  return ranked.head(n)

# Create a user-based recommender function

In [30]:
# create a function to recommend n number of businesses to a user based on what similar users like

def user_based_recommendations(user_id, df, user_sim, n_recs, n_similar=50):
  """Recommend businesses to a user from the favorites of similar users.

  Args:
    user_id: Id of the user to recommend for.
    df: Reviews frame with ``user_id``, ``gmap_id`` and ``rating`` columns.
    user_sim: Similarity table; the row ``user_sim.loc[user_id]`` is read as
      the similarity of ``user_id`` to every user in the columns.
    n_recs: Number of businesses to return.
    n_similar: How many of the most similar users to take favorites from.
      Defaults to 50.

  Returns:
    For a user that has reviews in ``df`` and a row in ``user_sim``: a
    DataFrame with columns ``gmap_id``, ``total_rating`` (sum of the
    similar users' ratings of that place) and ``name``, best first, without
    places the user already rated.

    Otherwise the ``n_recs`` rows of ``get_popular`` with columns
    ``gmap_id``, ``similarity``, ``name`` and ``avg_rating``. The
    ``similarity`` column is the first column returned by ``get_popular``
    (its review count), not a similarity score; the label is kept so
    existing consumers of this frame keep working.
  """

  # get the list of what the user has already rated
  user_rated_sorted = get_user_rated_sorted(user_id, df)

  # change to a set for faster lookup later
  set_user_rated_sorted = set(user_rated_sorted['gmap_id'])

  # collaborative filtering needs both a review history and a similarity row;
  # checking user_sim too avoids a KeyError for users that only appear in df
  has_reviews = (df['user_id'] == user_id).any()
  if not (has_reviews and user_id in user_sim.index):
    # unknown user: fall back to the most popular businesses
    popular = get_popular(n_recs, df).copy()

    # name the columns and then reset the index
    popular.columns = ['similarity','avg_rating']
    popular = popular.reset_index()

    # add the business name to the df
    popular['name'] = popular['gmap_id'].apply(get_business_name)

    # fixed column order for the fallback frame
    return popular[['gmap_id','similarity','name','avg_rating']]

  # most similar users first; the user is removed by label, so this stays
  # correct even when the self-similarity does not sort into first place
  similar_users = (
      user_sim.loc[user_id]
      .sort_values(ascending=False)
      .drop(labels=[user_id], errors='ignore')
      .head(n_similar)
  )

  # sum, per place, the ratings given by the similar users who favorited it
  favs_of_similar_users = defaultdict(float)
  for key in similar_users.index:
    favs = get_favorites(key, df)
    for gmap_id, rating in zip(favs['gmap_id'], favs['rating']):
      if gmap_id not in set_user_rated_sorted:
        favs_of_similar_users[gmap_id] += rating

  # sort by the total rating
  favs_of_similar_users_df = pd.DataFrame(
      list(favs_of_similar_users.items()), columns=['gmap_id','total_rating']
  )
  favs_of_similar_users_df = (
      favs_of_similar_users_df
      .sort_values(by='total_rating', ascending=False)
      .reset_index(drop=True)
  )

  # get n recommendations
  n_favs = favs_of_similar_users_df.head(n_recs).copy()

  # add the business names
  n_favs['name'] = n_favs['gmap_id'].apply(get_business_name)

  return n_favs


In [25]:
# check the function 1

user_based_recommendations('104493008685483322565', reviews, user_sim, 10)

Unnamed: 0,gmap_id,total_rating,name
0,0x874d9a8c0968fac1:0x505d78120ac24a9a,22.0,University Place
1,0x874d9b1e244d21dd:0x3123b745ca3813e9,19.0,Walmart Supercenter
2,0x874da6a05021db97:0x91f755330d287e7c,15.0,Denny's
3,0x874c563cde78cefd:0x8da55d760665814e,15.0,Juab County Fairgrounds
4,0x874d900c2fcee02b:0xc03a990c8b6a97b1,15.0,Tucanos Brazilian Grill
5,0x8752879236e85383:0xbaf443f16b40940c,14.0,Loveland Living Planet Aquarium
6,0x874dbd11488dc4f1:0xf1a09df9bded9656,14.0,Walmart Supercenter
7,0x874d81bacccbcba5:0xb4eb2e6e910911ab,14.0,Del Taco
8,0x874d81c91fca62d1:0x168165e337a6dff4,13.0,Macey's
9,0x874d99ff80a1a091:0x2631e99b7f11ae3d,13.0,Fresh Market


In [22]:
# check the function 2

user_based_recommendations('108160460172023739763', reviews, user_sim, 10)

Unnamed: 0,gmap_id,total_rating,name
0,0x875303c7421e6a83:0xc0d2c755c21c458d,25.0,Chick-fil-A
1,0x875303b89523824f:0x1d66a50c4f10f8b9,24.0,Texas Roadhouse
2,0x875304aa0e983185:0x35cd6dc3ec4ab7d0,23.0,McDonald's
3,0x875302256c14e743:0xc37d34f0900a2712,20.0,Sill's Cafe
4,0x87531ac6a139c4c9:0x8ab28f3611cf1bd2,19.0,Walmart Supercenter
5,0x874dbd21b4733f59:0xc526c2ce087059bf,19.0,Denny's
6,0x874dbd5fe161cc67:0x30e21ff8bc6f29bd,19.0,Walmart Supercenter
7,0x8753039117a03f11:0xd9a54c35788907fe,18.0,Pizza Pie Cafe
8,0x875303921e547ae7:0xc3c9fe11590d4e3e,18.0,Ross Dress for Less
9,0x87530396f3b94ed7:0x94d19eaa2db055e5,18.0,Applebee's Grill + Bar


In [31]:
# check the function 3: an unknown user id falls back to the most popular businesses

user_based_recommendations('12345', reviews, user_sim, 10)

Unnamed: 0,gmap_id,similarity,name,avg_rating
0,0x875289bef87fcb6b:0x9e865a4dadee3648,7815,Fashion Place,4.353039
1,0x8752f508b7dc56a9:0x13d77d6e854d7e79,7540,City Creek Center,4.496552
2,0x8752f51b7961dc89:0x73f1a804eb909466,7206,Hogle Zoo,4.56453
3,0x8752879236e85383:0xbaf443f16b40940c,7084,Loveland Living Planet Aquarium,4.627612
4,0x8752872f3f4b153b:0xd8bb5e82808eedbd,7068,IKEA,4.316355
5,0x87528c9d1d641321:0x90cf6256d98a62eb,6998,Valley Fair Mall,3.883252
6,0x8752f50023b5e105:0xbf4b78f58640694f,6997,The Gateway,4.040303
7,0x87528e94d44e7fc5:0x42b1cfae697ba4a6,6778,Jordan Landing,4.3144
8,0x8752ff18c387e561:0xf3256e9df9ab3603,6420,Lagoon Amusement Park,4.390498
9,0x8752f54425332f67:0x354cd3a2499141d0,6281,Liberty Park,4.619965
