# 3. Collaborative Filtering - User Based

In [1]:
# imports

import pandas as pd
import numpy as np
import json
import gzip

from collections import defaultdict

In [2]:
# define file paths (all relative to the notebook's working directory)

path_meta = 'original_data/meta-Utah.json.gz'  # gzipped file with one JSON record per line; scanned by get_business_name
path_pivot_subset = 'data/subset.parquet'  # not read in the cells shown in this notebook section
path_reviews_subset = 'data/subset_reviews.parquet'  # loaded below as `reviews`
path_user_sim = 'data/subset_user_sim.parquet'  # loaded below as `user_sim`
path_item_sim = 'data/subset_item_sim.parquet'  # not read in the cells shown in this notebook section

In [3]:
# import data: the precomputed user-to-user similarity table
# (later cells read one user's similarities with user_sim.loc[user_id])

user_sim = pd.read_parquet(path_user_sim)

In [4]:
# define a function for reading the data using a generator

def parse(path):
  """Lazily yield one parsed JSON record per line of a gzip-compressed file.

  Parameters
  ----------
  path : str
      Location of a gzip file that holds one JSON document per line.

  Yields
  ------
  The object produced by ``json.loads`` for each line, in file order.
  """
  # The context manager releases the file handle when the generator is
  # exhausted, explicitly closed, or garbage-collected after an early `break`.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

In [5]:
# import reviews subset (one row per review)
# the columns used by the helper functions below are user_id, gmap_id and rating

reviews = pd.read_parquet(path_reviews_subset)
reviews

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,116427980967433332299,Amanda Tapp,1501002398116,5,"Extremely easy to work with auto loans, great ...",,{'text': 'We are glad to hear you had such a w...,0x87528767d0ec0e4d:0x7a2f1637a6fb6925
1,100217552787897641896,Rylee Jones,1566432339019,5,,,,0x875287dced2a1f1f:0xf3764a9211d4f382
2,113719864440680408253,Kylie McDonald,1559939531153,5,Highly recommend this business! They truly car...,,"{'text': 'Thank you, Kylie! It was a pleasure ...",0x8752841a66574037:0x6a51c0f67ca3002
3,114710026425309062285,Jess Bird,1540481859746,5,,,"{'text': 'Thank you, Jess!', 'time': 159667147...",0x8752f365da14f295:0x264218c77da46a71
4,114105421795834263422,Maxwell McLeod,1544839018564,5,Tom and the team do an excellent job. I have b...,,,0x875287d36010a61b:0xab575aa5992155b3
...,...,...,...,...,...,...,...,...
305257,107626136867067342591,Rachelle Taysom,1525060448816,3,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
305258,108155158391983335470,Cafe Guru,1603655474608,5,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
305259,104620742288190585924,Sean Smith,1578929371061,2,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
305260,102429264321348600901,Eric Anderson,1564336239651,5,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9


In [6]:
# define a function to get all the reviews for a user, sorted by rating in descending order

def get_user_rated_sorted(user_id, df):
    """Return the places one user has rated, highest rating first.

    Rows of ``df`` whose 'user_id' equals ``user_id`` are collapsed to a single
    entry per 'gmap_id'; when a place appears more than once, the rating from
    the last matching row is the one kept.

    Returns a DataFrame with the columns 'gmap_id' and 'rating', ordered by
    'rating' in descending order (empty when the user has no rows).
    """
    mine = df.loc[df['user_id'] == user_id]

    # a repeated gmap_id keeps its first position but takes the later rating
    rating_by_place = {}
    for place_id, stars in zip(mine['gmap_id'], mine['rating']):
        rating_by_place[place_id] = stars

    table = pd.DataFrame(list(rating_by_place.items()), columns=['gmap_id', 'rating'])
    return table.sort_values(by='rating', ascending=False)

In [7]:
# test the function: list one user's rated places, best rating first

get_user_rated_sorted('100518858506638839555', reviews)

Unnamed: 0,gmap_id,rating


In [8]:
# keep the same user's rated places and show only the ones rated exactly 5
user_rated_sorted = get_user_rated_sorted('100518858506638839555', reviews)
user_rated_sorted.loc[user_rated_sorted['rating'] == 5]

Unnamed: 0,gmap_id,rating


In [9]:
# define a function to get a user's favorite places, favorites defined as being rated 4 or higher

def get_favorites(user_id, df):
  """Return the places a user rated 4 or higher.

  The user's rated places come from ``get_user_rated_sorted``; the rows whose
  'rating' is at least 4 are returned in that same order.
  """
  rated = get_user_rated_sorted(user_id, df)
  is_favorite = rated['rating'] >= 4
  return rated[is_favorite]

In [10]:
# test the function: favorites (rating of 4 or higher) for one user

favorite_test = get_favorites('111182595077674366891', reviews)
favorite_test

Unnamed: 0,gmap_id,rating


# User-based collaborative filtering

In [11]:
# get the id of a sample user (the label at position 5 of user_sim's row index)

sample_user_id = user_sim.index[5]
sample_user_id

'104493008685483322565'

In [12]:
# get the list of what the user already rated
# (stored in the global `user_rated_sorted`, which the favorites loop further down reads)

user_rated_sorted = get_user_rated_sorted(sample_user_id, reviews)
user_rated_sorted

Unnamed: 0,gmap_id,rating


In [13]:
# check the similarities for the user, sorted highest to lowest
# the user's own entry is removed by label instead of by position, so a tie or a
# self-similarity that does not sort first can never discard a real neighbour

user_sim.loc[sample_user_id].sort_values(ascending=False).drop(sample_user_id, errors='ignore')

user_id
116478652045049798984    0.136953
108768235902401499429    0.123987
104670981307047656082    0.116438
109008758103455272143    0.113415
110284743875157329968    0.110114
                           ...   
103421770970720991306    0.000000
112393479470320243350    0.000000
108160460172023739763    0.000000
114176429842267229610    0.000000
106908485809410410550    0.000000
Name: 104493008685483322565, Length: 9999, dtype: float64

In [14]:
# get the top 50 most similar users
# sort first, then drop the user's own label, then keep the first 50; dropping by
# label (not by slicing from position 1) stays correct even if the self entry is not on top

similar_users_50 = (
    user_sim.loc[sample_user_id]
    .sort_values(ascending=False)
    .drop(sample_user_id, errors='ignore')
    .head(50)
)
similar_users_50

user_id
116478652045049798984    0.136953
108768235902401499429    0.123987
104670981307047656082    0.116438
109008758103455272143    0.113415
110284743875157329968    0.110114
107530899936865281469    0.105737
115631904981557445215    0.105207
107881759739661996400    0.097652
108295377169434969234    0.092623
111370517541866922928    0.091936
111957960434916977558    0.089527
105092840039796819242    0.087120
101511787189615949215    0.086244
110586835406821639964    0.086244
110709141404228602156    0.086244
110777758948519095024    0.084246
115261671370155136620    0.081203
110393615673627905235    0.078768
105414192843800369073    0.078730
102898829375839629881    0.078730
104113532272919289632    0.078525
106161759051698132099    0.077922
103350748971365574350    0.075641
112449931566812916372    0.075409
114044919200143061932    0.075280
111505123935098941018    0.074227
116742710871951500251    0.073382
118176996259037760415    0.072476
111239025653431008963    0.072171
107926

In [15]:
# use get_favorites function to get the favorites for the top 50 most similar users
# (defaultdict is already imported in the imports cell at the top of the notebook)

# build the lookup of already-rated places once instead of once per candidate row
already_rated = set(user_rated_sorted['gmap_id'])

# gmap_id -> sum of the ratings given by the similar users who favorited it
favs_of_similar_users = defaultdict(float)

for key in similar_users_50.keys():
  favs = get_favorites(key, reviews)
  for place_id, stars in zip(favs['gmap_id'], favs['rating']):
    if place_id not in already_rated:
      favs_of_similar_users[place_id] += stars

favs_of_similar_users

defaultdict(float,
            {'0x875287f3b184f8c3:0x305ad1941440c38a': 5.0,
             '0x8753030b60a4460b:0x2af915d0faaa8da8': 5.0,
             '0x87530f815eb3ce1d:0xf6dc6bcdf1eec3f2': 5.0,
             '0x874d9a886277d0e9:0xb226d474250321ed': 5.0,
             '0x87547d5f3bc57873:0x21e3915c55430830': 5.0,
             '0x874d97674392bebb:0x23db0017e952c548': 5.0,
             '0x80b561c630f2a519:0xbfd3cfe8e7cb7f48': 5.0,
             '0x875280301e0c5363:0x71e3a75637c0f0fa': 5.0,
             '0x87530c8e6726ad51:0x4ab1f32f1f914eb5': 4.0,
             '0x874cca851f396949:0x70852e3baee1779d': 5.0,
             '0x874cca7e754dd9f7:0x2336d62421017ea9': 5.0,
             '0x874cb58ca81c6439:0xff7e077d76a37b4e': 5.0,
             '0x874cca81ec98331b:0x61dfc886007396e0': 5.0,
             '0x874cca7eb13c9ad7:0xe6cad2a80dcde3ed': 5.0,
             '0x874cca70514dfcf7:0x56d84c65030f3422': 5.0,
             '0x874c57cbc47bb6a9:0x80abefc98320f2': 5.0,
             '0x874b5c05ad082da1:0x8c09

In [16]:
# rank the candidate places by their summed rating, highest first

favs_of_similar_users_df = pd.DataFrame(
    list(favs_of_similar_users.items()),
    columns=['gmap_id','total_rating'],
).sort_values(by='total_rating', ascending=False)
favs_of_similar_users_df

Unnamed: 0,gmap_id,total_rating
0,0x875287f3b184f8c3:0x305ad1941440c38a,5.0
1,0x8753030b60a4460b:0x2af915d0faaa8da8,5.0
15,0x874c57cbc47bb6a9:0x80abefc98320f2,5.0
14,0x874cca70514dfcf7:0x56d84c65030f3422,5.0
13,0x874cca7eb13c9ad7:0xe6cad2a80dcde3ed,5.0
12,0x874cca81ec98331b:0x61dfc886007396e0,5.0
11,0x874cb58ca81c6439:0xff7e077d76a37b4e,5.0
10,0x874cca7e754dd9f7:0x2336d62421017ea9,5.0
9,0x874cca851f396949:0x70852e3baee1779d,5.0
7,0x875280301e0c5363:0x71e3a75637c0f0fa,5.0


# Define helper functions

In [17]:
# function to get all the reviews for a user, sorted by rating in descending order

def get_user_rated_sorted(user_id, df):
    """Return the places one user has rated, highest rating first.

    NOTE: a function with this name is also defined earlier in the notebook;
    running this cell rebinds the name to this definition.

    Rows of ``df`` whose 'user_id' equals ``user_id`` are reduced to one entry
    per 'gmap_id' (the rating of the last matching row wins for a repeated
    place), then ordered by 'rating' in descending order.

    Returns a DataFrame with the columns 'gmap_id' and 'rating'.
    """
    matching = df.loc[df['user_id'] == user_id]

    # later rows overwrite earlier ones for the same place, keeping the first position
    latest_rating = {}
    for place_id, stars in zip(matching['gmap_id'], matching['rating']):
        latest_rating[place_id] = stars

    rated = pd.DataFrame(list(latest_rating.items()), columns=['gmap_id', 'rating'])
    return rated.sort_values(by='rating', ascending=False)

# define a function to get a user's favorite places, favorites defined as being rated 4 or higher

def get_favorites(user_id, df):
  """Return the subset of a user's rated places whose rating is 4 or higher.

  NOTE: a function with this name is also defined earlier in the notebook;
  running this cell rebinds the name to this definition.
  """
  rated = get_user_rated_sorted(user_id, df)
  keep = rated['rating'] >= 4
  return rated[keep]

# create a function to return the business name using the gmap_id

def get_business_name(gmap_id, path=None):
  """Look up a business's name in the gzipped metadata file by its gmap_id.

  Parameters
  ----------
  gmap_id
      Value compared against each record's 'gmap_id' field.
  path : str, optional
      Gzip file with one JSON record per line. Defaults to the notebook-level
      ``path_meta`` when omitted.

  Returns
  -------
  The 'name' field of the first matching record, or None when no record
  matches (previously this case raised UnboundLocalError).
  """
  if path is None:
    path = path_meta

  # Every call scans the file from the top until a match is found. The `with`
  # block closes the handle even when returning early from inside the loop.
  with gzip.open(path, 'rb') as meta_file:
    for line in meta_file:
      place = json.loads(line)
      if place.get('gmap_id') == gmap_id:
        return place['name']

  return None

# define a function to get n most popular businesses, popular determined as 1) the most # of reviews and 2) highest average review

def get_popular(n, df):
  """Return the n most popular places in ``df``.

  Ratings are grouped by 'gmap_id' and summarised as 'count' (number of
  ratings) and 'mean' (average rating). Places are ordered by count and then
  by mean, both descending, and the first n rows are returned with 'gmap_id'
  as the index.
  """
  stats = df.groupby('gmap_id')['rating'].agg(['count','mean'])
  ranked = stats.sort_values(by=['count','mean'], ascending=False)
  return ranked.head(n)

# Create a user-based recommender function

In [38]:
# create a function to recommend n number of businesses to a user based on what similar users like

def user_based_recommendations(user_id, df, user_sim, n_recs, n_similar=50):
  """Recommend up to n_recs businesses to a user from what similar users like.

  Parameters
  ----------
  user_id
      Id of the user to recommend for.
  df : pandas.DataFrame
      Reviews table; the helpers it is passed to read the 'user_id',
      'gmap_id' and 'rating' columns.
  user_sim : pandas.DataFrame
      Similarity table whose rows are looked up by user id.
  n_recs : int
      Number of businesses to return.
  n_similar : int, default 50
      Number of most-similar users whose favorites are pooled.

  Returns
  -------
  pandas.DataFrame
      If user_id is a row label of user_sim: columns 'gmap_id',
      'total_rating' (sum of the ratings given by the similar users who
      favorited the place) and 'name', best first. Places the user already
      rated are excluded.
      Otherwise: the n_recs most popular places with columns 'gmap_id',
      'similarity', 'name' and 'avg_rating'. In this fallback the
      'similarity' column holds the review count returned by get_popular,
      not a similarity score.
  """

  # places the user already rated, as a set for constant-time membership checks
  set_user_rated_sorted = set(get_user_rated_sorted(user_id, df)['gmap_id'])

  # .loc[] below reads a row, so membership is tested against the row index
  if user_id in user_sim.index:
    print(f"{user_id} found in dataset")

    # most similar users: sort, remove the user's own entry by label (safe even
    # when it does not sort first), then keep the top n_similar
    similar_users = (
        user_sim.loc[user_id]
        .sort_values(ascending=False)
        .drop(user_id, errors='ignore')
        .head(n_similar)
    )

    # gmap_id -> summed rating across the similar users who favorited it
    favs_of_similar_users = defaultdict(float)

    for key in similar_users.keys():
      favs = get_favorites(key, df)
      for place_id, stars in zip(favs['gmap_id'], favs['rating']):
        if place_id not in set_user_rated_sorted:
          favs_of_similar_users[place_id] += stars

    # rank by the total rating, highest first
    favs_of_similar_users_df = (
        pd.DataFrame(list(favs_of_similar_users.items()), columns=['gmap_id','total_rating'])
        .sort_values(by='total_rating', ascending=False)
        .reset_index(drop=True)
    )

    # get n recommendations
    n_favs = favs_of_similar_users_df.head(n_recs).copy()

    # add the business names
    n_favs['name'] = n_favs['gmap_id'].apply(get_business_name)

    return n_favs

  else:
    print(f"{user_id} not found — using popularity")
    popular = get_popular(n_recs, df)

    # name the columns and then reset the index so gmap_id becomes a column
    # ('similarity' receives the review count, 'avg_rating' the mean rating)
    popular.columns = ['similarity','avg_rating']
    popular = popular.reset_index()

    # add the business name to the df
    popular['name'] = popular['gmap_id'].apply(get_business_name)

    # reorder the columns to match the recs df
    popular = popular[['gmap_id','similarity','name','avg_rating']]

    return popular


In [41]:
# check the function 1: a user id that is present in user_sim

user_based_recommendations('116427980967433332299', reviews, user_sim, 10)

116427980967433332299 found in dataset


Unnamed: 0,gmap_id,total_rating,name
0,0x8752858a882ea6cd:0xa740796cc19f6bc,5.0,VASA Fitness
1,0x87528bfaffa3a98d:0x919a8655083ae40c,5.0,Francesco's
2,0x87528bee05c7d4c5:0xc98bc487299e85,5.0,General Army Navy Outdoor
3,0x87547de0ecef3943:0x4af21e320bd2a754,5.0,A&W
4,0x87528b92bd41f8ed:0x6b853570b4c883ce,5.0,Dee's Family Restaurant
5,0x87537e2f20bf890f:0xa7052e857a61c793,5.0,Maverik
6,0x87528a0e5935534d:0x2babcbe2ac504b1e,5.0,Great Clips
7,0x87528a89951f1f97:0xd474151a6b0d248f,5.0,Supersonic Express Car Wash - E 3300 S
8,0x875287d17c84d34f:0x7cd695da7be1d5c0,5.0,Hale Centre Theatre
9,0x87526286c61b04a3:0x6514f5fc654f6d9d,5.0,Target


In [42]:
# check the function 2: a second user id that is present in user_sim

user_based_recommendations('108160460172023739763', reviews, user_sim, 10)

108160460172023739763 found in dataset


Unnamed: 0,gmap_id,total_rating,name
0,0x875304973428756b:0x932c3918c0853175,24.0,Kent's Market
1,0x875303b828f60367:0xf86b78c965d59f77,15.0,Walmart Supercenter
2,0x8753055cf3d55541:0x16b040704300f95a,14.0,Ocean Mart
3,0x87530396f3b94ed7:0x94d19eaa2db055e5,14.0,Applebee's Grill + Bar
4,0x8753039702045de3:0xced7315d01806fb3,10.0,Golden Corral Buffet & Grill
5,0x87530f7f8dde5dad:0xfdfd9f0a7f2eb535,10.0,Walmart Supercenter
6,0x87530f40e02c5835:0x30bbf6004df183bf,10.0,Chuck-A-Rama Buffet
7,0x875302341b79ce73:0xae70d0e8d32f8957,9.0,Layton Hills Mall
8,0x87530f86dbdc684b:0xe6a146e4e8b4fa45,9.0,Applebee's Grill + Bar
9,0x875304aa0e983185:0x35cd6dc3ec4ab7d0,9.0,McDonald's


In [43]:
# check the function 3: an id that is not in user_sim, which exercises the popularity fallback

user_based_recommendations('12345', reviews, user_sim, 10)

12345 not found — using popularity


Unnamed: 0,gmap_id,similarity,name,avg_rating
0,0x875289bef87fcb6b:0x9e865a4dadee3648,496,Fashion Place,4.417339
1,0x8752f508b7dc56a9:0x13d77d6e854d7e79,482,City Creek Center,4.491701
2,0x8752879236e85383:0xbaf443f16b40940c,458,Loveland Living Planet Aquarium,4.624454
3,0x8752f51b7961dc89:0x73f1a804eb909466,440,Hogle Zoo,4.506818
4,0x87528e94d44e7fc5:0x42b1cfae697ba4a6,422,Jordan Landing,4.281991
5,0x8752872f3f4b153b:0xd8bb5e82808eedbd,421,IKEA,4.244656
6,0x87528c9d1d641321:0x90cf6256d98a62eb,410,Valley Fair Mall,3.756098
7,0x8752f50023b5e105:0xbf4b78f58640694f,403,The Gateway,4.004963
8,0x8752ff18c387e561:0xf3256e9df9ab3603,397,Lagoon Amusement Park,4.445844
9,0x8752f54425332f67:0x354cd3a2499141d0,388,Liberty Park,4.585052
