# 4. Collaborative Filtering - Item Based

In [1]:
# imports

import pandas as pd
import numpy as np
import json
import gzip

In [2]:
# define file paths

path_meta = 'original_data/meta-Utah.json.gz'
path_pivot_subset = 'data/subset.parquet'
path_reviews_subset = 'data/subset_reviews.parquet'
path_user_sim = 'data/subset_user_sim.parquet'
path_item_sim = 'data/subset_item_sim.parquet'

In [3]:
# import the item similarities

item_sim = pd.read_parquet(path_item_sim)

In [4]:
# import the reviews for the 10,000 subset of users

reviews = pd.read_parquet(path_reviews_subset)

### Helper functions

In [5]:
# define a function for reading the data using a generator

def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l)

# define a function to get all the reviews for a user, sorted by rating in descending order

def get_user_rated_sorted(user_id, df):
    user_reviews = df[df['user_id'] == user_id]
    user_rated = dict(zip(user_reviews['gmap_id'], user_reviews['rating']))
    user_rated = pd.DataFrame(list(user_rated.items()), columns=['gmap_id', 'rating'])
    user_rated.sort_values(by='rating', ascending=False, inplace=True)
    return user_rated

# define a function to get a user's favorite places, favorites defined as being rated 4 or higher

def get_favorites(user_id, df):
  user_rated_sorted = get_user_rated_sorted(user_id, df)
  favorites = user_rated_sorted[user_rated_sorted['rating'] >= 4]
  return favorites

# define a function to get n most popular businesses, popular determined as 1) the most # of reviews and 2) highest average review

def get_popular(n, df):
  popular = df.groupby('gmap_id')['rating'].agg(['count','mean']).sort_values(by=['count','mean'], ascending=False)
  return popular.head(n)

# create a function to return the business name using the gmap_id

def get_business_name(gmap_id):
  meta_generator = parse(path_meta)
  for place in meta_generator:
    if place.get('gmap_id') == gmap_id:
      name = place['name']
      break
  return name

# create a function to return the business rating using the gmap_id

def get_business_rating(gmap_id):
  meta_generator = parse(path_meta)
  for place in meta_generator:
    if place.get('gmap_id') == gmap_id:
      avg_rating = place['avg_rating']
      break
  return avg_rating

### Recommender function

In [25]:
def item_based_recommendations(user_id, df, item_sim, n_recs):
  """
  Accepts the following parameters and returns a DataFrame containing n_recs recommendations.

  user_id = user_id
  df = the pivoted table
  item_sim = the item similarities matrix
  n_recs = the number of recommendations desired
  """

  # get the list of what the user already rated
  user_rated_sorted = get_user_rated_sorted(user_id, df)

  # get favorites for user
  user_favs = get_favorites(user_id, df)

  # create an empty dictionary to add recommendations to
  recs_dict = {}

  # for each favorite place in the user's list of favorite place:
  for fav in user_favs.values:
    fav = fav[0]

    # get the similar places with cosine similarity > 0
    user_item_sims = item_sim.loc[fav].sort_values(ascending=False)[1:]
    similar_places = user_item_sims[user_item_sims.values > 0]

    # for each of the similar places:
    for index, value in similar_places.items():
      place = index

      # if they are not in the list of places the user has already rated
      if place not in user_rated_sorted and place not in recs_dict:

      # then add the place's gmap_id and the similarity, to the dictionary.
        recs_dict[place] = similar_places[place]

      # if the place is already in the dictionary, then add the cosine similarity to the existing similarity
      if place in recs_dict:
        recs_dict[place] += similar_places[place]

  # turn the dictionary into a dataframe
  recs_df = pd.DataFrame(list(recs_dict.items()), columns=['gmap_id','similarity'])

  # sort the dataframe by the total cosine similarity, from highest to lowest
  recs_df.sort_values(by='similarity', ascending=False, inplace=True)

  # narrow down the recommendation to n * 2 by using .head
  recs_df = recs_df.head(n_recs*2)

  # for each place, add the business name
  recs_df['name'] = recs_df['gmap_id'].apply(get_business_name)

  # for each place, add the average rating
  recs_df['avg_rating'] = recs_df['gmap_id'].apply(get_business_rating)

  # sort by 1) similarity, 2) avg_rating
  recs_df.sort_values(by=['similarity','avg_rating'], ascending=False, inplace=True)

  # remove businesses that have an average rating under 3.5 stars
  recs_df = recs_df[recs_df['avg_rating'] > 3.5]

  # limit the length using head of n_recs
  recs = recs_df.head(n_recs)

  # calculate the difference between the length of list of recommendation and n_recs
  diff = n_recs - recs['gmap_id'].count()

  # if the difference is 0, return the list of recommendations
  if diff == 0:
    return recs

  # else create an additional list of recommendations using get_popular, with n = the difference
  else:
    popular = get_popular(diff, df)

    # name the columns and then resent index
    popular.columns = ['similarity','avg_rating']
    popular.reset_index(inplace=True)

    # add the business name to the df
    popular['name'] = popular['gmap_id'].apply(get_business_name)

    # reorder the columns to match the recs df
    popular = popular[['gmap_id','similarity','name','avg_rating']]


  # add the additional list to the bottom of the original list of recommendations
    recs = pd.concat([recs, popular], ignore_index=True)

  return recs


In [26]:
# test the function 1

item_based_recommendations('104620742288190585924', reviews , item_sim, 10)

Unnamed: 0,gmap_id,similarity,name,avg_rating
1,0x874d81d1df0ab325:0x71220a9a0bb365cb,2.0,Garrin's Automotive,4.8
0,0x87528bb327861fcb:0xa06c6e305d8be7bb,2.0,Oasis Auto Body & Paint,4.4
1773,0x874d84c9dab436d7:0x6bb637732438c490,1.260887,Megaplex Theatres at Geneva,4.6
3452,0x87527df5552f7b23:0x67363fb56d60357d,1.23698,Ace Hardware Draper,4.2
1721,0x874d9acec9505d19:0xced1f5974ee87df1,1.211907,Cafe Rio Mexican Grill,3.7
566,0x87528714ef42f279:0xec14cbcc56dfb3e6,1.150476,Penny Ann's Cafe,4.6
598,0x87528767bf528631:0xde2437da161bacf8,1.112383,Papa Murphy's | Take 'N' Bake Pizza,4.2
1601,0x874d9a623fec0de1:0x79a14957b6fda37a,1.101236,Sushi Ya,4.0
5168,0x87520c5a915d2e49:0x7b58624e97b0e7b,1.085829,Jordanelle General Store,4.5
5147,0x875287688e714781:0x2cefecde4c4e0c1d,1.085829,Dillman Square,4.4


In [27]:
# test the function 2

item_based_recommendations('12345', reviews , item_sim, 10)

  recs = pd.concat([recs, popular], ignore_index=True)


Unnamed: 0,gmap_id,similarity,name,avg_rating
0,0x875289bef87fcb6b:0x9e865a4dadee3648,496,Fashion Place,4.417339
1,0x8752f508b7dc56a9:0x13d77d6e854d7e79,482,City Creek Center,4.491701
2,0x8752879236e85383:0xbaf443f16b40940c,458,Loveland Living Planet Aquarium,4.624454
3,0x8752f51b7961dc89:0x73f1a804eb909466,440,Hogle Zoo,4.506818
4,0x87528e94d44e7fc5:0x42b1cfae697ba4a6,422,Jordan Landing,4.281991
5,0x8752872f3f4b153b:0xd8bb5e82808eedbd,421,IKEA,4.244656
6,0x87528c9d1d641321:0x90cf6256d98a62eb,410,Valley Fair Mall,3.756098
7,0x8752f50023b5e105:0xbf4b78f58640694f,403,The Gateway,4.004963
8,0x8752ff18c387e561:0xf3256e9df9ab3603,397,Lagoon Amusement Park,4.445844
9,0x8752f54425332f67:0x354cd3a2499141d0,388,Liberty Park,4.585052


In [28]:
# test the function 3

item_based_recommendations('111717473911684632928', reviews , item_sim, 50)

Unnamed: 0,gmap_id,similarity,name,avg_rating
0,0x8752f796b56c557d:0x54742b5acc6749f6,2.0,"Christine Packard, PAC | Utah Family Practice",4.9
4,0x875303405f87384b:0x1a63d54bda594a90,2.0,"Sheryl Bingham, FNP | Medical Weight Loss Expert",4.9
1,0x87530f257acbaf01:0xa9cd9057d50e7927,2.0,Sawyer Lock and Key,4.9
3,0x87530ec0a9b4b56f:0x9827798770b3895d,2.0,Shaidz Hair Salon,4.6
2,0x87530f23e3ee22f9:0xf7cc14b639406420,2.0,Mark-A-Newt Appliance Specialists,4.5
6,0x8753bad2cb88cbad:0x58c01b9cbb9ffd25,2.0,Magpie Campground,4.1
5,0x8753bad6f0c99bd3:0x8e5cc8a326c852ba,2.0,Botts Campground,4.0
1885,0x8752890fa6ea0757:0x2cc1b06be7651e2e,1.86101,Momentum Electric,5.0
1883,0x874d9715e96fdf95:0xca64169390b79798,1.86101,Treeline,4.8
1875,0x874d9b3219acad3d:0x34b4fc26e6a5b6db,1.86101,Simple Elegance Rock Shop,4.8
