# Computing baseline Yelp business ratings for Champaign, IL

This notebook follows guidelines in [this guide to collaborative filtering systems](http://files.grouplens.org/papers/FnT%20CF%20Recsys%20Survey.pdf) to compute simple baselines for recommender system predictions.

In [1]:
%matplotlib inline
import numpy as np
import json
import pandas as pd
import random
import os
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
# Lookup from two-letter state/province code to the metro-region label used in
# this notebook. Several codes share a region (e.g. NC and SC -> 'Charlotte';
# QC, NY and VT -> 'Montreal'). The keys are used to filter businesses in
# get_businesses_df and the values become the 'city' column of df_businesses.
US_CANADA_REGION_BY_STATE_DICT = {
    'AZ': 'Phoenix',
    'NV': 'Las Vegas',
    'ON': 'Toronto',
    'NC': 'Charlotte',
    'SC': 'Charlotte',
    'OH': 'Cleveland',
    'PA': 'Pittsburgh',
    'QC': 'Montreal',
    'NY': 'Montreal',
    'VT': 'Montreal',
    'WI': 'Madison',
    'IL': 'Champaign'
}

In [3]:
# File paths (relative to the notebook's working directory)
# Raw review and business files, read line by line as one JSON object per line.
all_reviews_file = '../raw-data/yelp_academic_dataset_review.json'
all_businesses_file = '../raw-data/yelp_academic_dataset_business.json'
# Cached CSV of Champaign-only reviews; its existence is checked before the
# (slow) extraction from the raw review file is run.
champaign_reviews_file = '../preprocessed-data/all-champaign-reviews.csv'

In [4]:
def get_businesses_df(json_file_name, select_keys=None, us_canada_only=True):
    """Load a JSON-lines business file into a dataframe indexed by business_id.

    Parameters
    ----------
    json_file_name : str
        Path to a text file holding one JSON object per line.
    select_keys : iterable of str, optional
        Keys/columns to keep, in the order given. 'business_id' is always kept
        (it becomes the index). All keys are kept when this is None.
    us_canada_only : bool, default True
        When true, keep only rows whose 'state' is a key of
        US_CANADA_REGION_BY_STATE_DICT.

    Returns
    -------
    pandas.DataFrame
        One row per retained business, indexed by 'business_id'. An empty
        frame (with the requested columns) is returned when no row is retained.

    Raises
    ------
    KeyError
        If a retained row lacks one of the requested keys.
    """
    # Resolve the wanted columns once, preserving the caller's order so the
    # resulting column order is deterministic.
    wanted_keys = None
    if select_keys is not None:
        wanted_keys = []
        for key in list(select_keys) + ['business_id']:
            if key not in wanted_keys:
                wanted_keys.append(key)

    rows = []
    # The dataset is UTF-8; do not depend on the platform default encoding.
    with open(json_file_name, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            row_dict = json.loads(line)
            if us_canada_only and row_dict['state'] not in US_CANADA_REGION_BY_STATE_DICT:
                continue
            if wanted_keys is not None:
                row_dict = {k: row_dict[k] for k in wanted_keys}
            rows.append(row_dict)

    if wanted_keys is not None:
        columns = wanted_keys
    elif not rows:
        # Without rows pandas cannot infer columns; keep the index column available.
        columns = ['business_id']
    else:
        columns = None
    df = pd.DataFrame(rows, columns=columns)
    return df.set_index('business_id')

In [5]:
def get_reviews_df(json_file_name, df_businesses, city=None, max_rows=None,
                   select_keys=None, keep_rand_frac=None, seed=None):
    """Load a JSON-lines review file into a dataframe indexed by review_id.

    Parameters
    ----------
    json_file_name : str
        Path to a text file holding one JSON object per line.
    df_businesses : pandas.DataFrame
        Businesses indexed by business_id with a 'city' column; only consulted
        when ``city`` is set.
    city : str, optional
        Keep only reviews whose business has this value in ``df_businesses['city']``.
        Reviews of businesses absent from ``df_businesses`` are skipped.
    max_rows : int, optional
        Stop once this many reviews have been kept. All rows when None.
    select_keys : iterable of str, optional
        Keys/columns to keep, in the order given. 'review_id', 'user_id' and
        'business_id' are always kept. All keys are kept when this is None.
    keep_rand_frac : float, optional
        If set, each candidate review is kept with this probability.
    seed : optional
        Seed passed to ``random.seed`` when ``keep_rand_frac`` is set.

    Returns
    -------
    pandas.DataFrame
        One row per retained review, indexed by 'review_id'. An empty frame is
        returned when no review is retained.
    """
    if keep_rand_frac is not None:
        # random.seed(None) seeds from system entropy, exactly like random.seed().
        random.seed(seed)

    # Resolve the wanted columns once, preserving the caller's order.
    wanted_keys = None
    if select_keys is not None:
        wanted_keys = []
        for key in list(select_keys) + ['review_id', 'user_id', 'business_id']:
            if key not in wanted_keys:
                wanted_keys.append(key)

    # Pre-compute the ids of businesses in the requested city so each review
    # costs one set lookup instead of a dataframe .loc lookup (which is slow
    # and raises KeyError for businesses missing from df_businesses).
    city_business_ids = None
    if city:
        in_city = (df_businesses['city'] == city).values
        city_business_ids = set(df_businesses.index[in_city])

    rows = []
    # The dataset is UTF-8; do not depend on the platform default encoding.
    with open(json_file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Checked before reading more so that max_rows=0 yields no rows.
            if max_rows is not None and len(rows) >= max_rows:
                break
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            row_dict = json.loads(line)
            if city_business_ids is not None and row_dict['business_id'] not in city_business_ids:
                continue
            if wanted_keys is not None:
                row_dict = {k: row_dict[k] for k in wanted_keys}
            # A random draw is consumed only when sampling is requested.
            if keep_rand_frac is None or random.random() < keep_rand_frac:
                rows.append(row_dict)

    if wanted_keys is not None:
        columns = wanted_keys
    elif not rows:
        # Without rows pandas cannot infer columns; keep the index column available.
        columns = ['review_id']
    else:
        columns = None
    df = pd.DataFrame(rows, columns=columns)
    return df.set_index('review_id')

## Get Businesses Data

In [6]:
# Columns to pull from each raw business record.
keys = ['name', 'state', 'latitude', 'longitude']
# Load every business (no US/Canada filter) and label each with its region;
# states that are not in the lookup table get NaN in 'city'.
df_businesses = get_businesses_df(
    all_businesses_file, select_keys=keys, us_canada_only=False
).assign(city=lambda frame: frame['state'].map(US_CANADA_REGION_BY_STATE_DICT))
print('{} total businesses'.format(len(df_businesses)))
df_businesses.head()

144072 total businesses


Unnamed: 0_level_0,latitude,longitude,name,state,city
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0DI8Dt2PJp07XkVvIElIcQ,33.378214,-111.936102,Innovative Vapors,AZ,Phoenix
LTlCaCGZE14GuaUXUGbamg,36.192284,-115.159272,Cut and Taste,NV,Las Vegas
EDqCEAGXVGCH4FJXgqtjqg,43.661054,-79.429089,Pizza Pizza,ON,Toronto
cnGIivYRLxpF7tBVR_JwWA,40.444544,-80.17454,Plush Salon and Spa,PA,Pittsburgh
cdk-qqJ71q6P7TJTww_DSA,43.659829,-79.375401,Comfort Inn,ON,Toronto


## Get Reviews Data

In [7]:
if not os.path.isfile(champaign_reviews_file):
    # No cached Champaign reviews CSV yet: extract it from the raw review file.
    # This was reported to take about 20 minutes. (A `%%time` cell magic cannot
    # be used here: cell magics are only valid as the first line of a cell.)
    keys = ['stars']
    df_reviews = get_reviews_df(all_reviews_file, df_businesses, city='Champaign', select_keys=keys)
    # Make sure the output directory exists before caching the result.
    os.makedirs(os.path.dirname(champaign_reviews_file) or '.', exist_ok=True)
    df_reviews.to_csv(champaign_reviews_file)
else:
    # Reuse the cached extraction; restore review_id as the index.
    df_reviews = pd.read_csv(champaign_reviews_file).set_index('review_id')
print('{} reviews'.format(len(df_reviews)))

df_reviews.head()

29874 reviews


Unnamed: 0_level_0,business_id,stars,user_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3xGR24wD5ILntyX2UXZWTA,1DedueD53YsKcpqMWPIe9w,3,OkZk0I2S6mcMOtjSP12U_A
2_BvxFBvtyMKjNf3gzmbqw,1DedueD53YsKcpqMWPIe9w,4,8f9m9EdA6M5Jr-sqdPrc5A
V926hjwKcbT-ZVJOwSeXnQ,1DedueD53YsKcpqMWPIe9w,2,oJl-C8UECsibhHS2dB8yzQ
HBcuWFsU-6VrvzYZUDZQzA,1DedueD53YsKcpqMWPIe9w,5,XLwW5c_194tekHBy6ee7eg
fnO18xosHxN5E8tKtI3AGA,1DedueD53YsKcpqMWPIe9w,5,qJfW5-Z890LfBV62xDqzUQ


In [8]:
# Count the distinct reviewers and distinct reviewed businesses.
n_users = len(df_reviews['user_id'].unique())
n_businesses = len(df_reviews['business_id'].unique())
print('Number of users = ' + str(n_users) + ' | Number of businesses = ' + str(n_businesses))

Number of users = 10022 | Number of businesses = 1556


In [9]:
# Global average star rating over every loaded review.
mean_rating = df_reviews.loc[:, 'stars'].mean()
print('Mean rating: {}'.format(mean_rating))

Mean rating: 3.6055432817834907


In [10]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

## Predict with the average rating of the training data

This uses the simple baseline prediction of $b_{u,i} = \mu$ as mentioned in 2.1 of the collaborative filtering guide.

In [11]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)
errors = []
for i, (train_index, test_index) in enumerate(kf.split(df_reviews)):
    df_train = df_reviews.iloc[train_index, :]
    df_test = df_reviews.iloc[test_index, :]
    # Constant prediction: every test review gets the training-fold mean rating.
    y_pred = np.full(len(df_test), df_train['stars'].mean())
    err = rmse(df_test['stars'], y_pred)
    errors.append(err)
    print('Fold {}: RMSE = {}'.format(i, err))

print('\nAverage Error: {}'.format(np.mean(errors)))

Fold 0: RMSE = 1.4127093492278564
Fold 1: RMSE = 1.419392931917939
Fold 2: RMSE = 1.4234855168786251
Fold 3: RMSE = 1.4185792824843677
Fold 4: RMSE = 1.4139278697584874

Average Error: 1.417618990053455


## Predict with User's average rating

This baseline averages all of the user's ratings to give the prediction for each rating. This is $b_{u,i} = \bar{r_u}$ as mentioned in section 2.1.

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)
errors = []
for i, (train_index, test_index) in enumerate(kf.split(df_reviews)):
    df_train, df_test = df_reviews.iloc[train_index,:], df_reviews.iloc[test_index,:]
    # Fallback prediction for users with no review in the training fold.
    train_mean = df_train['stars'].mean()
    # Average only the 'stars' column: df_train also holds the string column
    # 'business_id', which recent pandas refuses to average in groupby().mean().
    user_means = df_train.groupby('user_id')[['stars']].mean().rename(columns={'stars': 'mean_user'})
    # Fill only the prediction column so no other column is altered by accident.
    df_test = df_test.join(user_means, on='user_id').fillna({'mean_user': train_mean})
    err = rmse(df_test['stars'], df_test['mean_user'])
    errors.append(err)
    print('Fold {}: RMSE = {}'.format(i, err))

print('\nAverage Error: {}'.format(np.mean(errors)))

Fold 0: RMSE = 1.4905474126401397
Fold 1: RMSE = 1.4919646222610048
Fold 2: RMSE = 1.496843610432585
Fold 3: RMSE = 1.4840542152537801
Fold 4: RMSE = 1.4681452332359883

Average Error: 1.4863110187646995


## Predict with Business's average rating

This is $b_{u,i} = \bar{r_i}$ from section 2.1.

In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)
errors = []
for i, (train_index, test_index) in enumerate(kf.split(df_reviews)):
    df_train, df_test = df_reviews.iloc[train_index,:], df_reviews.iloc[test_index,:]
    # Fallback prediction for businesses with no review in the training fold.
    train_mean = df_train['stars'].mean()
    # Average only the 'stars' column: df_train also holds the string column
    # 'user_id', which recent pandas refuses to average in groupby().mean().
    bus_means = df_train.groupby('business_id')[['stars']].mean().rename(columns={'stars': 'mean_bus'})
    # Fill only the prediction column so no other column is altered by accident.
    df_test = df_test.join(bus_means, on='business_id').fillna({'mean_bus': train_mean})
    err = rmse(df_test['stars'], df_test['mean_bus'])
    errors.append(err)
    print('Fold {}: RMSE = {}'.format(i, err))

print('\nAverage Error: {}'.format(np.mean(errors)))

Fold 0: RMSE = 1.307568254149668
Fold 1: RMSE = 1.2963818607924362
Fold 2: RMSE = 1.3044196993787507
Fold 3: RMSE = 1.3071222719786242
Fold 4: RMSE = 1.3168544145379486

Average Error: 1.3064693001674854


## Predict with a business and user baseline

This is equation 2.1 from section 2.1 of the collaborative filtering guide. The equation is:

$$b_{u,i} = \mu + b_u + b_i$$

where

$$b_u = \frac{1}{|I_u|}\sum_{i \in I_u} (r_{u,i} - \mu)$$

and

$$b_i = \frac{1}{|U_i|}\sum_{u \in U_i} (r_{u,i} - b_u - \mu)$$

(See equations 2.2 and 2.3)

In [14]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)
errors = []
for i, (train_index, test_index) in enumerate(kf.split(df_reviews)):
    df_train = df_reviews.iloc[train_index, :]
    df_test = df_reviews.iloc[test_index, :]
    # mu: mean of all ratings in the training fold
    train_mean = df_train['stars'].mean()
    # b_u: each user's mean rating minus mu
    df_train_user = (
        df_train.groupby('user_id')[['stars']]
        .mean()
        .rename(columns={'stars': 'user_mean'})
    )
    df_train_user = df_train_user.assign(b_u=df_train_user['user_mean'] - train_mean)
    # Attach the reviewer's b_u to every training review
    df_train = df_train.join(df_train_user['b_u'], on='user_id')
    # Per-review residual (r - b_u - mu); its per-business mean is b_i
    df_train = df_train.assign(b_i=df_train['stars'] - df_train['b_u'] - train_mean)
    df_train_bus = df_train.groupby('business_id')[['b_i']].mean()
    # Look up b_u and b_i for the test reviews; values missing after each join
    # are filled with the mean of the corresponding training offsets
    user_fill = df_train_user['b_u'].mean()
    df_test = df_test.join(df_train_user['b_u'], on='user_id').fillna(user_fill)
    bus_fill = df_train_bus['b_i'].mean()
    df_test = df_test.join(df_train_bus['b_i'], on='business_id').fillna(bus_fill)
    # Predict b_u + b_i + mu and score the fold
    y_pred = df_test['b_u'] + df_test['b_i'] + train_mean
    err = rmse(df_test['stars'], y_pred)
    errors.append(err)
    print('Fold {}: RMSE = {}'.format(i, err))

print('\nAverage Error: {}'.format(np.mean(errors)))

Fold 0: RMSE = 1.4235157070620779
Fold 1: RMSE = 1.4260768578617162
Fold 2: RMSE = 1.428865294847576
Fold 3: RMSE = 1.4100757175658603
Fold 4: RMSE = 1.4099408738397532

Average Error: 1.4196948902353967


## Predict with a business and user baseline including damping terms

This follows the same equation 2.1 from above ($b_{u,i} = \mu + b_u + b_i$), but $b_u$ and $b_i$ are defined in a roughly Bayesian way. Specifically, damping factors are added to shrink them toward zero, so that the fewer reviews a user or business has, the closer its baseline is to the global average. The equations for $b_u$ and $b_i$ here are

$$b_u = \frac{1}{|I_u| + \beta_u}\sum_{i \in I_u} (r_{u,i} - \mu)$$

and

$$b_i = \frac{1}{|U_i| + \beta_i}\sum_{u \in U_i} (r_{u,i} - b_u - \mu)$$

(See equations 2.4 and 2.5.) Here $\beta_u$ and $\beta_i$ are damping factors; the guide reports that 25 is a good value. A value of 5 seems to work better in this case, though, possibly because this is a small dataset.

In [15]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)
beta_u, beta_i = 5, 5
errors = []
for i, (train_index, test_index) in enumerate(kf.split(df_reviews)):
    df_train = df_reviews.iloc[train_index, :]
    df_test = df_reviews.iloc[test_index, :]
    # mu: mean of all ratings in the training fold
    train_mean = df_train['stars'].mean()
    # Per-user rating sum and review count
    user_group = df_train[['user_id', 'stars']].groupby('user_id')
    df_user_counts = pd.DataFrame(user_group.size(), columns=['count'])
    df_train_user = pd.concat(
        [user_group.sum().rename(columns={'stars': 'sum'}), df_user_counts],
        axis=1,
    )
    # b_u = sum_i(r - mu) / (|I_u| + beta_u), written as (sum - mu * count) / (count + beta_u)
    user_numerator = df_train_user['sum'] - train_mean * df_train_user['count']
    df_train_user['b_u'] = user_numerator / (df_train_user['count'] + beta_u)
    # Attach the reviewer's b_u to every training review
    df_train = df_train.join(df_train_user['b_u'], on='user_id')
    # Term inside the b_i summation: r - b_u - mu
    df_train['b_i_sum'] = df_train['stars'] - df_train['b_u'] - train_mean
    # b_i = per-business sum of those terms / (|U_i| + beta_i)
    bus_group = df_train[['business_id', 'b_i_sum']].groupby('business_id')
    df_train_bus = bus_group.sum().rename(columns={'b_i_sum': 'b_i'})
    df_train_bus['b_i'] = df_train_bus['b_i'] / (bus_group.size() + beta_i)
    # Look up b_u and b_i for the test reviews; values missing after each join
    # are filled with the mean of the corresponding training offsets
    user_fill = df_train_user['b_u'].mean()
    df_test = df_test.join(df_train_user['b_u'], on='user_id').fillna(user_fill)
    bus_fill = df_train_bus['b_i'].mean()
    df_test = df_test.join(df_train_bus['b_i'], on='business_id').fillna(bus_fill)
    # Predict b_u + b_i + mu and score the fold
    y_pred = df_test['b_u'] + df_test['b_i'] + train_mean
    err = rmse(df_test['stars'], y_pred)
    errors.append(err)
    print('Fold {}: RMSE = {}'.format(i, err))

print('\nAverage Error: {}'.format(np.mean(errors)))

Fold 0: RMSE = 1.2694763779129867
Fold 1: RMSE = 1.2686029812582706
Fold 2: RMSE = 1.2803895811347372
Fold 3: RMSE = 1.266922460881656
Fold 4: RMSE = 1.2690870521819977

Average Error: 1.2708956906739295
