In [2]:
import os
from collections import defaultdict
import string
import re
import numpy as np
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
import pandas as pd
from multiprocessing import Pool
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  import sys
  from pandas import Panel


In [3]:
def mkdir(d):
    ''' Create directory `d`, including missing parents, if it is not there yet.

        Args:
            d: path of the directory to create. An empty string (the directory
                part of a bare file name) means the current directory and is
                treated as a no-op.

        Raises:
            FileExistsError: if `d` exists but is not a directory.
    '''
    if not d:
        return
    # exist_ok=True replaces the separate exists() check, so nothing breaks if
    # the directory appears between the check and the creation.
    os.makedirs(d, exist_ok=True)

## Data Loading

We first load the Yelp dataset as well as the annotations from [1].

[1] Hernández-Rubio, M., Cantador, I., Bellogín, A. 2019. A comparative analysis of recommender systems based on item aspect opinions extracted from user reviews. User Modeling and User-Adapted Interaction 29(2), 381-441.

In [41]:
# Load the user table and keep only the columns selected below.
user_columns = ["user_id", "name", "review_count", "yelping_since"]
df_user = pd.read_csv("data/user.csv")[user_columns]
# For quick experiments, subsample with: df_user = df_user.head(1000)

In [25]:
# Load the review table, discard the vote-count columns and rename the
# business key to "item_id".
df_review = (
    pd.read_csv("data/review.csv")
    .drop(columns=["useful", "funny", "cool"])
    .rename(columns={"business_id": "item_id"})
)
# For quick experiments, subsample with: df_review = df_review.head(1000)

In [26]:
# Dataframe for all restaurants.
df_item = pd.read_csv("data/business.csv").rename(columns={"business_id": "item_id"})
# Plain substring test on the category string. na=False makes a business whose
# `categories` is missing (read as NaN) count as "not a restaurant" instead of
# raising TypeError the way `"Restaurants" in nan` does.
is_restaurant = df_item["categories"].str.contains("Restaurants", regex=False, na=False)
item_columns = ["item_id", "name", "city", "state", "postal_code", "latitude", "longitude", "stars"]
# .copy() makes the filtered table an independent frame, so columns can be
# added to it later without writing into a slice of the raw table.
df_item = df_item.loc[is_restaurant, item_columns].copy()
# For quick experiments, subsample with: df_item = df_item.head(1000)

In [27]:
# Aspect vocabulary: one (aspect, word) pair per row.
df_aspects = pd.read_csv("aspect/aspects_restaurants.csv", header=0, names=["aspect", "word"])
aspect_dict = defaultdict(list)   # aspect -> list of its words
aspect_reverse_dict = {}          # word -> aspect (a later row overrides an earlier one)
for aspect_name, aspect_word in zip(df_aspects["aspect"], df_aspects["word"]):
    aspect_dict[aspect_name].append(aspect_word)
    aspect_reverse_dict[aspect_word] = aspect_name

In [28]:
# Sentiment lexicon: one (aspect, word, score) triple per row.
df_so = pd.read_csv("aspect/lexicon_restaurants.csv", header=0, names=["aspect", "word", "score"])
so_dict = defaultdict(dict)   # aspect -> {word: sentiment orientation score}
so_reverse_dict = {}          # word -> aspect (a later row overrides an earlier one)
for aspect_name, so_word, so_score in zip(df_so["aspect"], df_so["word"], df_so["score"]):
    so_dict[aspect_name][so_word] = so_score
    so_reverse_dict[so_word] = aspect_name

In [29]:
# Show the first five rows of the review table.
df_review.head(5)

Unnamed: 0,review_id,user_id,item_id,stars,date,text
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...


## Item Profiling

We propose to use latent factor-based item profiles to model restaurants in the dataset. Concretely, for each item $i$ and aspect $k$ extracted in [1], we compute two values for the (item, aspect) pair by going through all reviews for $i$:

- $\psi_{i,k}$: the proportion of the aspect words in the reviews of $i$ that refer to aspect $k$;
- ${so}_{i,k}$: the average sentiment orientation of the aspect-$k$ opinion words found in the reviews of $i$.

We define the item aspect weight for item $i$ and aspect $k$ to be $$w_{i,k} = \psi_{i,k} \cdot {so}_{i,k}.$$

In [42]:
def text_aspect_counts(text):
    ''' Given a block of text, count the number of words in each aspect.

        The text is lower-cased, ASCII punctuation is removed and the
        remaining whitespace-separated words are looked up in
        aspect_reverse_dict.

        Args:
            text: the text to scan. Anything that is not a str (for example
                the NaN pandas uses for a missing value) is treated as empty.

        Returns: a list of counts with one entry per aspect of so_dict, in
            so_dict's key order. Words whose aspect is not a key of so_dict
            are skipped, because the result has no slot for them.
    '''
    counts = [0] * len(so_dict)
    if not isinstance(text, str):
        return counts

    # re.escape keeps '\', ']', '^' and '-' literal inside the character
    # class; unescaped, the '\' only escapes the ']' and is never stripped.
    words = re.sub('[' + re.escape(string.punctuation) + ']', '', text.lower()).split()

    # Resolve aspect name -> position once per call instead of rebuilding
    # list(so_dict.keys()) for every matching word.
    aspect_index = {name: ix for ix, name in enumerate(so_dict)}
    for word in words:
        aspect_ix = aspect_index.get(aspect_reverse_dict.get(word))
        if aspect_ix is not None:
            counts[aspect_ix] += 1
    return counts

In [43]:
def text_so_measure(text):
    ''' Given a block of text, calculate the sum of sentiment orientation
        values as well as their counts.

        The text is lower-cased, ASCII punctuation is removed and the
        remaining whitespace-separated words are looked up in
        so_reverse_dict / so_dict.

        Args:
            text: the text to scan. Anything that is not a str (for example
                the NaN pandas uses for a missing value) is treated as empty.

        Returns:
            sums: the sum of so values, one entry per aspect of so_dict in
                so_dict's key order
            counts: count of values used for the sums, same layout
    '''
    sums = [0] * len(so_dict)
    counts = [0] * len(so_dict)
    if not isinstance(text, str):
        return sums, counts

    # re.escape keeps '\', ']', '^' and '-' literal inside the character
    # class; unescaped, the '\' only escapes the ']' and is never stripped.
    words = re.sub('[' + re.escape(string.punctuation) + ']', '', text.lower()).split()

    # Resolve aspect name -> position once per call instead of rebuilding
    # list(so_dict.keys()) for every matching word.
    aspect_index = {name: ix for ix, name in enumerate(so_dict)}
    for word in words:
        aspect_name = so_reverse_dict.get(word)
        aspect_ix = aspect_index.get(aspect_name)
        if aspect_ix is None:
            # Word is not in the lexicon (or its aspect has no entry in so_dict).
            continue
        sums[aspect_ix] += so_dict[aspect_name][word]
        counts[aspect_ix] += 1
    return sums, counts

In [44]:
def item_aspect_weights(item_id):
    ''' Calculates w_{i,k} = psi_{i,k} * so_{i,k} wrt. all k for item i.

        Args:
            item_id: value matched against the "item_id" column of df_review.

        Returns: a float array with one weight per aspect of so_dict. It is
            all zeros when the item has no reviews or its reviews contain no
            aspect words (instead of the NaNs a 0/0 division would give).
    '''
    n_aspects = len(so_dict)
    # Rows without review text are dropped before tokenising.
    reviews = df_review.loc[df_review['item_id'] == item_id, 'text'].dropna().tolist()

    # One pass over the reviews gathers everything psi and so need.
    aspect_counts = np.zeros(n_aspects)
    so_sums = np.zeros(n_aspects)
    so_counts = np.zeros(n_aspects)
    for review in reviews:
        aspect_counts += text_aspect_counts(review)
        review_sum, review_count = text_so_measure(review)
        so_sums += review_sum
        so_counts += review_count

    # calculating $\psi$: share of aspect-word occurrences per aspect
    total_aspect_words = aspect_counts.sum()
    if total_aspect_words == 0:
        return np.zeros(n_aspects)
    psi = aspect_counts / total_aspect_words

    # calculating so: mean sentiment per aspect, 0 where no lexicon word occurred
    so = np.divide(so_sums, so_counts, out=np.zeros(n_aspects), where=so_counts > 0)

    return psi * so

In [45]:
# example usage: aspect weight vector of a single restaurant
example_item_id = '5ubokMNw8qfbX2WtxgJG1Q'
item_aspect_weights(example_item_id)

array([ 0.        , -0.01547117,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.0140647 ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.04360056,  0.        ,  0.03656821,  0.        ,  0.        ,
        0.        ,  0.        ,  0.00952072,  0.01172058,  0.        ,
        0.        , -0.00046882,  0.03656821,  0.        ,  0.        ,
        0.00679794, -0.01828411, -0.00738397, -0.0081831 ,  0.00052743,
       -0.00323189,  0.00138666,  0.        , -0.00140647, -0.00229723])

In [37]:
# add column for item profile
# Only the first MAX_ITEMS restaurants are profiled.
MAX_ITEMS = 1000
# Truncate first and take a copy, so the per-row writes below go to an
# independent frame rather than to a slice of the full table.
df_item = df_item.head(MAX_ITEMS).copy()
# The column is created as object dtype so each cell can hold a whole array.
df_item['aspect_weights'] = 0
df_item['aspect_weights'] = df_item['aspect_weights'].astype(object)

for i, row in tqdm_notebook(df_item.iterrows(), total=len(df_item)):
    df_item.at[i, 'aspect_weights'] = item_aspect_weights(row['item_id'])

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

  if __name__ == '__main__':





In [38]:
# Write the item profiles as a tab-separated file, creating the target
# directory first.
item_profile_path = 'saved/item.csv'
item_profile_dir = item_profile_path.rpartition('/')[0]
mkdir(item_profile_dir)
df_item.to_csv(item_profile_path, sep='\t', encoding='utf-8')

## User Profiling

We propose to use implicit aspect-based user profiles to model users. Concretely, we compute a vector of length $K$ for each user $u$, where $K$ is the number of aspects extracted in [1]. Each entry of the vector, which we denote as $w_{u,k}$, is the rating-weighted sum of the aspect weights of the items the user has reviewed: $$w_{u,k} = \sum_{\{(u,i,r), r \neq \emptyset\}} r(u,i) \cdot w_{i,k},$$ where $r(u,i)$ is the star rating that user $u$ gave to item $i$.

In [46]:
def user_aspect_weights(user_id):
    ''' Calculates w_{u,k} wrt. all k for user u.

        Sums, over the user's rows in df_review, the review's star rating
        times the "aspect_weights" of the reviewed item. Reviews whose item
        is not present in df_item are ignored.

        Returns: an array with one entry per aspect of so_dict.
    '''
    is_users_review = df_review["user_id"] == user_id
    user_reviews = df_review.loc[is_users_review, ["item_id", "stars"]]

    profile = np.zeros((len(so_dict),))
    for reviewed_item, stars in zip(user_reviews["item_id"], user_reviews["stars"]):
        matches = df_item[df_item["item_id"] == reviewed_item]
        if not matches.empty:
            # Use the first matching item row.
            profile += float(stars) * matches["aspect_weights"].iloc[0]
    return profile

In [47]:
# example usage: aspect weight vector of a single user
example_user_id = 'bv2nCi5Qv5vroFiqKGopiw'
user_aspect_weights(example_user_id)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])

In [48]:
# add column for user profile
# Only the first MAX_USERS users are profiled.
MAX_USERS = 1000
# Truncate first and take a copy, so the per-row writes below go to an
# independent frame rather than to a slice of the full table.
df_user = df_user.head(MAX_USERS).copy()
# The column is created as object dtype so each cell can hold a whole array.
df_user['aspect_weights'] = 0
df_user['aspect_weights'] = df_user['aspect_weights'].astype(object)

for i, row in tqdm_notebook(df_user.iterrows(), total=len(df_user)):
    df_user.at[i, 'aspect_weights'] = user_aspect_weights(row['user_id'])

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [49]:
# Write the user profiles as a tab-separated file, creating the target
# directory first.
user_profile_path = 'saved/user.csv'
user_profile_dir = user_profile_path.rpartition('/')[0]
mkdir(user_profile_dir)
df_user.to_csv(user_profile_path, sep='\t', encoding='utf-8')

In [51]:
# Also export both profile tables as JSON.
for profile_frame, json_path in ((df_user, 'saved/user.json'), (df_item, 'saved/item.json')):
    profile_frame.to_json(json_path)