In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
#!pip3 install faiss-cpu

In [3]:
#import faiss

In [4]:
from tqdm import tqdm

## Data Loading

Load yelp dataset along with pre-computed user and item vectors.

In [5]:
# read datasets
df_item = pd.read_json('saved/item.json')
df_user = pd.read_json('saved/user.json')

In [6]:
df_review = pd.read_csv('data/review.csv')

In [7]:
df_item.head(n=1)

Unnamed: 0,item_id,name,city,state,postal_code,latitude,longitude,stars,aspect_weights
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,"[0.0005837712, 0.0022712347000000003, 0.0, 0.0..."


In [8]:
df_user.head(n=1)

Unnamed: 0,user_id,name,review_count,yelping_since,aspect_weights
0,JJ-aSuM4pCFPdkfoZ34q0Q,Chris,10,2013-09-24,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
df_review.head(n=1)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0


## Preparations

Load index for fast nearby user retrieval and function for converting user information to feature vectors.

In [32]:
# initialize the user index
d = len(df_user.iloc[0]['aspect_weights'])
index = []
#index = faiss.IndexFlatIP(d)

In [33]:
# populate the index with user vectors
user_vectors = np.array(df_user['aspect_weights'].to_list()).astype(np.float32)
user_vectors = user_vectors / np.linalg.norm(user_vectors, axis=1, keepdims=True)
index.append(user_vectors)

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
def get_user_vector(reviews):
    # sample input: [{"item_id": "xyz", "rating": 5}]
    # output: user vector
    w_uk = np.zeros((d,))
    for review in reviews:
        item_row = df_item.loc[df_item["item_id"] == review["item_id"]]
        if len(item_row) == 0:
            continue
        w_uk += float(review["rating"]) * np.array(item_row.iloc[0]["aspect_weights"]).astype(float)
    return w_uk / np.linalg.norm(w_uk)

In [14]:
user_vector = get_user_vector([{"item_id": "XOSRcvtaKc_Q5H1SAzN20A", "rating": 5}])

## Item Rating Prediction

Inspired by section 4.2.2 of [1], we design our method for predicting ratings given (user, item) pairs. The rough procedure for predicting the ratings is described as follows:

* We denote the target user and item as $u_m$ and $i_n$.
* We find a list of all neighboring users $N_k(u_m)$ of the target user, where distance is measured by cosine similarity.
* For each nearby user $u'$, wee find all reviews $(u', i', r) \in R(u')$ made by this user.
* For each user, we save the tuple $(\text{sim}(u_m, u'), \text{sim}(i_n, i'), r)$ in a list `score_info`.
* We compute the predicted score using the `score_info` list by taking the weighted some of the scores $r$ by weights $\text{sim}(u_m, u') \cdot \text{sim}(i_n, i')$.

Concerely, we use the following formula for item score prediction: $$ \hat{r}(u_m, i_n) = \frac{\sum_{u' \in N_k(u_m)} \sum_{(u', i', r) \in R(u')} \text{sim}(u_m, u') \cdot \text{sim}(i_n, i') \cdot r}{\sum_{u' \in N_k(u_m)} \sum_{(u', i', r) \in R(u')} \text{sim}(u_m, u') \cdot \text{sim}(i_n, i')}.$$

[1] Hernández-Rubio, M., Cantador, I., Bellogín, A. 2019. A comparative analysis of recommender systems based on item aspect opinions extracted from user reviews. User Modeling and User-Adapted Interaction 29(2), 381-441.

In [50]:
def rate_item(user_vector, item_id, k=10):
    # fetch item vector
    item_vector = np.array(df_item[df_item["item_id"] == item_id].iloc[0]["aspect_weights"]).astype(float)

    # get the nearest k users given the user vector
    nearest_user_dists = user_vectors.search(user_vector.reshape((1,d)).astype(np.float32), k)
    nearest_users = index[0:5]
    print(nearest_users)
    #index.search(user_vector.reshape((1,d)).astype(np.float32), k)
    nearest_user_dists = nearest_user_dists / np.linalg.norm(nearest_user_dists)

    score_info = []

    # for each nearby user...
    for user_ix in nearest_users[0]:
        # get his/her similarity value the queried user
        user_id = df_user.iloc[user_ix]["user_id"]
        nearby_vector = np.array(df_user[df_user["user_id"] == user_id].iloc[0]["aspect_weights"]).astype(float)
        user_sim = np.dot(user_vector, nearby_vector)/(np.linalg.norm(user_vector) * np.linalg.norm(nearby_vector))

        # iterate all his/her reviews
        reviews = df_review[df_review["user_id"] == user_id]
        for _, review in reviews.iterrows():
            # record the (user similarity, item similarity, reviewed score) tuple
            item = df_item[df_item["item_id"] == review["business_id"]]
            if len(item) == 0:
                continue
            review_vector = np.array(item.iloc[0]["aspect_weights"]).astype(float)
            item_sim = np.dot(item_vector, review_vector)/(np.linalg.norm(item_vector) * np.linalg.norm(review_vector))
            score_info.append([user_sim, item_sim, review["stars"]])

    # accumulate results
    weights = np.array([x[0] * x[1] for x in score_info])
    weights /= np.sum(weights)
    ratings = np.array([x[2] for x in score_info])
    return weights.dot(ratings)

In [51]:
rate_item(user_vector, "XOSRcvtaKc_Q5H1SAzN20A")

AttributeError: 'numpy.ndarray' object has no attribute 'search'

In [13]:
def predict_score(user, item_id):
    # sample user input: { 
    #     "latitude": 40,
    #     "longitude": -80,
    #     "...": (other info),
    #     "reviews": [
    #         {
    #             "item_id": "A",
    #             "rating": 4.5
    #         }
    #     ]
    # }
    return rate_item(get_user_vector(user["reviews"]), item_id)