In [81]:
import gzip
import json
import numpy as np
from collections import defaultdict
import tensorflow as tf
import random
import sklearn

import gzip
import random
import scipy
import tensorflow as tf
from collections import defaultdict
# from implicit import bpr
# from surprise import SVD, Reader, Dataset
# from surprise.model_selection import train_test_split

In [82]:
### Read in data

In [83]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = json.loads(l)

        yield d

# For REVIEW data
def readJSON_1(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = json.loads(l)
        u = d['user_id']
        n = d['name']
        b = d['gmap_id']  # businessID
        r = d['rating']
        yield u,b,r,n,d

# For BUSINESS META-data
def readJSON_2(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = json.loads(l)
        b      = d['gmap_id']  # businessID
        cat    = d['category']
        coords = (d['latitude'], d['longitude'])
        name   = d['name']
        #state = d['state']   #<<< shouldn't be "permanently closed"
        yield b,name,cat,coords
        #yield b,cat,coords


In [84]:
allReviews = []
for l in readJSON("review-District_of_Columbia_10.json.gz"):
    allReviews.append(l)

allMeta = []
for l in readJSON("meta-District_of_Columbia.json.gz"):
    allMeta.append(l)


In [85]:
# Reviews by Business ID
reviewById = defaultdict(list)

for review in allReviews:
    gmap_id = review["gmap_id"]
    reviewById[gmap_id].append(review)


# Reviews by Business ID
metaById = defaultdict(list)

for meta in allMeta:
    gmap_id = meta["gmap_id"]
    metaById[gmap_id].append(meta)

In [86]:
allMetaData = []
for l in readJSON_2("meta-District_of_Columbia.json.gz"):
    allMetaData.append(l)

In [87]:
### Find the restaurants in the dataset
word_list = ["restaurant", "food", "pizza", "juice", "dessert", "takeout", "sandwich", "diner", "bar", "cocktail", "coffee", "cafe", "deli"]  #EXPAND, NON-MANUALLY?
word_blacklist = ["barber shop", "eyebrow bar"]

restaurant_gmapIDs = set()

count=0
for i in range(len(allMetaData)):
    if(allMetaData[i][2]):
        category_list = [word.lower() for word in allMetaData[i][2]]
    else:
        category_list = []

    contains_word             = any(any(word in s for word in word_list) for s in category_list) # true if any word in word_list is contained in somewhere in category_list
    contains_blacklisted_word = any(any(word in s for word in word_blacklist) for s in category_list)

    if(category_list and contains_word and not contains_blacklisted_word):
        #print(category_list)
        restaurant_gmapIDs.add(allMetaData[i][0])

        count+=1

In [88]:
### Getting rid of repeat entries:

encountered_ids = set()
restaurantMetaData = []

for entry in allMetaData:
    entry_ID = entry[0]
    if entry_ID in restaurant_gmapIDs and entry_ID not in encountered_ids:
        restaurantMetaData.append(entry)
        encountered_ids.add(entry_ID)

In [89]:
### For every reviewed place that is a restaurant

allRestaurantReviews = []
for review in allReviews:
    u = review["user_id"]
    b = review["gmap_id"]
    r = review["rating"]

    if b in restaurant_gmapIDs:
        allRestaurantReviews.append((u,b,r))

In [90]:
userIDs = {}
bizIDs = {}
interactions = []

for u,b,r in allRestaurantReviews:
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not b in bizIDs: bizIDs[b] = len(bizIDs)
    interactions.append((u,b,r))

In [91]:
random.shuffle(interactions)

In [92]:
nTrain = int(len(interactions) * 0.8)
nTest = int(len(interactions) * 0.05)
nValid = int(len(interactions) * 0.15)
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:nTrain+nTest]
interactionsValid = interactions[nTrain+nTest:]
labelsTest = [r for _,_,r in interactionsTest]
labelsValid = [r for _,_,r in interactionsValid]

In [93]:
usersPerBiz = defaultdict(list)
bizsPerUser = defaultdict(list)
for u,b,r in interactionsTrain:
    bizsPerUser[u].append(b)
    usersPerBiz[b].append(u)

In [94]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [95]:
#############################
## Bias Model              ##
#############################

In [96]:
mu = sum([r for _,_,r in interactionsTrain]) / len(interactionsTrain)

In [97]:
class LatentFactorModelBiasOnly(tf.keras.Model):
    def __init__(self, mu, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(bizIDs)],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i]
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        pred = self.alpha + beta_u + beta_i
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [98]:
optimizer = tf.keras.optimizers.Adam(0.1)

In [99]:
modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.00001)

In [100]:
def trainingStepBiasOnly(model, interactions):
    Nsamples = int(nTrain/1)
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(bizIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
        (grad, var) in zip(gradients, model.trainable_variables)
        if grad is not None)
    return loss.numpy()

In [101]:
old_error = 10
for i in range(15):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9): 
        print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.3765776


In [102]:
biasOnlyPredictions =\
    [modelBiasOnly.predict(userIDs[u],bizIDs[i]).numpy() for u,i,_ in interactionsTest]

In [103]:
MSE(biasOnlyPredictions, labelsTest)

0.7694666557289634

In [104]:
#############################
## Baseline (pred mean)    ##
#############################

In [105]:
alwaysPredictMean = [4.3 for _ in interactionsTest]

In [106]:
MSE(alwaysPredictMean, labelsTest)

0.9443056275559695

In [107]:
#############################
## Latent Factor Model     ##
#############################

In [108]:
mu = sum([r for _,_,r in interactionsTrain]) / len(interactionsTrain)

In [109]:
optimizer = tf.keras.optimizers.Adam(0.1)

In [110]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(bizIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(bizIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [111]:
gamma_size = 2
modelLFM = LatentFactorModel(mu, gamma_size, 0.00001)

In [112]:
def trainingStep(model, interactions):
    Nsamples = int(nTrain/1)
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(bizIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [113]:
for i in range(15):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 5 == 4): 
        print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 5, objective = 0.3883172
iteration 10, objective = 0.36982706
iteration 15, objective = 0.35706365


In [114]:
LFMPredictions = [modelLFM.predict(userIDs[u],bizIDs[i]).numpy() for u,i,_ in interactionsValid]

In [115]:
MSE(LFMPredictions, labelsValid)

0.7642740458451497