In [235]:
import gzip
import matplotlib.pyplot as plt
import numpy
import random
import scipy
import tensorflow as tf
from collections import defaultdict
from scipy.spatial import distance
import dateutil
import os
from sklearn.metrics import roc_auc_score
import json, time
import pandas as pd
def parseData(fname):
    for l in open(fname):
        yield eval(l)
def readDataFull(path):
    data = []
    for line in gzip.open(path):
        d = eval(line)
        data.append(d)  
    return data

In [236]:
dataFolder = 'data/filtered_data/'
users_file_name = dataFolder + 'filtered_users.json'
places_file_name = dataFolder + 'filtered_places.json'
reviews_file_name = dataFolder + 'filtered_reviews.json'

In [237]:
data_user  = pd.DataFrame(json.load(open(users_file_name)))
data_places = pd.DataFrame(json.load(open(places_file_name)))
data_reviews  = pd.DataFrame(json.load(open(reviews_file_name)))

In [238]:
userIDs = {}
itemIDs = {}
interactions = []
interactionsPerUser = defaultdict(list)
userVisitedPlaces = defaultdict(set)
uniquePlaces = set()
for _i, d in data_reviews.iterrows():
    u = d['gPlusUserId']
    i = d['gPlusPlaceId']
    t = d['unixReviewTime']
    r = d['rating']
    uniquePlaces.add(i)
    #dt = dateutil.parser.parse(t)
    #t = int(dt.timestamp())
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((t,u,i,r))
    interactionsPerUser[u].append((t,i,r))
    userVisitedPlaces[u].add(i)

In [239]:
len(interactions)

308846

In [240]:
interactions.sort()

In [241]:
userInteractionAvg = sum ([len(interactionsPerUser[u]) for u in interactionsPerUser])  / len(interactionsPerUser)
userInteractionAvg

15.4423

In [242]:
itemIDs['dummy'] = len(itemIDs)
uniquePlacesList = list(uniquePlaces)

In [243]:
interactionstrain = []
interactionstest = []
for u in interactionsPerUser:
    interactionsPerUser[u].sort()
    list_users = interactionsPerUser[u]
    lastItem = 'dummy'
    for (t,i,r) in list_users[:-1]:
        interactionstrain.append((t,u,i,lastItem,r)) # positive example
        #getrandomPlace = place_not_visited(u)
        #interactionstrain.append((t,u,getrandomPlace,lastItem,-1)) # negative example
        lastItem = i
    (t,i,r) = list_users[-1]
    #getrandomPlace = place_not_visited(u)
    interactionstest.append((t,u,i,lastItem,r))
    #interactionstest.append((t,u,getrandomPlace,lastItem,-1))
    lastItem = i

In [244]:
nUsers,nItems = len(userIDs),len(itemIDs)
items = list(itemIDs.keys())
print(nUsers,nItems)
print(len(interactionstrain))
print(len(interactionstest))

20000 218133
288846
20000


In [245]:
itemsPerUser = defaultdict(set)
for _,u,i,_,_ in interactionstrain:
    itemsPerUser[u].add(i)

In [246]:
optimizer = tf.keras.optimizers.Adam(0.02)

In [294]:
class FPMC(tf.keras.Model):
    def __init__(self, K, lamb, UI = 1, IJ = 1):
        super(FPMC, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaUI = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaIU = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaIJ = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaJI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        
        # Regularization coefficient
        self.lamb = lamb
        # Which terms to include
        self.UI = UI
        self.IJ = IJ

    # Prediction for a single instance
    def predict(self, u, i, j):
        p = self.betaI[i] + self.UI * tf.tensordot(self.gammaUI[u], self.gammaIU[i], 1) +\
                            self.IJ * tf.tensordot(self.gammaIJ[i], self.gammaJI[j], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaUI) +\
                            tf.nn.l2_loss(self.gammaIU) +\
                            tf.nn.l2_loss(self.gammaIJ) +\
                            tf.nn.l2_loss(self.gammaJI))

    def call(self, sampleU, # user
                   sampleI, # item
                   sampleJ, # previous item
                   sampleK): # negative item
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        j = tf.convert_to_tensor(sampleJ, dtype=tf.int32)
        k = tf.convert_to_tensor(sampleK, dtype=tf.int32)
        
        gamma_ui = tf.nn.embedding_lookup(self.gammaUI, u)
        gamma_iu = tf.nn.embedding_lookup(self.gammaIU, i)
        gamma_ij = tf.nn.embedding_lookup(self.gammaIJ, i)
        gamma_ji = tf.nn.embedding_lookup(self.gammaJI, j)
        
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        x_uij = beta_i + self.UI * tf.reduce_sum(tf.multiply(gamma_ui, gamma_iu), 1) +\
                         self.IJ * tf.reduce_sum(tf.multiply(gamma_ij, gamma_ji), 1)
        gamma_uk = tf.nn.embedding_lookup(self.gammaUI, u)
        gamma_ku = tf.nn.embedding_lookup(self.gammaIU, k)
        gamma_kj = tf.nn.embedding_lookup(self.gammaIJ, k)
        gamma_jk = tf.nn.embedding_lookup(self.gammaJI, j)
        beta_k = tf.nn.embedding_lookup(self.betaI, k)
        x_ukj = beta_k + self.UI * tf.reduce_sum(tf.multiply(gamma_uk, gamma_ku), 1) +\
                         self.IJ * tf.reduce_sum(tf.multiply(gamma_kj, gamma_jk), 1)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_uij - x_ukj)))

In [249]:
modelFPMC = FPMC(5, 0.00001)

In [250]:
def trainingStep(model, interactionstrain):
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ, sampleK = [], [], [], []
        for _ in range(100000):
            _,u,i,j,_ = random.choice(interactionstrain) # positive sample
            k = random.choice(items) # negative sample
            while k in itemsPerUser[u]:
                k = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])
            sampleK.append(itemIDs[k])
        

        loss = model(sampleU,sampleI,sampleJ,sampleK)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [298]:
for i in range(20):
    obj = trainingStep(modelFPMC, interactionstrain)
    if (i % 10 == 9): 
        print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.6111991
iteration 20, objective = 0.61060315


In [303]:
interactionsTestPerUser = defaultdict(list)
itemSet = set()

for _,u,i,j,_ in interactionstest:
#for _,u,i,j,_ in interactionstrain:
    interactionsTestPerUser[u].append((i,j))
    itemSet.add(i)
    itemSet.add(j)
    
# def AUCu(model, u, N):
#     win = 0

#     positive = [interactionsTestPerUser[u][0]] * N
#     negative = random.sample(itemSet,N)
#     for (i,j),k in zip(positive,negative):
#         sp = model.predict(userIDs[u], itemIDs[i], itemIDs[j]).numpy()
#         sn = model.predict(userIDs[u], itemIDs[k], itemIDs[j]).numpy()
#         if sp > sn:
#             win += 1
#     return win/N

def AUCu(model, u, N):
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u],N)
    negative = random.sample(itemSet,N)
    for (i,j),k in zip(positive,negative):
        sp = model.predict(userIDs[u], itemIDs[i], itemIDs[j]).numpy()
        sn = model.predict(userIDs[u], itemIDs[k], itemIDs[j]).numpy()
        #print(sp)
        #print(sn)
        if sp > sn:
            win += 1
    return win/N

def AUC(model):
    av = []
    cnt = 0
    for u in interactionsTestPerUser:
        if cnt > 1000:
            break
        cnt += 1
        av.append(AUCu(model, u, 1))
    return sum(av) / len(av)

In [304]:
AUC(modelFPMC)

0.3676323676323676

In [171]:
# Popular Baseline
class PopularBaseline():
    def __init__(self):
        placesCount = defaultdict(int)
        for i, r in data_reviews.iterrows():
            placesCount[r['gPlusPlaceId']] += 1
        popularPlaces = sorted([(placesCount[p], p) for p in placesCount], reverse=False)
        self.popScore = [0] * len(itemIDs)
        for i, (c, p) in enumerate(popularPlaces):
            self.popScore[itemIDs[p]] = i
    
    def predict(self, u, i, j):
        return numpy.array(self.popScore[i])

In [172]:
model = PopularBaseline()

In [187]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for _,u,i,j,_ in interactionstest:
    interactionsTestPerUser[u].add((i,j))
    itemSet.add(i)
    itemSet.add(j)
    
def AUCu(model, u, N):
    win = 0
    positive = [random.sample(interactionsTestPerUser[u],1)[0]] * N
    negative = random.sample(itemSet,N)
    for (i,j),k in zip(positive,negative):
        sp = model.predict(userIDs[u], itemIDs[i], itemIDs[j])
        sn = model.predict(userIDs[u], itemIDs[k], itemIDs[j])
        if sp > sn:
            win += 1
    return win/N

def AUC(model):
    av = []
    cnt = 0
    for u in interactionsTestPerUser:
        if cnt % 5000 == 0:
            print(cnt)
        cnt += 1
        av.append(AUCu(model, u, 10))
    return sum(av) / len(av)
AUC(model)

0
5000
10000
15000


0.514024999999996

In [186]:
[random.sample(interactionsTestPerUser[u],1)] * 10

[[('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')],
 [('105013582067549662050', '106632236204204927665')]]

In [257]:
x = set()

In [258]:
x.add(1)

In [267]:
len(interactionstest)

20000