# CSE 258 - Assignment 2

In [11]:
import gzip
import json
import time
from collections import defaultdict
import plotly.graph_objects as go
import tensorflow as tf
from IPython.display import Image

In [2]:
# Directory holding the raw dumps and every derived file; all paths below are built from it.
dataFolder = 'data'

## Reduce dataset

In [20]:
# Bounding box applied to a place's 'gps' pair by filterPlaces: element 0 must
# lie strictly between us0Min and us0Max, element 1 strictly between us1Min
# and us1Max.
us0Min = 24.18
us0Max = 49.03
us1Min = -125.57
us1Max = -58.65

# Derived files produced by the filtering steps.
usPlacesPath = f'{dataFolder}/us_places.json'
usReviewsPath = f'{dataFolder}/us_reviews.json'
finalReviewsPath = f'{dataFolder}/final_reviews.json'
usUsersPath = f'{dataFolder}/us_users.json'

# Raw gzipped inputs.
reviewsPath = f'{dataFolder}/reviews.clean.json.gz'
usersPath = f'{dataFolder}/users.clean.json.gz'

In [None]:
def readData(path):
    """Load the leading records of a gzipped file with one Python literal per line.

    Args:
        path: Path to a gzip-compressed file. Every line is evaluated as a
            Python expression.

    Returns:
        list: The evaluated records in file order. Reading stops as soon as
        the list holds more than 100000 entries, so at most 100001 records
        are returned.
    """
    data = []
    # The context manager closes the gzip handle on early exit and on errors.
    with gzip.open(path) as f:
        for line in f:
            # NOTE(review): eval() executes whatever the line contains. Only
            # run this on trusted dumps; consider ast.literal_eval if the
            # records are plain literals.
            data.append(eval(line))
            if len(data) > 100000:
                break
    return data

In [26]:
def filterPlaces(path):
    """Copy the places inside the configured bounding box to ``usPlacesPath``.

    Each line of the gzipped input is evaluated as a Python expression. A
    record is kept when its 'gps' value is truthy and both coordinates fall
    strictly inside the module-level bounds (us0Min/us0Max for element 0,
    us1Min/us1Max for element 1). Kept records are written as one JSON
    object per line.

    Args:
        path: Path to the gzip-compressed places file.

    Returns:
        set: The 'gPlusPlaceId' of every record that was written.
    """
    placeIds = set()
    # Both handles are released even if a line fails to parse or a key is missing.
    with open(usPlacesPath, 'w') as out, gzip.open(path) as src:
        for line in src:
            # NOTE(review): eval() executes whatever the line contains. Only
            # run this on trusted dumps.
            d = eval(line)
            gps = d['gps']
            if gps and us0Min < gps[0] < us0Max and us1Min < gps[1] < us1Max:
                out.write(json.dumps(d) + '\n')
                placeIds.add(d['gPlusPlaceId'])
    return placeIds

def filterReviews(path, placeIds):
    """Copy the reviews whose place is in ``placeIds`` to ``usReviewsPath``.

    Args:
        path: Path to the gzip-compressed reviews file. Each line is
            evaluated as a Python expression.
        placeIds: Collection of 'gPlusPlaceId' values to keep. Membership is
            tested once per input line, so pass a set.

    Returns:
        None. Matching records are written as one JSON object per line.
    """
    # Both handles are released even if a line fails to parse or a key is missing.
    with open(usReviewsPath, 'w') as out, gzip.open(path) as src:
        for line in src:
            # NOTE(review): eval() executes whatever the line contains. Only
            # run this on trusted dumps.
            d = eval(line)
            if d['gPlusPlaceId'] in placeIds:
                out.write(json.dumps(d) + '\n')

def keepUsers(path, k=5):
    """Split users by how many distinct places they reviewed.

    Args:
        path: Path to a text file with one JSON review per line. Each review
            must carry 'gPlusUserId' and 'gPlusPlaceId'.
        k: Threshold on the number of distinct places. Users with strictly
            more than ``k`` distinct places are kept. Defaults to 5.

    Returns:
        tuple: ``(users, reviews)``. ``users`` is the list of kept user ids
        in order of first appearance. ``reviews`` is the list of
        ``(userId, placeId)`` pairs belonging to every other user.
    """
    itemsPerUser = defaultdict(set)
    # Close the input as soon as the per-user sets are built.
    with open(path) as f:
        for line in f:
            d = json.loads(line)
            itemsPerUser[d['gPlusUserId']].add(d['gPlusPlaceId'])

    users = []
    reviews = []
    for u, items in itemsPerUser.items():
        if len(items) > k:
            users.append(u)
        else:
            reviews.extend((u, item) for item in items)
    return users, reviews

def writeUsers(path, users):
    """Copy the user records whose id is in ``users`` to ``usUsersPath``.

    Args:
        path: Path to the gzip-compressed users file. Each line is evaluated
            as a Python expression.
        users: Collection of 'gPlusUserId' values to keep. Membership is
            tested once per input line, so pass a set.

    Returns:
        None. Matching records are written as one JSON object per line.
    """
    # Both handles are released even if a line fails to parse or a key is missing.
    with open(usUsersPath, 'w') as out, gzip.open(path) as src:
        for line in src:
            # NOTE(review): eval() executes whatever the line contains. Only
            # run this on trusted dumps.
            d = eval(line)
            if d['gPlusUserId'] in users:
                out.write(json.dumps(d) + '\n')

def removeReviews(path, reviews):
    """Copy every review NOT listed in ``reviews`` to ``finalReviewsPath``.

    Args:
        path: Path to a text file with one JSON review per line.
        reviews: Collection of ``(gPlusUserId, gPlusPlaceId)`` pairs to drop.
            Membership is tested once per input line, so pass a set.

    Returns:
        None. Surviving records are written as one JSON object per line.
    """
    # Both handles are released even if a line fails to parse or a key is missing.
    with open(finalReviewsPath, 'w') as out, open(path) as src:
        for line in src:
            d = json.loads(line)
            if (d['gPlusUserId'], d['gPlusPlaceId']) not in reviews:
                out.write(json.dumps(d) + '\n')

In [5]:
# Filter the raw places dump down to the bounding box and time the pass.
t1 = time.time()
placeIds = filterPlaces(dataFolder + '/places.clean.json.gz')
t2 = time.time()
print('Time taken in secs', t2 - t1)
# Number of distinct place ids kept (displayed as the cell result).
len(placeIds)

Time taken in secs 282.09713673591614


1308271

In [6]:
# Keep only the reviews of the retained places and time the pass.
# reviewsPath already names the raw reviews dump, so reuse it instead of
# rebuilding the same string here.
t1 = time.time()
filterReviews(reviewsPath, placeIds)
t2 = time.time()
print('Time taken in secs', t2 - t1)

Time taken in secs 596.6306154727936


In [25]:
# Split users by distinct-place count: kpUsers are the user ids to keep,
# rmReviews the (user, place) pairs of everyone else.
t1 = time.time()
kpUsers, rmReviews = keepUsers(usReviewsPath)
t2 = time.time()
print('Time taken in secs', t2 - t1)

Time taken in secs 443.93540477752686


In [27]:
# Write the kept users; the list is turned into a set because writeUsers
# tests membership once per input line.
t1 = time.time()
writeUsers(usersPath, set(kpUsers))
t2 = time.time()
print('Time taken in secs', t2 - t1)

Time taken in secs 319.38766956329346


In [28]:
# Number of (user, place) pairs scheduled for removal.
len(rmReviews)

3416657

In [29]:
# Drop the reviews of the discarded users; the list is turned into a set
# because removeReviews tests membership once per input line.
t1 = time.time()
removeReviews(usReviewsPath, set(rmReviews))
t2 = time.time()
print('Time taken in secs', t2 - t1)

Time taken in secs 395.1437101364136


## Exploratory data analysis

In [None]:
# Gather the two 'gps' components and the name of every place that has a
# non-empty 'gps' entry, in three parallel lists.
lat, long, text = [], [], []
for place in places:
    gps = place['gps']
    if not gps:
        continue
    lat.append(gps[0])
    long.append(gps[1])
    text.append(place['name'])

In [None]:
# One marker per collected place, with the place name as hover text.
trace = go.Scattergeo(lon=long, lat=lat, text=text, mode='markers')
fig = go.Figure(data=trace)
# Render to PNG bytes and display them inline.
imgBytes = fig.to_image(format="png")
Image(imgBytes)

## Model

In [None]:
class FPMC(tf.keras.Model):
    """FPMC-style scorer combining a user-item term and an item-previous-item term.

    The score of item ``i`` for user ``u`` whose previous item was ``j`` is

        betaI[i] + UI * <gammaUI[u], gammaIU[i]> + IJ * <gammaIJ[i], gammaJI[j]>

    Table sizes come from the module-level ``itemIDs`` and ``userIDs``
    collections, which must be defined before an instance is created.

    Args:
        K: Width of every latent-factor table.
        lamb: Regularization coefficient used by ``reg``.
        UI: Multiplier on the user-item term (0 disables it).
        IJ: Multiplier on the item-previous-item term (0 disables it).
    """

    def __init__(self, K, lamb, UI = 1, IJ = 1):
        super().__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaUI = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaIU = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaIJ = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaJI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb
        # Which terms to include
        self.UI = UI
        self.IJ = IJ

    # Prediction for a single instance
    def predict(self, u, i, j):
        """Return the score of item ``i`` for user ``u`` given previous item ``j``.

        NOTE(review): this name shadows ``tf.keras.Model.predict`` with a
        different signature; it is kept so existing callers keep working.
        """
        p = self.betaI[i] + self.UI * tf.tensordot(self.gammaUI[u], self.gammaIU[i], 1) +\
                            self.IJ * tf.tensordot(self.gammaIJ[i], self.gammaJI[j], 1)
        return p

    def predictSample(self, sampleU, sampleI, sampleJ):
        """Score a batch of (user, item, previous item) index triples.

        Args:
            sampleU: User indices.
            sampleI: Item indices.
            sampleJ: Previous-item indices.

        Returns:
            A tensor with one score per triple.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        j = tf.convert_to_tensor(sampleJ, dtype=tf.int32)
        gamma_ui = tf.nn.embedding_lookup(self.gammaUI, u)
        gamma_iu = tf.nn.embedding_lookup(self.gammaIU, i)
        gamma_ij = tf.nn.embedding_lookup(self.gammaIJ, i)
        gamma_ji = tf.nn.embedding_lookup(self.gammaJI, j)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        return beta_i + self.UI * tf.reduce_sum(tf.multiply(gamma_ui, gamma_iu), 1) +\
                         self.IJ * tf.reduce_sum(tf.multiply(gamma_ij, gamma_ji), 1)

    # Regularizer
    def reg(self):
        """Return ``lamb`` times the summed L2 loss of all five parameter tables."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaUI) +\
                            tf.nn.l2_loss(self.gammaIU) +\
                            tf.nn.l2_loss(self.gammaIJ) +\
                            tf.nn.l2_loss(self.gammaJI))

    def call(self, sampleU, # user
                   sampleI, # item
                   sampleJ, # previous item
                   sampleK): # negative item
        """Return the mean pairwise ranking loss over a batch.

        For every sample, the observed item ``sampleI`` is scored against the
        negative item ``sampleK`` for the same user and previous item. The
        loss is ``-mean(log(sigmoid(score_pos - score_neg)))``. The
        regularizer is not included; add ``reg()`` separately.
        """
        # Both scores use the same formula, so reuse the batch scorer.
        x_uij = self.predictSample(sampleU, sampleI, sampleJ)
        x_ukj = self.predictSample(sampleU, sampleK, sampleJ)
        # log_sigmoid avoids log(0) = -inf when the margin is strongly
        # negative and sigmoid underflows to zero.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_uij - x_ukj))

## Evaluate