# Latent Factor Model (TensorFlow) - Assignment 2 CSE 158

In [87]:
import json
import numpy as np
import tensorflow as tf
import random
from collections import defaultdict
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## Data Manipulation and Train/Test Split

In [62]:
# The file stores one JSON object per line. Read it as UTF-8 explicitly
# (instead of relying on the platform default encoding) and skip blank
# lines, which json.loads would otherwise reject.
with open('renttherunway_final_data.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

In [63]:
# Display the first parsed record to see which fields a review carries.
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [64]:
# (eventually clean data uniformly using repo script)

# Assign each distinct user/item string ID a consecutive integer index and
# collect one (user_id, item_id, rating) triple per rated record.
userIDs = {}
itemIDs = {}
interactions = []

for record in data:
    user, item, rating = record['user_id'], record['item_id'], record['rating']

    # records without a rating are dropped before any index is assigned
    if rating is None:
        continue
    score = int(rating)
    userIDs.setdefault(user, len(userIDs))
    itemIDs.setdefault(item, len(itemIDs))
    interactions.append((user, item, score))

In [65]:
# Seed Python's RNG before shuffling so the train/test split below (which
# slices this shuffled list) is the same on every Restart & Run All.
random.seed(0)
random.shuffle(interactions)
len(interactions)

192462

In [66]:
# First 80% of the shuffled interactions train the model; the rest is held out.
nTrain = int(0.8 * len(interactions))
interactionsTrain, interactionsTest = interactions[:nTrain], interactions[nTrain:]
nTest = len(interactionsTest)

In [67]:
# Index the training interactions in both directions:
# user -> items they rated, and item -> users who rated it.
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for user, item, _rating in interactionsTrain:
    itemsPerUser[user].append(item)
    usersPerItem[item].append(user)

## Model Training

In [68]:
# The global offset starts at the average rating of the training set.
trainRatings = [rating for _user, _item, rating in interactionsTrain]
mu = sum(trainRatings) / len(trainRatings)

# Adam optimizer; the learning rate (0.1) is a tunable hyperparameter.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

In [69]:
class LatentFactorModel(tf.keras.Model):
    """Rating model: global offset + user bias + item bias + factor dot product.

    The sizes of the bias and factor variables are taken from the
    module-level ``userIDs`` and ``itemIDs`` mappings.
    """

    def __init__(self, mu, K, lamb):
        """Create the trainable variables.

        mu:   initial value of the global offset ``alpha``
        K:    number of latent factors per user and per item
        lamb: multiplier applied to the squared-norm penalty in ``reg``
        """
        super().__init__()
        nUsers, nItems = len(userIDs), len(itemIDs)

        def smallNormal(shape):
            # random starting values with standard deviation 0.001
            return tf.Variable(tf.random.normal(shape, stddev=0.001))

        self.alpha = tf.Variable(mu)
        self.betaU = smallNormal([nUsers])
        self.betaI = smallNormal([nItems])
        self.gammaU = smallNormal([nUsers, K])
        self.gammaI = smallNormal([nItems, K])
        self.lamb = lamb

    def predict(self, u, i):
        """Prediction for a single (user index, item index) pair."""
        biases = self.alpha + self.betaU[u] + self.betaI[i]
        interaction = tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return biases + interaction

    def reg(self):
        """Regularizer: ``lamb`` times the summed squares of biases and factors.

        ``alpha`` is not part of the penalty.
        """
        penalty = tf.reduce_sum(self.betaU**2)
        penalty = penalty + tf.reduce_sum(self.betaI**2)
        penalty = penalty + tf.reduce_sum(self.gammaU**2)
        penalty = penalty + tf.reduce_sum(self.gammaI**2)
        return self.lamb * penalty

    def predictSample(self, sampleU, sampleI):
        """Predictions for aligned sequences of user and item indices."""
        users = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        items = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        lookup = tf.nn.embedding_lookup
        userBias = lookup(self.betaU, users)
        itemBias = lookup(self.betaI, items)
        userFactors = lookup(self.gammaU, users)
        itemFactors = lookup(self.gammaI, items)
        # row-wise dot product of the user and item factor vectors
        interaction = tf.reduce_sum(userFactors * itemFactors, axis=1)
        return self.alpha + userBias + itemBias + interaction

    def call(self, sampleU, sampleI, sampleR):
        """Loss for a sample: ``tf.nn.l2_loss`` of the residuals over the sample size."""
        predicted = self.predictSample(sampleU, sampleI)
        target = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(predicted - target) / len(sampleR)

In [70]:
# Build the model with 5 latent factors and a regularization weight of 1e-5
# (both are tunable).
modelLFM = LatentFactorModel(mu, K=5, lamb=1e-5)

In [71]:
def trainingStep(model, interactions, Nsamples=50000):
    """Run one optimizer update on a random sample of interactions.

    model:        called as model(users, items, ratings) to get the loss;
                  must also provide reg() and trainable_variables
    interactions: sequence of (user_id, item_id, rating) triples
    Nsamples:     how many interactions to draw, with replacement, for this
                  step (default 50000, the previously hard-coded value)

    Returns the regularized objective on the sample (``loss.numpy()``).
    Raises ValueError if Nsamples is smaller than 1.
    Relies on the module-level ``optimizer``, ``userIDs`` and ``itemIDs``.
    """
    if Nsamples < 1:
        raise ValueError("Nsamples must be at least 1")

    # Drawing the sample is plain Python, so it is done before the gradient
    # tape is opened; only the loss computation needs to be recorded.
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)

    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleR) + model.reg()

    gradients = tape.gradient(loss, model.trainable_variables)
    # variables that received no gradient are left out of the update
    optimizer.apply_gradients(
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None
    )
    return loss.numpy()

In [72]:
# 100 optimizer steps (the per-step sample is drawn inside trainingStep);
# the objective is printed after every 10th step.
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i + 1) % 10 == 0:
        print(f"iteration {i + 1}, objective = {obj!s}")

iteration 10, objective = 0.80615824
iteration 20, objective = 0.62087655
iteration 30, objective = 0.582744
iteration 40, objective = 0.56782377
iteration 50, objective = 0.541464
iteration 60, objective = 0.5288838
iteration 70, objective = 0.52186334
iteration 80, objective = 0.5144605
iteration 90, objective = 0.5113201
iteration 100, objective = 0.51073587


## Results

In [86]:
# Score the whole held-out set with a single batched predictSample call
# rather than one eager predict() call per test row.
testU = [userIDs[u] for u, _, _ in interactionsTest]
testI = [itemIDs[i] for _, i, _ in interactionsTest]
truth = [r for _, _, r in interactionsTest]
# list(...) keeps preds a Python list of float32 scalars
preds = list(modelLFM.predictSample(testU, testI).numpy())

In [93]:
# RMSE is taken as the square root of the MSE: the `squared=False` argument
# of mean_squared_error is deprecated since scikit-learn 1.4 and was removed
# afterwards, so it is no longer passed.
mse = mean_squared_error(truth, preds)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")
print(f"MSE: {mse}")

RMSE: 1.4457771244353923
MSE: 2.090271493540672


Generally, an RMSE of about 1.45 implies that the model's predictions are typically off by roughly 1.45 rating points from the actual ratings (RMSE weights large errors more heavily than a plain average of absolute errors would). An MSE of about 2.09 implies that, on average, the squared error of the predictions from the actual values is 2.09 rating points squared.