In [5]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import pandas as pd

import tensorflow as tf
from tensorflow.keras import Model

In [6]:
data_dir = "C:/Users/samsung/UCSD/Fall22/CSE258/archive"
df_interactions = pd.read_csv(f"{data_dir}/RAW_interactions.csv")
print(df_interactions.shape)
print(len(df_interactions))
print(df_interactions.columns)

(1132367, 5)
1132367
Index(['user_id', 'recipe_id', 'date', 'rating', 'review'], dtype='object')


In [7]:
userIDs, itemIDs = {}, {}
interactions = []

for i in range(len(df_interactions)):
    d = df_interactions.iloc[i]
    u,i = d['user_id'], d['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,d['rating']))

nUsers, nItems = len(userIDs), len(itemIDs)
print(nUsers)
print(nItems)

226570
231637


In [8]:
from sklearn.model_selection import train_test_split
y = [r for _,_,r in interactions]
trainData, X_test, y_train, y_test = train_test_split(interactions, y, test_size=0.20, stratify=None, random_state=6752)
validData = X_test[:int(len(X_test)*0.1)]
testData = X_test[int(len(X_test)*0.1):]

In [20]:
mu = sum([r for _,_,r in interactions]) / len(interactions)
mu 

4.411016039852804

In [62]:
# user-item
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb_1, lamb_2):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu, dtype=tf.float32)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb_1 = lamb_1
        self.lamb_2 = lamb_2

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb_1 * tf.reduce_sum(self.betaU**2) +\
               self.lamb_1 * tf.reduce_sum(self.betaI**2) +\
               self.lamb_2 * tf.reduce_sum(self.gammaU**2) +\
               self.lamb_2 * tf.reduce_sum(self.gammaI**2)
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [63]:
lamb_b = 0.000001
lamb_g = 0.000001
best_validMSE = 10
K = 10
lr = 0.01
batch = 1024
epoch = 10000

model = LatentFactorModel(mu, 5, lamb_b, lamb_g)
optimizer = tf.keras.optimizers.Adam(lr)

In [19]:
def trainingStep(interactions):
    Nsamples = batch
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)
            
        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [20]:
print(f"===== K: {K} lamb_b: {lamb_b} lamb_g: {lamb_g}=====")
model = LatentFactorModel(mu, K, lamb_b, lamb_g) 
model.load_weights(f'./as1/batch1024_lr0.01_weights_K10_lambb1e-06_lambg1e-06/best_epoch260_model') #################3
weight_dir = f'./as1/batch{batch}_lr{lr}_weights_K{K}_lambb{lamb_b}_lambg{lamb_g}'
validMSE = 0
best_validMSE = 10
valid_error = []
for i in range(261, 260+epoch):
    obj = trainingStep(trainData)
    print("iteration " + str(i) + ", objective = " + str(obj))
    if i % 10 == 0 and i > 0: 
        predictions = []
        for u,b,r in validData:
            pred = model.predict(userIDs[u], itemIDs[b]).numpy()
            predictions.append([pred, r])
        validMSE = sum([(x-y)**2 for x,y in predictions]) / len(predictions)
        valid_error.append(validMSE)
        if validMSE < best_validMSE:
            model.save_weights(f'{weight_dir}/best_epoch{i}_model', save_format='tf')
        print("iteration " + str(i) + ", validMSE = " + str(validMSE))

===== K: 10 lamb_b: 1e-06 lamb_g: 1e-06=====
iteration 261, objective = 0.8337669
iteration 262, objective = 0.82722497
iteration 263, objective = 0.7763771
iteration 264, objective = 0.71063906
iteration 265, objective = 0.7612779
iteration 266, objective = 0.91031533
iteration 267, objective = 0.7229339
iteration 268, objective = 0.7174513
iteration 269, objective = 0.8675228
iteration 270, objective = 0.7732925
iteration 270, validMSE = 1.5691668741777316
iteration 271, objective = 0.65226334
iteration 272, objective = 0.7569027
iteration 273, objective = 0.82788914
iteration 274, objective = 0.72874796
iteration 275, objective = 0.96551806
iteration 276, objective = 0.8921397
iteration 277, objective = 0.81425583
iteration 278, objective = 0.69947624
iteration 279, objective = 0.81613743
iteration 280, objective = 0.78229004
iteration 280, validMSE = 1.5504047444219098
iteration 281, objective = 0.7939201
iteration 282, objective = 0.7015826
iteration 283, objective = 0.6362364
ite

KeyboardInterrupt: 

In [47]:
testData = X_test[int(len(X_test)*0.1):][-2000:]
len(testData)

2000

In [77]:
# test
# lamb 0.01(epoch250) -> 1.5911313
# lambb 0.01 / lambg 0.001 -> 1.5906503355443455
loaded_model = LatentFactorModel(mu, 10, lamb_b, lamb_g)
loaded_model.load_weights(f'./as1/batch1024_lr0.01_weights_K10_lambb1e-06_lambg1e-06/best_epoch250_model')
predictions = []
for u,b,r,_,_,_ in testData:
    pred = loaded_model.predict(userIDs[u], itemIDs[b]).numpy()
    predictions.append([pred, r])
testMSE = sum([(x-y)**2 for x,y in predictions]) / len(predictions)
print(testMSE)

print(len(predictions))
# open the file in the write mode
with open('./lfm_result.csv', 'w') as f:
    # create the csv writer
    for p,t in predictions:
        f.writelines(f"{p},{t}\n")

1.468220046792608
2000


Latent Factor Model with Side Features

1. Generate Train Data with Side Features

In [9]:
data_dir = "C:/Users/samsung/UCSD/Fall22/CSE258/archive"
df_recipes = pd.read_csv(f"{data_dir}/RAW_recipes.csv")
df_interactions = pd.read_csv(f"{data_dir}/RAW_interactions.csv")

In [10]:
data = pd.merge(df_recipes, df_interactions, right_on='recipe_id',left_on='id')
data[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']] = data.nutrition.str.split(",",expand=True) 
data['calories'] = data['calories'].apply(lambda x: x.replace("[" ,""))
data['carbohydrates'] = data['carbohydrates'].apply(lambda x: x.replace("]" ,""))
data[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']] =  data[['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']].astype(float)

In [11]:
userIDs, itemIDs = {}, {}
interactions_v2 = []

for i in range(len(data)):
    d = data.iloc[i]
    u,i = d['user_id'], d['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    calories = d['calories']
    minutes = d['minutes']
    steps = d['n_steps']
    interactions_v2.append((u,i,d['rating'],calories,minutes,steps))

nUsers, nItems = len(userIDs), len(itemIDs)
print(nUsers)
print(nItems)

226570
231637


In [12]:
from sklearn.model_selection import train_test_split
trainData, X_test = train_test_split(interactions_v2, test_size=0.20, stratify=None, random_state=6752)
validData = X_test[:int(len(X_test)*0.1)]
testData = X_test[int(len(X_test)*0.1):]

Latent Factor Model with side features

Calories

In [13]:
import numpy as np
calories = set()
for i in range(len(data)):
    d = data.iloc[i]
    calories.add(d['calories'])
q1 = np.percentile(list(calories),25)
q2 = np.percentile(list(calories),50)
q3 = np.percentile(list(calories),75)
q4 = np.percentile(list(calories),100)

def convert_cal(x):
    if x <= q1:
        return 0
    elif x > q1 and x <= q2:
        return 1
    elif x > q2 and x <= q3:
        return 2
    elif x > q3 and x <= q4:
        return 3



In [91]:
# user-item-calories
class LatentFactorModel_calories(tf.keras.Model):
    def __init__(self, mu, K, lamb_1, lamb_2):
        super(LatentFactorModel_calories, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu, dtype=tf.float32)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.betaCal = tf.Variable(tf.random.normal([4],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.gammaCal = tf.Variable(tf.random.normal([4,K],stddev=0.001))
        self.lamb_1 = lamb_1
        self.lamb_2 = lamb_2

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i, cal):
        cal = int(cal)
        cal_class = convert_cal(cal)
        p = self.alpha + self.betaU[u] + self.betaI[i] + cal*self.betaCal[cal_class] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1) +\
            tf.tensordot(self.gammaI[u], self.gammaCal[cal_class], 1)  
        return p

    # Regularizer
    def reg(self):
            return self.lamb_1 * tf.reduce_sum(self.betaU**2) +\
               self.lamb_1 * tf.reduce_sum(self.betaI**2) +\
               self.lamb_1 * tf.reduce_sum(self.betaCal**2) +\
               self.lamb_2 * tf.reduce_sum(self.gammaU**2) +\
               self.lamb_2 * tf.reduce_sum(self.gammaI**2) +\
               self.lamb_2 * tf.reduce_sum(self.gammaCal**2)   
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI, sampleCal):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        cal = tf.convert_to_tensor(sampleCal, dtype=tf.float32)
        cal_class = [convert_cal(c) for c in sampleCal]
        cal_calss = tf.convert_to_tensor(cal_class, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        beta_cal = tf.nn.embedding_lookup(self.betaCal, cal_class)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        gamma_cal = tf.nn.embedding_lookup(self.gammaCal, cal_class)
        pred = self.alpha + beta_u + beta_i + cal*beta_cal +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1) +\
               tf.reduce_sum(cal*tf.multiply(gamma_i, gamma_cal), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR, sampleCal):
        pred = self.predictSample(sampleU, sampleI, sampleCal)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [88]:
def trainingStep(interactions):
    Nsamples = batch
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR, sampleCal = [], [], [], []
        for _ in range(Nsamples):
            u,i,r,cal,_,_, = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)
            sampleCal.append(cal)
            
        loss = model(sampleU,sampleI,sampleR,sampleCal)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [92]:
lamb_b = 0.000001
lamb_g = 0.000001
best_validMSE = 10
K = 10
lr = 0.01
batch = 1024
epoch = 10000

model = LatentFactorModel_calories(mu, K, lamb_b, lamb_g)
optimizer = tf.keras.optimizers.Adam(lr)

In [93]:
print(f"===== K: {K} lamb_b: {lamb_b} lamb_g: {lamb_g}=====")
model = LatentFactorModel_calories(mu, K, lamb_b, lamb_g)
weight_dir = f'./as4/batch{batch}_lr{lr}_weights_K{K}_lambb{lamb_b}_lambg{lamb_g}'
validMSE = 0
best_validMSE = 10
valid_error = []
for i in range(epoch):
    obj = trainingStep(trainData)
    print("iteration " + str(i) + ", objective = " + str(obj))
    if i % 10 == 0 and i > 0: 
        predictions = []
        for u,b,r,cal,_,_ in validData:
            pred = model.predict(userIDs[u], itemIDs[b], cal).numpy()
            predictions.append([pred, r])
        validMSE = sum([(x-y)**2 for x,y in predictions]) / len(predictions)
        valid_error.append(validMSE)
        if validMSE < best_validMSE:
            model.save_weights(f'{weight_dir}/best_epoch{i}_model', save_format='tf')
        print("iteration " + str(i) + ", validMSE = " + str(validMSE))

===== K: 10 lamb_b: 1e-06 lamb_g: 1e-06=====


InvalidArgumentError: Incompatible shapes: [1024] vs. [1024,10] [Op:Mul]

In [85]:
# test
# lamb 0.01(epoch250) -> 1.5911313
# lambb 0.01 / lambg 0.001 -> 1.5906503355443455
loaded_model = LatentFactorModel_calories(mu, 10, lamb_b, lamb_g)
loaded_model.load_weights(f'./as3/batch1024_lr0.01_weights_K10_lambb1e-06_lambg1e-06/best_epoch250_model')
predictions = []
for u,b,r,cal,_,_ in testData:
    pred = loaded_model.predict(userIDs[u], itemIDs[b], convert_cal(cal)).numpy()
    predictions.append([pred, r])
testMSE = sum([(x-y)**2 for x,y in predictions]) / len(predictions)
print(testMSE)
print(len(predictions))

# open the file in the write mode
with open('./lfm_calories_result.csv', 'w') as f:
    # create the csv writer
    for p,t in predictions:
        f.writelines(f"{p},{t}\n")

1.5566137961189648
2000


In [82]:
# test
# lamb 0.01(epoch250) -> 1.5911313
# lambb 0.01 / lambg 0.001 -> 1.5906503355443455
loaded_model = LatentFactorModel_calories(mu, 10, lamb_b, lamb_g)
loaded_model.load_weights(f'./as2/batch1024_lr0.01_weights_K10_lambb1e-06_lambg1e-06/best_epoch250_model')
predictions = []
for u,b,r,cal,_,_ in testData:
    pred = loaded_model.predict(userIDs[u], itemIDs[b], convert_cal(cal)).numpy()
    predictions.append([pred, r])
testMSE = sum([(x-y)**2 for x,y in predictions]) / len(predictions)
print(testMSE)
print(len(predictions))

1.6184779944859702
2000


In [None]:
from matplotlib import pyplot as plt
fig, axes = plt.subplots(ncols=1, figsize=(8, 8))

x = np.arange(1, n_iter) * step_size
with plt.style.context('fivethirtyeight'):
    axes.plot(x, rmse_train, label='MSE-train', color='b')
    axes.plot(x, rmse_test, label='MSE-test', color='r')
axes.set_ylabel('MSE', color='r')
axes.legend()