In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import keras
from keras import layers
from keras import ops

# Import utils from subfolder of project, works for immediate subfolders of PROJECT_ROOT
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..")) # adjust relative import as necessary
sys.path.append(PROJECT_ROOT)
from utils.data_processing import get_filtered_review_data, get_metadata

In [2]:
CATEGORY = 'Video_Games'

X_train, y_train, X_val, y_val, X_test, y_test = get_filtered_review_data(CATEGORY)
metadata = get_metadata(CATEGORY)

Loading preprocessed data from data/Video_Games_min5_test1_val1_cols['user_id', 'product_id', 'timestamp', 'title', 'text', 'helpful_vote'].pkl
Loading metadata from data/Video_Games_metadata.pkl


In [3]:
class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_products, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_products = num_products
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.product_embedding = layers.Embedding(
            num_products,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.product_bias = layers.Embedding(num_products, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        product_vector = self.product_embedding(inputs[:, 1])
        product_bias = self.product_bias(inputs[:, 1])

        dot_user_product = ops.tensordot(user_vector, product_vector, 2)
        a = dot_user_product + user_bias + product_bias
        
        return ops.nn.sigmoid(a)

In [4]:
num_users = len(pd.concat([X_train['user_id'], X_val['user_id'], X_test['user_id']]).unique())
num_products = len(pd.concat([X_train['product_id'], X_val['product_id'], X_test['product_id']]).unique())
EMBEDDING_SIZE = 1

model = RecommenderNet(num_users, num_products, EMBEDDING_SIZE)
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),)

In [None]:
X_train = X_train[["user_id", "product_id"]].values
y_train = y_train.values

X_val = X_val[["user_id", "product_id"]].values
y_val = y_val.values

X_test = X_test[["user_id", "product_id"]].values
y_test = y_test.values

In [6]:
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(X_val, y_val),
)

Epoch 1/5
[1m9767/9767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - loss: 0.6177 - val_loss: 0.4828
Epoch 2/5
[1m9767/9767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 0.4412 - val_loss: 0.4662
Epoch 3/5
[1m9767/9767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.4211 - val_loss: 0.4543
Epoch 4/5
[1m9767/9767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 0.4070 - val_loss: 0.4458
Epoch 5/5
[1m9767/9767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.3963 - val_loss: 0.4392


In [22]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test).flatten()
model_error = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {model_error}')

[1m2962/2962[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 860us/step
RMSE: 0.30784993426955987


## Naive Models

In [None]:
global_mean_rating = y_train.mean()

def global_model(X):
    # Predicts the mean score across all ratings, regardless of product or user
    return np.ones(len(X)) * global_mean_rating

y_pred = global_model(X_test)
model_error = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {model_error}')

RMSE: 0.32371858806334664


In [None]:
train_features = pd.DataFrame(X_train, columns=['user_id', 'product_id'])
train_features['rating'] = y_train

In [None]:
user_ratings = train_features.groupby(['user_id']).mean()['rating']

def user_model(X):
    # Predicts the mean score of previous user ratings, regardless of product
    users = X[:,0]
    return user_ratings[users]

y_pred = user_model(X_test)
model_error = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {model_error}')

RMSE: 0.3115740813686763


0.8202042357398146

In [51]:
product_ratings = train_features.groupby(['product_id']).mean()['rating']

# Handle cases where unknown products are shown
product_ratings[-1] = global_mean_rating
unique_products = train_features['product_id'].unique()

def product_model(X):
    # Predicts the mean score of previous product ratings, regardless of user
    products = np.where(np.isin(X[:,1],unique_products), X[:,1], -1)
    return product_ratings[products]

y_pred = product_model(X_test)
model_error = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {model_error}')

RMSE: 0.31694089998797526
