In [1]:
# import all relevant libraries
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from zipfile import ZipFile
import warnings
import os
import tensorflow as tf
import keras
from keras import layers
from keras import ops
from keras import activations
from keras.models import load_model, Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout, BatchNormalization, LeakyReLU
from keras.callbacks import EarlyStopping, LearningRateScheduler
from keras.regularizers import l2, l1_l2
from keras.optimizers import Adam, AdamW, RMSprop
from keras.optimizers.schedules import ExponentialDecay,CosineDecayRestarts
from tensorflow.keras import mixed_precision
from keras.initializers import GlorotUniform

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')
%matplotlib inline

2024-12-13 08:03:13.416305: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-13 08:03:13.577684: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734073393.639074     465 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734073393.657486     465 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-13 08:03:13.810023: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# checking that GPU is available for processing
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
# Load relevant databases
df_movies = pd.read_csv("../data/movies.csv")
df_genometags = pd.read_csv("../data/genome-tags.csv", index_col = 0)
df_genome_scores = pd.read_csv("../data/genome-scores.csv")
df = pd.read_csv("../data/ratings.csv") 

In [4]:
# preprocessing
# only include Movies with genome tags
movie_genomes = df_genome_scores.drop_duplicates(subset=["movieId"])
movie_genomes = movie_genomes["movieId"]
df = df[df["movieId"].isin(movie_genomes)]
df_genome_scores = df_genome_scores[df_genome_scores.tagId != 742] #removing the outlier "original"

# encode users and movies
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)
#get lenghts for implementing Keras layers
num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
num_genomes = len(df_genome_scores["tagId"].unique())
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

# Normalize genome scores 

df_genome_scores["relevance"] = df_genome_scores["relevance"] / df_genome_scores.groupby("movieId")["relevance"].transform('sum')
df_genome_scores = df_genome_scores[["movieId", "tagId",  "relevance"]]
df_genome_scores.sort_values("movieId")
# Create a matrix of genome tag relevance scores for each movie
# This will give us a matrix of size (num_movies, num_genomes), where each entry is the relevance of a tag for a movie

# Initialize an empty matrix with zeros (num_movies x num_genomes)
movie_genome_matrix = np.zeros((df_genome_scores["movieId"].nunique(), num_genomes))
# Map movieId to row indices
movie_to_row = {movie_id: idx for idx, movie_id in enumerate(df_genome_scores["movieId"].unique())}
# Map tagId to column indices
tag_to_col = {tag_id: idx for idx, tag_id in enumerate(df_genome_scores["tagId"].unique())}
# Populate the matrix with relevance scores
for _, row in df_genome_scores.iterrows():
    movie_idx = movie_to_row[row["movieId"]]
    tag_idx = tag_to_col[row["tagId"]]
    movie_genome_matrix[movie_idx, tag_idx] = row["relevance"]
# Convert the matrix to a DataFrame (optional, for easier inspection)
movie_genome_df = pd.DataFrame(movie_genome_matrix, columns=df_genome_scores["tagId"].unique())
# Now movie_genome_df contains the relevance scores for each movie, with a column for each tag.
# This matrix is of shape (num_movies, 1128) where each row is a movie and each column is a genome tag relevance.

print(
    "Number of users: {}, Number of Movies: {}, Number of Genome Tags: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, num_genomes, min_rating, max_rating
    )
)



Number of users: 138493, Number of Movies: 10370, Number of Genome Tags: 1127, Min rating: 0.5, Max rating: 5.0


In [5]:
# Calculate frequency of each rating
ratings = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating))  
rating_counts = ratings.value_counts()
rating_freq = {rating: count / len(df) for rating, count in rating_counts.items()}

# Create a weight map (inverse of the frequency)
rating_weights = {rating: 1 / freq for rating, freq in rating_freq.items()}

In [6]:
#Try using batches, due to the extremely large 20M Dataset
#Not using batches results in MemoryErrors
# Define the batch size
BATCH_SIZE = 512 

# Prepare the feature matrix x and target vector y
x_genomes = np.asarray(movie_genome_df)
x = df[["user", "movie"]].values  # Features
y = ratings.values
#x, y = shuffle(x, y, random_state=42)

# Randomly split users
unique_users = np.unique(x[:, 0])  # x[:, 0] is the user column

# Split users into training and testing
train_users, test_users = train_test_split(unique_users, test_size=0.2, random_state=42)

# Filter original dataset based on the split user ids
train_mask = np.isin(x[:, 0], train_users)
test_mask = np.isin(x[:, 0], test_users)

# Create training and testing sets based on user split
x_train = x[train_mask]
y_train = y[train_mask]
x_test = x[test_mask]
y_test = y[test_mask]

# Function to yield batches of data for training

def data_generator(x_data, y_data, x_genomes, batch_size=BATCH_SIZE):
    num_samples = len(x_data)
    
    while True:  # Loop over the data forever (for Keras)
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            batch_x = x_data[start:end]
            batch_y = y_data[start:end]
            
            # Get movie IDs and use them to index into x_genomes
            batch_movie_ids = batch_x[:, 1].astype(int)
            batch_genomes = x_genomes[batch_movie_ids]
            
             # Get the rating for the batch
            batch_ratings = batch_y
            
            # Compute weights for each rating in the batch
            batch_weights = np.array([rating_weights.get(rating, 1) for rating in batch_ratings])
           
            # Separate user, movie, and genome features
            batch_user = batch_x[:, 0].astype(np.int32)
            batch_movie = batch_x[:, 1].astype(np.int32)
            
            # Convert to TensorFlow tensors
            batch_user = tf.convert_to_tensor(batch_user, dtype=tf.int32)
            batch_movie = tf.convert_to_tensor(batch_movie, dtype=tf.int32)
            batch_genomes = tf.convert_to_tensor(batch_genomes, dtype=tf.float32)
            batch_y = tf.convert_to_tensor(batch_y, dtype=tf.float32)

            # Yield the batch for Keras
            yield (batch_user, batch_movie, batch_genomes), (batch_y, batch_weights)
            
# Create data generators for training and testing
train_generator = data_generator(x_train, y_train, x_genomes, batch_size=BATCH_SIZE)
test_generator = data_generator(x_test, y_test, x_genomes, batch_size=BATCH_SIZE) 


In [12]:
# Create a user input, creating a layer for our user data
DROPOUT = 0.2
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(num_users, 500, name="User-Embedding", embeddings_regularizer=l2(0.01))(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)
#user_vec = Dropout(DROPOUT)(user_vec)
# Create a movie input, correlating the movies to each user
movie_input = Input(shape=[1], name="Movie-Input")
movie_embedding = Embedding(num_movies, 500, name="Movie-Embedding", embeddings_regularizer=l2(0.01))(movie_input)
movie_vec = Flatten(name="Flatten-Movies")(movie_embedding)
#movie_vec = Dropout(DROPOUT)(movie_vec)
# Create a genome input, futher describing the movies
genome_input = Input(shape=[num_genomes], name = "Genome-Input")
genome_vec = Dense(1127, name="Dense-Genome", activation = "relu")(genome_input)
#genome_vec = Dropout(DROPOUT)(genome_vec)
user_bias = Embedding(num_users, 1)(user_input)
user_bias = Flatten()(user_bias)
movie_bias = Embedding(num_movies, 1)(movie_input)
movie_bias = Flatten()(movie_bias)
#concatenate the layers
conc = Concatenate()([user_vec, movie_vec, genome_vec])

# add fully-connected-layers
fc1 = Dense(512, activation='relu', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.001))(conc)
fc1 = LeakyReLU(alpha=0.1)(fc1)  # Use LeakyReLU instead of ReLU
fc1 = Dropout(DROPOUT)(fc1)
fc2 = Dense(256, activation='relu', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.001))(fc1)
fc2 = LeakyReLU(alpha=0.1)(fc2)  # Use LeakyReLU instead of ReLU
fc2 = Dropout(DROPOUT)(fc2)
fc3 = Dense(128, activation='relu', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.001))(fc2)
fc3 = LeakyReLU(alpha=0.1)(fc3)  # Use LeakyReLU instead of ReLU
fc3 = Dropout(DROPOUT)(fc3)
fc4 = Dense(64, activation='relu', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.001))(fc3)
fc4 = LeakyReLU(alpha=0.1)(fc4)
out = Dense(1)(fc4)

out_with_biases = (lambda x: x[0] + x[1] + x[2])([out, user_bias, movie_bias])

# Create model and compile it
model = Model([user_input, movie_input, genome_input], out_with_biases)



#optimizer = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999)

#optimizer = RMSprop(learning_rate=0.0001, rho=0.9, epsilon=1e-07)
# Huber loss combines MSE and MAE and is more robust to outliers // unused 
#loss = tf.keras.losses.Huber(delta=1.0)  
optimizer = RMSprop(learning_rate=1e-4, rho=0.9, epsilon=1e-07)
loss = tf.keras.losses.MeanSquaredError()
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError()]
)

In [14]:
# create an EarlyStopping, stopping training if val_loss doesnt improve after 10 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
def lr_schedule(epoch, lr):
    if epoch % 5 == 0 and epoch > 0:
        lr = lr * 0.9  # Decay learning rate by 10% every 5 epochs
    return lr

lr_scheduler = LearningRateScheduler(lr_schedule)
# Train the model based on the data split
history = model.fit(
    train_generator, 
    epochs=100, 
    verbose = 1,
    callbacks = [early_stopping, lr_scheduler],
    validation_data=test_generator,
    steps_per_epoch=len(x_train) // BATCH_SIZE, 
    validation_steps=len(x_test) // BATCH_SIZE)

Epoch 1/100
[1m30841/30841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 10ms/step - loss: 18.1413 - mean_absolute_error: 0.1806 - mean_squared_error: 0.0535 - root_mean_squared_error: 0.2300 - val_loss: 0.0461 - val_mean_absolute_error: 0.1662 - val_mean_squared_error: 0.0461 - val_root_mean_squared_error: 0.2146 - learning_rate: 1.0000e-04
Epoch 2/100
[1m30841/30841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 10ms/step - loss: 0.0449 - mean_absolute_error: 0.1650 - mean_squared_error: 0.0449 - root_mean_squared_error: 0.2119 - val_loss: 0.0452 - val_mean_absolute_error: 0.1652 - val_mean_squared_error: 0.0452 - val_root_mean_squared_error: 0.2126 - learning_rate: 1.0000e-04
Epoch 3/100
[1m30841/30841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 10ms/step - loss: 0.0440 - mean_absolute_error: 0.1631 - mean_squared_error: 0.0440 - root_mean_squared_error: 0.2099 - val_loss: 0.0450 - val_mean_absolute_error: 0.1649 - val_mean_squared_error: 0.0450 - v

In [15]:
# Saving the model
model_dir = "../models/"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
model.save(os.path.join(model_dir, "movie_recommendation_model_with_genomes_retrained.keras")) 


In [None]:
movie_genome_df.to_csv("../data/movie_genome_df.csv")

In [None]:
# Load model to ensure it saved properly
loaded_model = load_model(os.path.join(model_dir, "movie_recommendation_model_with_genomes0_1614.keras"))
loaded_model.summary()

In [None]:
#Plot training and validation loss
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss - mean absolute error")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()

#Plot training and validation accuracy
plt.plot(history.history["mean_squared_error"])
plt.plot(history.history["val_mean_squared_error"])

plt.title("model accuracy")
plt.ylabel("mean absolute error")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()

In [None]:
def data_generator_evaluate(x_data, y_data, x_genomes, batch_size=BATCH_SIZE):
    num_samples = len(x_data)
    
    while True:  # Loop over the data forever (for Keras)
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            batch_x = x_data[start:end]
            batch_y = y_data[start:end]
            
            # Get movie IDs and use them to index into x_genomes
            batch_movie_ids = batch_x[:, 1].astype(int)
            batch_genomes = x_genomes[batch_movie_ids]
            
             # Get the rating for the batch
            batch_ratings = batch_y
            
            # Compute weights for each rating in the batch
            batch_weights = np.array([rating_weights.get(rating, 1) for rating in batch_ratings])
           
            # Separate user, movie, and genome features
            batch_user = batch_x[:, 0].astype(np.int32)
            batch_movie = batch_x[:, 1].astype(np.int32)
            
            # Convert to TensorFlow tensors
            batch_user = tf.convert_to_tensor(batch_user, dtype=tf.int32)
            batch_movie = tf.convert_to_tensor(batch_movie, dtype=tf.int32)
            batch_genomes = tf.convert_to_tensor(batch_genomes, dtype=tf.float32)
            batch_y = tf.convert_to_tensor(batch_y, dtype=tf.float32)

            # Yield the batch for Keras
            yield (batch_user, batch_movie, batch_genomes), batch_y
            
            
evaluation_generator = data_generator_evaluate(x_test, y_test, x_genomes, batch_size=BATCH_SIZE) 

In [None]:
loaded_model.evaluate(evaluation_generator)