In [1]:
# import all relevant libraries
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from zipfile import ZipFile
import warnings
import os
import tensorflow as tf
import keras
from keras import layers
from keras import ops
from keras import activations
from keras.models import load_model, Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout, BatchNormalization, LeakyReLU
from keras.callbacks import EarlyStopping, LearningRateScheduler
from keras.regularizers import l2, l1_l2
from keras.optimizers import Adam, AdamW, RMSprop
from keras.optimizers.schedules import ExponentialDecay,CosineDecayRestarts
from tensorflow.keras import mixed_precision
from keras.initializers import GlorotUniform

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')
%matplotlib inline

2024-11-28 10:52:30.167094: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 10:52:30.176855: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732787550.186655    8136 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732787550.189546    8136 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 10:52:30.199322: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# checking that GPU is available for processing
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
# Load relevant databases
df_movies = pd.read_csv("../data/movies.csv", index_col = 0)
df = pd.read_csv("../data/ratings.csv") 

In [4]:
# preprocessing

# encode users and movies
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2user_encoded)
df["movie"] = df["movieId"].map(movie2movie_encoded)
#get lenghts for implementing Keras layers
num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])


print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)



Number of users: 138493, Number of Movies: 26744, Min rating: 0.5, Max rating: 5.0


In [5]:
# Calculate frequency of each rating
ratings = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating))
rating_counts = ratings.value_counts()
rating_freq = {rating: count / len(df) for rating, count in rating_counts.items()}

# Create a weight map (inverse of the frequency)
rating_weights = {rating: 1 / freq for rating, freq in rating_freq.items()}

In [6]:
#Try using batches, due to the extremely large 20M Dataset
#Not using batches results in MemoryErrors
# Define the batch size
BATCH_SIZE = 128 

# Prepare the feature matrix x and target vector y
x = df[["user", "movie"]].values  # Features
y = ratings.values
#x, y = shuffle(x, y, random_state=42)

# Randomly split users
unique_users = np.unique(x[:, 0])  # x[:, 0] is the user column

# Split users into training and testing
train_users, test_users = train_test_split(unique_users, test_size=0.2, random_state=42)

# Filter original dataset based on the split user ids
train_mask = np.isin(x[:, 0], train_users)
test_mask = np.isin(x[:, 0], test_users)

# Create training and testing sets based on user split
x_train = x[train_mask]
y_train = y[train_mask]
x_test = x[test_mask]
y_test = y[test_mask]

# Function to yield batches of data for training

def data_generator(x_data, y_data, batch_size=BATCH_SIZE):
    num_samples = len(x_data)
    
    while True:  # Loop over the data forever (for Keras)
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            batch_x = x_data[start:end]
            batch_y = y_data[start:end]
            
             # Get the rating for the batch
            batch_ratings = batch_y
            
            # Compute weights for each rating in the batch
            batch_weights = np.array([rating_weights.get(rating, 1) for rating in batch_ratings])
           
            # Separate user, movie, and genome features
            batch_user = batch_x[:, 0].astype(np.int32)
            batch_movie = batch_x[:, 1].astype(np.int32)
            
            # Convert to TensorFlow tensors
            batch_user = tf.convert_to_tensor(batch_user, dtype=tf.int32)
            batch_movie = tf.convert_to_tensor(batch_movie, dtype=tf.int32)
            batch_y = tf.convert_to_tensor(batch_y, dtype=tf.float32)

            # Yield the batch for Keras
            yield (batch_user, batch_movie), (batch_y, batch_weights)
            
# Create data generators for training and testing
train_generator = data_generator(x_train, y_train, batch_size=BATCH_SIZE)
test_generator = data_generator(x_test, y_test, batch_size=BATCH_SIZE) 


In [7]:
# Create a user input, creating a layer for our user data
DROPOUT = 0.2
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(num_users, 250, name="User-Embedding", embeddings_regularizer=l2(0.01))(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)
#user_vec = Dropout(DROPOUT)(user_vec)
# Create a movie input, correlating the movies to each user
movie_input = Input(shape=[1], name="Movie-Input")
movie_embedding = Embedding(num_movies, 250, name="Movie-Embedding", embeddings_regularizer=l2(0.01))(movie_input)
movie_vec = Flatten(name="Flatten-Movies")(movie_embedding)
#movie_vec = Dropout(DROPOUT)(movie_vec)
#concatenate the layers
conc = Concatenate()([user_vec, movie_vec])

# add fully-connected-layers
fc1 = Dense(512, activation='swish', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.01))(conc)
fc1 = Dropout(DROPOUT)(fc1)  # Dropout with 20% rate, to prevent overfitting
#fc1 = BatchNormalization()(fc1)
fc2 = Dense(256, activation='swish', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.01))(fc1)
fc2 = Dropout(DROPOUT)(fc2)  # Dropout with 20% rate, to prevent overfitting
#fc2 = BatchNormalization()(fc2)
fc3 = Dense(128, activation='swish', kernel_initializer=GlorotUniform(), kernel_regularizer=l2(0.01))(fc2)
fc3 = Dropout(DROPOUT)(fc3)  # Dropout with 20% rate, to prevent overfitting
#fc3 = BatchNormalization()(fc3)
out = Dense(1)(fc3)
# Create model and compile it
model = Model([user_input, movie_input], out)



optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

#optimizer = RMSprop(learning_rate=0.0001, rho=0.9, epsilon=1e-07)
# Huber loss combines MSE and MAE and is more robust to outliers // unused
#loss = tf.keras.losses.Huber(delta=1.0)  

loss = tf.keras.losses.MeanAbsoluteError()
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()]
)

I0000 00:00:1732787563.471771    8136 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7537 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6


In [8]:
# create an EarlyStopping, stopping training if val_loss doesnt improve after 10 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
def lr_schedule(epoch, lr):
    if epoch % 5 == 0 and epoch > 0:
        lr = lr * 0.9  # Decay learning rate by 10% every 5 epochs
    return lr

lr_scheduler = LearningRateScheduler(lr_schedule)
# Train the model based on the data split
history = model.fit(
    train_generator, 
    epochs=100, 
    verbose = 1,
    callbacks = [early_stopping, lr_scheduler],
    validation_data=test_generator,
    steps_per_epoch=len(x_train) // BATCH_SIZE, 
    validation_steps=len(x_test) // BATCH_SIZE)

Epoch 1/100


I0000 00:00:1732787565.196740    8222 service.cc:148] XLA service 0x7f412001ccf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732787565.196770    8222 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2024-11-28 10:52:45.227476: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1732787565.339829    8222 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m    22/124589[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10:17[0m 5ms/step - loss: 270.3899 - mean_absolute_error: 0.6287 - mean_squared_error: 0.4555

I0000 00:00:1732787566.394273    8222 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m124589/124589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 5ms/step - loss: 0.5842 - mean_absolute_error: 0.1872 - mean_squared_error: 0.0583 - val_loss: 0.1936 - val_mean_absolute_error: 0.1922 - val_mean_squared_error: 0.0679 - learning_rate: 0.0010
Epoch 2/100
[1m124589/124589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m658s[0m 5ms/step - loss: 0.1879 - mean_absolute_error: 0.1859 - mean_squared_error: 0.0576 - val_loss: 0.1891 - val_mean_absolute_error: 0.1882 - val_mean_squared_error: 0.0663 - learning_rate: 0.0010
Epoch 3/100
[1m124589/124589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 5ms/step - loss: 0.1878 - mean_absolute_error: 0.1859 - mean_squared_error: 0.0578 - val_loss: 0.1879 - val_mean_absolute_error: 0.1872 - val_mean_squared_error: 0.0641 - learning_rate: 0.0010
Epoch 4/100
[1m 84276/124589[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m3:18[0m 5ms/step - loss: 0.1878 - mean_absolute_error: 0.1860 - mean_squared_error: 0.0576

KeyboardInterrupt: 

In [None]:
# Saving the model
model_dir = "../models/"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
model.save(os.path.join(model_dir, "movie_recommendation_model.keras"))
# Load model to ensure it saved properly
loaded_model = load_model(os.path.join(model_dir, "movie_recommendation_model.keras"))
loaded_model.summary()

In [None]:
model.evaluate()

In [None]:
#Plot training and validation loss
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()

#Plot training and validation accuracy
plt.plot(history.history["mae"])
plt.plot(history.history["mse"])
plt.title("model accuracy")
plt.ylabel("accuracy")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()

In [None]:
#Show top 10 movie recommendations to a user
# Let us get a user and see the top recommendations.
user_id = df.userId.sample(1).iloc[0]
movies_watched_by_user = df[df.userId == user_id]
movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)
ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)