In [1]:
import pandas as pd
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5001")

In [2]:
ratings = pd.read_csv('../data/scores.csv')
df_users = pd.read_csv('../data/usuarios.csv')
df_movies = pd.read_csv('../data/peliculas.csv')

df_movies.loc[df_movies['IMDB URL'].isna(), 'IMDB URL'] = ''

u_unique = ratings.user_id.unique()
user2Idx = {o:i+1 for i,o in enumerate(u_unique)}

m_unique = ratings.movie_id.unique()
movie2Idx = {o:i+1 for i,o in enumerate(m_unique)}

ratings.user_id = ratings.user_id.apply(lambda x: user2Idx[x])

ratings.movie_id = ratings.movie_id.apply(lambda x: movie2Idx[x])

ratings.head(5)

Unnamed: 0,id,user_id,movie_id,rating,Date
0,0,1,1,3,1997-12-04 15:55:49
1,1,2,2,3,1998-04-04 19:22:22
2,2,3,3,1,1997-11-07 07:18:36
3,3,4,4,2,1997-11-27 05:02:03
4,4,5,5,1,1998-02-02 05:33:16


In [3]:
from sklearn.model_selection import train_test_split
ratings_train, ratings_val = train_test_split(ratings, test_size=0.2)

In [4]:
n_users = int(ratings.user_id.nunique())
n_movies = int(ratings.movie_id.nunique())
n_users_train = int(ratings_train.user_id.nunique())
n_movies_train = int(ratings_train.movie_id.nunique())
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1646


In [5]:
max_rating = ratings_train['rating'].max()
min_rating = ratings_train['rating'].min()
av_rating = ratings_train['rating'].mean()
max_rating, min_rating, av_rating

(5, 1, 3.5310125)

In [46]:
#Seteo del experimento
experiment_name = "Neural Network"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/981061820529434616', creation_time=1730667820019, experiment_id='981061820529434616', last_update_time=1730667820019, lifecycle_stage='active', name='Neural Network', tags={}>

In [7]:
from keras.layers import Input, Embedding, Flatten, Dropout, Concatenate, Dense, Activation, Lambda
from keras import Model
from keras.regularizers import l2
from keras.optimizers import Adam

2024-11-28 13:33:38.728983: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [91]:
mlflow.start_run(run_name="Latent factor 5")

<ActiveRun: >

In [92]:
n_latent_factors_user = 5
mlflow.log_param("n_latent_factors_user", n_latent_factors_user)
n_latent_factors_movie = 5
mlflow.log_param("n_latent_factors_movie", n_latent_factors_movie)

5

In [93]:
movie_embedding_regularizer = 0.001
mlflow.log_param("movie_embedding_regularizer_l2", movie_embedding_regularizer)

movie_input = Input(shape=[1],name='Item')
movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding', embeddings_regularizer = l2(movie_embedding_regularizer))(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)
#movie_vec = Dropout(0.2)(movie_vec)

user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, 
n_latent_factors_user,name='User-Embedding')(user_input))
#user_vec = Dropout(0.2)(user_vec)

concat = Concatenate(name='Concat')([movie_vec, user_vec])
#concat = Dropout(0.2)(concat)

x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
#x = Dropout(0.5)(x)
# x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
# x = Dropout(0.5)(x)


## Se pueden sacar las siguientes dos lineas para no forzar a sigmoidea
x = Dense(1, activation='sigmoid',name='Activation')(x)
x = Lambda(lambda z: (max_rating - min_rating) * z + min_rating)(x)
##

model = Model([user_input, movie_input], x)
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Item (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 User (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 Movie-Embedding (Embedding)    (None, 1, 5)         8415        ['Item[0][0]']                   
                                                                                                  
 User-Embedding (Embedding)     (None, 1, 5)         4720        ['User[0][0]']                   
                                                                                            

In [94]:
import keras.backend as K 
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [95]:
lr = 0.001
model.compile(Adam(learning_rate=lr), 'mean_squared_error', metrics=[root_mean_squared_error])
mlflow.log_param("lr", lr)

0.001

In [96]:
from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True, monitor='val_root_mean_squared_error')

In [97]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
patience = 7
early_stopping = EarlyStopping(monitor='val_root_mean_squared_error', patience=patience, restore_best_weights=True)
mlflow.log_param("early_stopping_patience", patience)

7

In [98]:
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',      # Métrica a monitorear (puede ser 'val_loss' o 'loss')
    factor=0.5,              # Factor de reducción del learning rate (e.g., reduce a la mitad)
    patience=2,              # Número de épocas sin mejora antes de reducir
    min_lr=1e-6,             # Learning rate mínimo permitido
    verbose=1                # Mostrar logs cuando se reduzca el LR
)
mlflow.log_param("reduce_lr", reduce_lr)

<keras.callbacks.ReduceLROnPlateau at 0x18b428ee0>

In [99]:
batch_size = 320
epochs = 100
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("epochs", epochs)

history = model.fit([ratings_train.user_id, ratings_train.movie_id], 
                    ratings_train.rating, 
                    validation_data=([ratings_val.user_id, ratings_val.movie_id], ratings_val.rating), 
                    batch_size = batch_size,
                    callbacks = [checkpointer],
                    epochs=epochs, verbose=1)

Epoch 1/100
Epoch 1: val_root_mean_squared_error improved from inf to 0.95456, saving model to weights.hdf5
Epoch 2/100
Epoch 2: val_root_mean_squared_error improved from 0.95456 to 0.94188, saving model to weights.hdf5
Epoch 3/100
Epoch 3: val_root_mean_squared_error improved from 0.94188 to 0.93882, saving model to weights.hdf5
Epoch 4/100
Epoch 4: val_root_mean_squared_error improved from 0.93882 to 0.93833, saving model to weights.hdf5
Epoch 5/100
Epoch 5: val_root_mean_squared_error improved from 0.93833 to 0.93801, saving model to weights.hdf5
Epoch 6/100
Epoch 6: val_root_mean_squared_error improved from 0.93801 to 0.93726, saving model to weights.hdf5
Epoch 7/100
Epoch 7: val_root_mean_squared_error improved from 0.93726 to 0.93579, saving model to weights.hdf5
Epoch 8/100
Epoch 8: val_root_mean_squared_error improved from 0.93579 to 0.93394, saving model to weights.hdf5
Epoch 9/100
Epoch 9: val_root_mean_squared_error improved from 0.93394 to 0.93137, saving model to weights.h

In [54]:
for key, value in history.history.items():
  mlflow.log_metric(key, value[-1]) 

In [49]:
model.evaluate([ratings_val.user_id, ratings_val.movie_id], ratings_val.rating)



[0.8871934413909912, 0.9262680411338806]

In [100]:
model.load_weights('weights.hdf5')
mse, rmse = model.evaluate([ratings_val.user_id, ratings_val.movie_id], ratings_val.rating)
mlflow.log_metric("val_mse", mse) 
mlflow.log_metric("val_rmse", rmse) 



In [101]:
movie_embeddings_layer = model.layers[2]
user_embeddings_layer = model.layers[3]

movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

In [102]:
mlflow.keras.log_model(model, "best_model")



INFO:tensorflow:Assets written to: /var/folders/7f/19f36bv57_72qpq3mfj1x6240000gn/T/tmpolxub5_u/model/data/model/assets


INFO:tensorflow:Assets written to: /var/folders/7f/19f36bv57_72qpq3mfj1x6240000gn/T/tmpolxub5_u/model/data/model/assets


<mlflow.models.model.ModelInfo at 0x18b6e1a00>

In [103]:
mlflow.end_run()

2024/11/28 14:11:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run Latent factor 5 at: http://127.0.0.1:5001/#/experiments/981061820529434616/runs/bb6fb7b6bf92488bbc6717fcb20c1368.
2024/11/28 14:11:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/981061820529434616.


#### Hay una diferencia de 1 entre n_movies, n_users y  el shape de las matrices de embeddigs

In [104]:
movie_embeddings_matrix = movie_embeddings_layer.get_weights()[0]
user_embeddings_matrix = user_embeddings_layer.get_weights()[0]

movie_embeddings_matrix.shape, user_embeddings_matrix.shape, n_movies, n_users

((1683, 5), (944, 5), 1682, 943)

In [105]:
import numpy as np

np.save('../data/vector_db/movie_embeddings_matrix.npy', movie_embeddings_matrix)
np.save('../data/vector_db/user_embeddings_matrix.npy', user_embeddings_matrix)
np.save('../data/vector_db/user2Idx.npy', user2Idx)
np.save('../data/vector_db/movie2Idx.npy', movie2Idx)