In [4]:
import os

import mlflow
import pandas as pd

# Tracking server that receives this notebook's runs. When the MLFLOW_TRACKING_URI
# environment variable is set it takes precedence, so the notebook can target another
# server without being edited; otherwise the local default below is used.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5001"))

In [None]:
# Raw inputs: ratings, users and movies.
ratings = pd.read_csv('../data/scores.csv')
df_users = pd.read_csv('../data/usuarios.csv')
df_movies = pd.read_csv('../data/peliculas.csv')

# Movies without an IMDB URL get an empty string instead of NaN.
missing_url = df_movies['IMDB URL'].isna()
df_movies.loc[missing_url, 'IMDB URL'] = ''

# Map raw ids to contiguous 1-based indices, numbered in order of first appearance
# in `ratings`. Index 0 is never assigned.
u_unique = ratings['user_id'].unique()
user2Idx = {raw_id: idx for idx, raw_id in enumerate(u_unique, start=1)}

m_unique = ratings['movie_id'].unique()
movie2Idx = {raw_id: idx for idx, raw_id in enumerate(m_unique, start=1)}

# Replace the raw ids in place with their indices.
ratings['user_id'] = ratings['user_id'].map(user2Idx)
ratings['movie_id'] = ratings['movie_id'].map(movie2Idx)

ratings.head(5)

AttributeError: 'DataFrame' object has no attribute 'userId'

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for validation. The split is seeded so that the same
# rows land in train/validation on every run, which keeps the metrics logged to
# MLflow comparable across runs. (Keras weight initialisation is not seeded here.)
ratings_train, ratings_val = train_test_split(ratings, test_size=0.2, random_state=42)

In [42]:
# Distinct users / movies in the full ratings table and in the training split only.
# The full-table counts size the embedding layers; the train counts show how many
# ids actually appear in the training data.
id_columns = ('user_id', 'movie_id')
n_users, n_movies = (int(ratings[col].nunique()) for col in id_columns)
n_users_train, n_movies_train = (int(ratings_train[col].nunique()) for col in id_columns)
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1649


In [43]:
# Range and mean of the training ratings; max_rating / min_rating are used later to
# rescale the network's sigmoid output back to the rating scale.
train_scores = ratings_train.rating
max_rating = train_scores.max()
min_rating = train_scores.min()
av_rating = train_scores.mean()
max_rating, min_rating, av_rating

(5, 1, 3.5301875)

In [44]:
# Select the MLflow experiment that the runs of this notebook are recorded under.
experiment_name = "Neural Network"
mlflow.set_experiment(experiment_name=experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/981061820529434616', creation_time=1730667820019, experiment_id='981061820529434616', last_update_time=1730667820019, lifecycle_stage='active', name='Neural Network', tags={}>

In [45]:
from keras.layers import Input, Embedding, Flatten, Dropout, Concatenate, Dense, Activation, Lambda
from keras import Model
from keras.regularizers import l2
from keras.optimizers import Adam

In [46]:
# Open the MLflow run that the following cells log parameters, metrics and the model
# into. The run is closed explicitly by the mlflow.end_run() cell further down.
mlflow.start_run(run_name="Early Stoping + latent factor 5 (3)")

<ActiveRun: >

In [47]:
# Embedding dimensionality for users and movies, recorded as run parameters.
n_latent_factors_user = 5
n_latent_factors_movie = 5

mlflow.log_param("n_latent_factors_user", n_latent_factors_user)
mlflow.log_param("n_latent_factors_movie", n_latent_factors_movie)

5

In [48]:
# Build the rating-prediction network: one embedding per movie and per user,
# concatenated and passed through a small dense head.

# L2 penalty applied to the movie embedding only (the user embedding below has none).
movie_embedding_regularizer = 0.001
mlflow.log_param("movie_embedding_regularizer_l2", movie_embedding_regularizer)

# Movie branch. The embedding has n_movies + 1 rows because the movie indices built
# earlier in the notebook start at 1, so row 0 is never looked up.
movie_input = Input(shape=[1],name='Item')
movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding', embeddings_regularizer = l2(movie_embedding_regularizer))(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)
# Disabled experiment: dropout on the movie vector.
#movie_vec = Dropout(0.2)(movie_vec)

# User branch, same layout as the movie branch but without a regularizer.
user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, 
n_latent_factors_user,name='User-Embedding')(user_input))
# Disabled experiment: dropout on the user vector.
#user_vec = Dropout(0.2)(user_vec)

# Joint representation: movie vector followed by user vector.
concat = Concatenate(name='Concat')([movie_vec, user_vec])
# Disabled experiment: dropout on the concatenated vector.
#concat = Dropout(0.2)(concat)

# Single hidden layer of 50 ReLU units.
x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
# Disabled experiments: dropout after the hidden layer / alternative hidden layer.
#x = Dropout(0.5)(x)
# x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
# x = Dropout(0.5)(x)


## The next two lines can be removed if the output should not be forced through a sigmoid.
# The sigmoid output in (0, 1) is rescaled linearly to [min_rating, max_rating].
# NOTE(review): the Lambda closes over the notebook globals max_rating / min_rating,
# so they must be defined wherever this layer is rebuilt or reloaded — confirm.
x = Dense(1, activation='sigmoid',name='Activation')(x)
x = Lambda(lambda z: (max_rating - min_rating) * z + min_rating)(x)
##

# Input order is [user, movie]; fit / evaluate / predict must pass arrays in that order.
model = Model([user_input, movie_input], x)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Item (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 User (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 Movie-Embedding (Embedding)    (None, 1, 5)         8415        ['Item[0][0]']                   
                                                                                                  
 User-Embedding (Embedding)     (None, 1, 5)         4720        ['User[0][0]']                   
                                                                                            

In [49]:
import keras.backend as K 
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between predictions and targets.

    The mean is taken over every element of ``y_pred - y_true``, so the result is a
    single scalar for the tensors passed in.

    Args:
        y_true: target values.
        y_pred: predicted values.

    Returns:
        Scalar tensor ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    residual = y_pred - y_true
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)

In [50]:
# Compile with Adam on mean squared error; RMSE is tracked as an extra metric.
lr = 0.001
optimizer = Adam(learning_rate=lr)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[root_mean_squared_error],
)
mlflow.log_param("lr", lr)

0.001

In [51]:
from keras.callbacks import ModelCheckpoint

# Write the model to weights1.hdf5 whenever the validation RMSE improves, so the file
# always holds the best epoch seen so far according to that metric.
checkpointer = ModelCheckpoint(
    filepath='weights1.hdf5',
    monitor='val_root_mean_squared_error',
    save_best_only=True,
    verbose=1,
)

In [52]:
# Import from `keras` like every other cell in this notebook; mixing the `keras` and
# `tensorflow.keras` namespaces can pair objects from two different Keras builds.
from keras.callbacks import EarlyStopping

# Stop training once val_loss has not improved for `patience` consecutive epochs and
# roll the model back to the best epoch by val_loss. `mode` is stated explicitly
# because lower is better for a loss.
# NOTE(review): the checkpoint callback monitors val_root_mean_squared_error while
# this one monitors val_loss, so the two can disagree on the "best" epoch — confirm
# this is intended.
patience = 5
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=patience,
    restore_best_weights=True,
)
mlflow.log_param("early_stopping_patience", patience)

5

In [53]:
# Training hyper-parameters, recorded on the MLflow run.
batch_size = 320
epochs = 100
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("epochs", epochs)

# Inputs are passed as [user, movie], matching the order the model was built with.
train_inputs = [ratings_train['user_id'], ratings_train['movie_id']]
val_inputs = [ratings_val['user_id'], ratings_val['movie_id']]

# Train with checkpointing and early stopping; `history` keeps the per-epoch metrics.
history = model.fit(
    x=train_inputs,
    y=ratings_train['rating'],
    validation_data=(val_inputs, ratings_val['rating']),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpointer, early_stopping],
    verbose=1,
)

Epoch 1/100
Epoch 1: val_root_mean_squared_error improved from inf to 0.95237, saving model to weights1.hdf5
Epoch 2/100
Epoch 2: val_root_mean_squared_error improved from 0.95237 to 0.93993, saving model to weights1.hdf5
Epoch 3/100
Epoch 3: val_root_mean_squared_error improved from 0.93993 to 0.93689, saving model to weights1.hdf5
Epoch 4/100
Epoch 4: val_root_mean_squared_error improved from 0.93689 to 0.93594, saving model to weights1.hdf5
Epoch 5/100
Epoch 5: val_root_mean_squared_error improved from 0.93594 to 0.93527, saving model to weights1.hdf5
Epoch 6/100
Epoch 6: val_root_mean_squared_error did not improve from 0.93527
Epoch 7/100
Epoch 7: val_root_mean_squared_error improved from 0.93527 to 0.93420, saving model to weights1.hdf5
Epoch 8/100
Epoch 8: val_root_mean_squared_error improved from 0.93420 to 0.93174, saving model to weights1.hdf5
Epoch 9/100
Epoch 9: val_root_mean_squared_error improved from 0.93174 to 0.92906, saving model to weights1.hdf5
Epoch 10/100
Epoch 10:

In [54]:
# Log the full training curve: one MLflow data point per epoch for every metric in
# the Keras history. Logging only the final epoch would hide the curve and, with
# early stopping, would report an epoch that is not the best one. The value MLflow
# shows as "latest" is still the last epoch's value.
for key, values in history.history.items():
    for epoch, value in enumerate(values):
        mlflow.log_metric(key, value, step=epoch)

In [55]:
# Evaluate on the validation split with the weights currently in memory after fit.
# Per the compile call, the result is [loss, root_mean_squared_error].
# NOTE(review): the loss presumably includes the L2 penalty of the movie embedding,
# so it is not the plain MSE — confirm before comparing it with the RMSE metric.
model.evaluate([ratings_val.user_id, ratings_val.movie_id], ratings_val.rating)



[0.8476566672325134, 0.9037073254585266]

In [56]:
# Reload the checkpoint written by ModelCheckpoint (best epoch by the metric it monitors).
model.load_weights('weights1.hdf5')

# Compute the validation error directly from the predictions. The loss returned by
# model.evaluate() also contains the L2 penalty of the movie embedding, and the
# compiled RMSE metric is an average of per-batch RMSEs, so neither of them is the
# plain MSE / RMSE over the validation set that the metric names promise.
val_pred = model.predict([ratings_val.user_id, ratings_val.movie_id], verbose=0).ravel()
val_err = val_pred - ratings_val.rating.to_numpy()
mse = float((val_err ** 2).mean())
rmse = mse ** 0.5
mlflow.log_metric("val_mse", mse)
mlflow.log_metric("val_rmse", rmse)



In [60]:
# Fetch the embedding layers by the names given when the model was built. Positional
# access through model.layers silently picks the wrong layer if the architecture
# (e.g. an added Dropout) or the layer ordering ever changes.
movie_embeddings_layer = model.get_layer('Movie-Embedding')
user_embeddings_layer = model.get_layer('User-Embedding')

movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

In [57]:
# Store the model, with the weights it holds at this point, as an artifact of the
# active MLflow run under the artifact path "best_model".
mlflow.keras.log_model(model, "best_model")



INFO:tensorflow:Assets written to: /var/folders/7f/19f36bv57_72qpq3mfj1x6240000gn/T/tmpx15stbtv/model/data/model/assets


INFO:tensorflow:Assets written to: /var/folders/7f/19f36bv57_72qpq3mfj1x6240000gn/T/tmpx15stbtv/model/data/model/assets


<mlflow.models.model.ModelInfo at 0x1884bbac0>

In [58]:
# Close the MLflow run opened earlier with mlflow.start_run(); nothing logged after
# this call is attached to that run.
mlflow.end_run()

2024/11/17 17:28:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Early Stoping + latent factor 5 (3) at: http://127.0.0.1:5001/#/experiments/981061820529434616/runs/4b88bd999cfa4d8e806b18d75ba56761.
2024/11/17 17:28:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/981061820529434616.


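#### Hay una diferencia de 1 entre n_movies, n_users y el shape de las matrices de embeddings: los índices empiezan en 1 y las capas Embedding se crearon con n + 1 filas, por lo que la fila 0 queda sin usar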

In [61]:
# Pull the learned weight matrix (first and only entry used from get_weights()) out of
# each embedding layer, then show the shapes next to the id counts for comparison.
movie_embeddings_matrix, user_embeddings_matrix = (
    layer.get_weights()[0]
    for layer in (movie_embeddings_layer, user_embeddings_layer)
)

movie_embeddings_matrix.shape, user_embeddings_matrix.shape, n_movies, n_users

((1683, 5), (944, 5), 1682, 943)

In [62]:
import os

import numpy as np

# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('../data/vector_db', exist_ok=True)

# Learned embedding matrices; row i belongs to the user / movie whose index is i.
np.save('../data/vector_db/movie_embeddings_matrix_1.npy', movie_embeddings_matrix)
np.save('../data/vector_db/user_embeddings_matrix_1.npy', user_embeddings_matrix)

# Raw-id -> index dictionaries. np.save stores a dict as a pickled 0-d object array,
# so reading it back requires np.load(path, allow_pickle=True).item(); only load
# these files from a trusted source.
np.save('../data/vector_db/user2Idx_1.npy', user2Idx)
np.save('../data/vector_db/movie2Idx_1.npy', movie2Idx)