In [103]:
from datetime import datetime
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
import keras.backend as K
from keras import Model, Sequential
from keras.layers import Embedding, Input, Flatten, Dot, Add
from keras.optimizers import Adam
from keras.regularizers import l2
# Address of the MLflow tracking server that receives every param/metric logged below.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5001"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [104]:
# Raw inputs: one rating per row, plus the user and movie catalogues.
ratings = pd.read_csv('../data/scores.csv')
df_users = pd.read_csv('../data/usuarios.csv')
df_movies = pd.read_csv('../data/peliculas.csv')

# Movies without an IMDB link get an empty string instead of NaN.
df_movies['IMDB URL'] = df_movies['IMDB URL'].fillna('')

# Map every original id to a contiguous index starting at 1, in order of
# first appearance in `ratings` (index 0 is therefore never assigned).
u_unique = ratings.user_id.unique()
user2Idx = {user_id: idx for idx, user_id in enumerate(u_unique, start=1)}

m_unique = ratings.movie_id.unique()
movie2Idx = {movie_id: idx for idx, movie_id in enumerate(m_unique, start=1)}

# Replace the original ids with their contiguous indices.
ratings.user_id = ratings.user_id.map(user2Idx)
ratings.movie_id = ratings.movie_id.map(movie2Idx)

ratings.head(5)

Unnamed: 0,id,user_id,movie_id,rating,Date
0,0,1,1,3,1997-12-04 15:55:49
1,1,2,2,3,1998-04-04 19:22:22
2,2,3,3,1,1997-11-07 07:18:36
3,3,4,4,2,1997-11-27 05:02:03
4,4,5,5,1,1998-02-02 05:33:16


In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for validation. The seed is fixed so that the
# split (and every metric derived from it) is identical after a kernel
# restart; without it each run is evaluated on a different validation set.
split_seed = 42
ratings_train, ratings_val = train_test_split(ratings, test_size=0.2, random_state=split_seed)

In [20]:
# Distinct users / movies in the full data set and in the training split.
# A training count lower than the full count means some ids only occur in
# the validation split.
n_users, n_movies = (int(ratings[col].nunique()) for col in ('user_id', 'movie_id'))
n_users_train, n_movies_train = (
    int(ratings_train[col].nunique()) for col in ('user_id', 'movie_id')
)
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1655


In [6]:
# Experiment setup: make "Baselines" the active MLflow experiment for the runs below.
experiment_name = "Baselines"
mlflow.set_experiment(experiment_name=experiment_name)

2024/11/03 19:26:57 INFO mlflow.tracking.fluent: Experiment with name 'Baselines' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/826553565838014976', creation_time=1730672817055, experiment_id='826553565838014976', last_update_time=1730672817055, lifecycle_stage='active', name='Baselines', tags={}>

In [160]:
# Re-running this cell while a previous run is still open makes
# mlflow.start_run raise ("Run ... is already active"). Close any run left
# over from an earlier execution so the cell can be re-run safely.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Open the run that collects every param/metric/model logged below.
mlflow.start_run(run_name="Early Stopping + latent factor 5 + lr 0.0014")

<ActiveRun: >

In [161]:
# Width of the user and movie embedding vectors; recorded on the active run.
n_latent_factors = 5
mlflow.log_param(key="n_latent_factors", value=n_latent_factors)

5

In [162]:
# L2 penalties. Each strength is defined exactly once and that same variable
# feeds both the layer and the MLflow param, so the logged value cannot drift
# from the one actually applied.
movie_embedding_regularizer = 0.001
user_embedding_regularizer = 0.00
# Shared by the user embedding and by both bias tables.
l2_reg = l2(user_embedding_regularizer)
mlflow.log_param("movie_embedding_regularizer_l2", movie_embedding_regularizer)
mlflow.log_param("user_embedding_regularizer_l2", user_embedding_regularizer)

# --- Movie branch ---------------------------------------------------------
# Tables have `n_movies + 1` rows because the remapped ids start at 1.
movie_input = Input(shape=[1], name='Item')
movie_embedding = Embedding(n_movies + 1,
                            n_latent_factors,
                            embeddings_regularizer=l2(movie_embedding_regularizer),
                            name='Movie-Embedding')(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)

# One scalar bias per movie.
m_biases = Flatten(name='movie_biases_flt')(
    Embedding(n_movies + 1, 1, name="movie_biases", embeddings_regularizer=l2_reg)(movie_input)
)

# --- User branch ----------------------------------------------------------
user_input = Input(shape=[1], name='User')
user_vec = Flatten(name='FlattenUsers')(
    Embedding(n_users + 1, n_latent_factors, embeddings_regularizer=l2_reg, name='User-Embedding')(user_input)
)
# One scalar bias per user.
u_biases = Flatten(name='user_biases_flt')(
    Embedding(n_users + 1, 1, name="user_biases", embeddings_regularizer=l2_reg)(user_input)
)

In [163]:
# Prediction = dot(movie vector, user vector) + user bias + movie bias.
prod = Dot(axes=1, name='DotProduct')([movie_vec, user_vec])
out = Add()([prod, u_biases, m_biases])

# Input order is (user, movie); fit/evaluate calls must pass arrays in that order.
model = Model(inputs=[user_input, movie_input], outputs=out)
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Item (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 User (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 Movie-Embedding (Embedding)    (None, 1, 5)         8415        ['Item[0][0]']                   
                                                                                                  
 User-Embedding (Embedding)     (None, 1, 5)         4720        ['User[0][0]']                   
                                                                                           

In [164]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between predictions and targets.

    The mean is taken over every element of the tensors passed in, so the
    result is a single scalar for that call.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [165]:
# Adam on mean squared error; RMSE is tracked as an extra metric.
lr = 0.0014
model.compile(
    optimizer=Adam(learning_rate=lr),
    loss='mean_squared_error',
    metrics=[root_mean_squared_error],
)
mlflow.log_param("lr", lr)

0.0014

In [166]:
from keras.callbacks import ModelCheckpoint

# Write to weights1.hdf5 only on epochs where the validation RMSE improves.
checkpointer = ModelCheckpoint(
    filepath='weights1.hdf5',
    monitor='val_root_mean_squared_error',
    save_best_only=True,
    verbose=1,
)

In [167]:
# Imported from `keras`, like every other Keras object in this notebook
# (Model, layers, ModelCheckpoint), instead of `tensorflow.keras`: mixing the
# two namespaces can break when the installed packages differ.
from keras.callbacks import EarlyStopping

# Stop after `patience` epochs without improvement in val_loss and roll the
# model back to the weights of the best epoch.
patience = 5
early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
mlflow.log_param("early_stopping_patience", patience)

5

In [168]:
# Training hyper-parameters, recorded on the active MLflow run.
batch_size = 320
epochs = 100
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("epochs", epochs)

# Inputs follow the model's (user, movie) input order.
train_inputs = [ratings_train.user_id, ratings_train.movie_id]
val_inputs = [ratings_val.user_id, ratings_val.movie_id]

history = model.fit(
    x=train_inputs,
    y=ratings_train.rating,
    validation_data=(val_inputs, ratings_val.rating),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpointer, early_stopping],
    verbose=1,
)

Epoch 1/100
Epoch 1: val_root_mean_squared_error improved from inf to 3.25344, saving model to weights1.hdf5
Epoch 2/100
Epoch 2: val_root_mean_squared_error improved from 3.25344 to 2.27224, saving model to weights1.hdf5
Epoch 3/100
Epoch 3: val_root_mean_squared_error improved from 2.27224 to 1.59860, saving model to weights1.hdf5
Epoch 4/100
Epoch 4: val_root_mean_squared_error improved from 1.59860 to 1.34544, saving model to weights1.hdf5
Epoch 5/100
Epoch 5: val_root_mean_squared_error improved from 1.34544 to 1.23259, saving model to weights1.hdf5
Epoch 6/100
Epoch 6: val_root_mean_squared_error improved from 1.23259 to 1.16727, saving model to weights1.hdf5
Epoch 7/100
Epoch 7: val_root_mean_squared_error improved from 1.16727 to 1.12464, saving model to weights1.hdf5
Epoch 8/100
Epoch 8: val_root_mean_squared_error improved from 1.12464 to 1.09385, saving model to weights1.hdf5
Epoch 9/100
Epoch 9: val_root_mean_squared_error improved from 1.09385 to 1.07033, saving model to w

In [169]:
# Log the whole training curve, one MLflow step per epoch, instead of only
# the value of the final epoch. With early stopping the final epoch is not
# the best one, so a single last value hides where the optimum was. The
# latest value of each metric is still the last epoch's, as before.
for position, epoch in enumerate(history.epoch):
    mlflow.log_metrics(
        {name: float(values[position]) for name, values in history.history.items()},
        step=epoch,
    )

In [170]:
# Loss and compiled metric on the validation split, using the weights currently in memory.
model.evaluate(
    x=[ratings_val.user_id, ratings_val.movie_id],
    y=ratings_val.rating,
)



[0.8651050925254822, 0.9024324417114258]

In [171]:
# Restore the checkpoint with the best validation RMSE seen during training.
model.load_weights('weights1.hdf5')

# Compute the validation error directly from the predictions.
# model.evaluate() is not used for the logged numbers because:
#   * its first value is the training objective, which includes the L2
#     regularisation penalty and is therefore not the plain MSE;
#   * the RMSE metric is averaged over evaluation batches, and a mean of
#     per-batch RMSEs is not the RMSE of the whole validation set.
val_true = ratings_val.rating.to_numpy(dtype='float64')
val_pred = model.predict([ratings_val.user_id, ratings_val.movie_id]).reshape(-1)
mse = float(np.mean(np.square(val_pred - val_true)))
rmse = float(np.sqrt(mse))
mlflow.log_metric("val_mse", mse)
mlflow.log_metric("val_rmse", rmse)



In [13]:
# Fetch the embedding layers by the names they were given at construction.
# Positional access (model.layers[2], model.layers[3]) depends on the order
# Keras assigns to the graph and silently picks the wrong layer if the
# architecture changes.
movie_embeddings_layer = model.get_layer('Movie-Embedding')
user_embeddings_layer = model.get_layer('User-Embedding')

movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

In [131]:
# Store the Keras model on the active MLflow run under the name "best_model".
# NOTE(review): this logs whatever weights `model` holds when the cell runs —
# confirm the best checkpoint has been loaded before executing it.
mlflow.keras.log_model(model, "best_model")



INFO:tensorflow:Assets written to: /var/folders/7f/19f36bv57_72qpq3mfj1x6240000gn/T/tmpuj_5ulg9/model/data/model/assets


INFO:tensorflow:Assets written to: /var/folders/7f/19f36bv57_72qpq3mfj1x6240000gn/T/tmpuj_5ulg9/model/data/model/assets


<mlflow.models.model.ModelInfo at 0x7f9fca012310>

In [172]:
# Close the active MLflow run, marking it as successfully finished.
mlflow.end_run(status="FINISHED")

2024/11/03 20:10:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run Early Stopping + latent factor 5 + lr 0.0014 at: http://127.0.0.1:5001/#/experiments/826553565838014976/runs/bdfe7355c87849a0951c10d2c07f5cfa.
2024/11/03 20:10:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/826553565838014976.


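#### Hay una diferencia de 1 entre n_movies, n_users y el shape de las matrices de embeddings: los índices de usuarios y películas empiezan en 1, por lo que la fila 0 de cada matriz queda reservada y sin usar.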

In [16]:
# The first weight array of each embedding layer is its lookup table.
movie_embeddings_matrix, user_embeddings_matrix = (
    layer.get_weights()[0]
    for layer in (movie_embeddings_layer, user_embeddings_layer)
)

# Row counts are compared against the entity counts in the displayed tuple.
(
    movie_embeddings_matrix.shape,
    user_embeddings_matrix.shape,
    n_movies,
    n_users,
)

((1683, 3), (944, 3), 1682, 943)

In [17]:
# Persist the embeddings and the id -> row-index maps for the vector DB.
# np.save does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout would otherwise fail).
vector_db_dir = Path('../data/vector_db')
vector_db_dir.mkdir(parents=True, exist_ok=True)

np.save(vector_db_dir / 'movie_embeddings_matrix.npy', movie_embeddings_matrix)
np.save(vector_db_dir / 'user_embeddings_matrix.npy', user_embeddings_matrix)

# NOTE: the two dicts below are stored as pickled object arrays. Reading them
# back requires np.load(..., allow_pickle=True).item(); only load these files
# from a trusted location, since unpickling can execute arbitrary code.
np.save(vector_db_dir / 'user2Idx.npy', user2Idx)
np.save(vector_db_dir / 'movie2Idx.npy', movie2Idx)