In [4]:
import pandas as pd
import numpy as np
from keras import Model, Sequential
from keras.layers import Embedding, Input, Flatten, Dot, Add
from keras.regularizers import l2
from keras.optimizers import Adam
import keras.backend as K 

from datetime import datetime

In [6]:
# Load the ratings, user and movie tables from the project data folder.
ratings = pd.read_csv('../data/scores.csv')
df_users = pd.read_csv('../data/usuarios.csv')
df_movies = pd.read_csv('../data/peliculas.csv')

# Replace missing IMDB URLs with an empty string.
df_movies.loc[df_movies['IMDB URL'].isna(), 'IMDB URL'] = ''

# Map every raw id to a contiguous index that starts at 1 (index 0 is never
# assigned), in order of first appearance in the ratings table.
u_unique = ratings.user_id.unique()
user2Idx = {raw_id: idx for idx, raw_id in enumerate(u_unique, start=1)}

m_unique = ratings.movie_id.unique()
movie2Idx = {raw_id: idx for idx, raw_id in enumerate(m_unique, start=1)}

# Overwrite the raw ids in the ratings table with their contiguous indices.
ratings['user_id'] = ratings['user_id'].map(user2Idx)
ratings['movie_id'] = ratings['movie_id'].map(movie2Idx)

ratings.head(5)

Unnamed: 0,id,user_id,movie_id,rating,Date
0,0,1,1,3,1997-12-04 15:55:49
1,1,2,2,3,1998-04-04 19:22:22
2,2,3,3,1,1997-11-07 07:18:36
3,3,4,4,2,1997-11-27 05:02:03
4,4,5,5,1,1998-02-02 05:33:16


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for validation. The fixed random_state makes
# the split (and every number derived from it) reproducible across
# Restart & Run All.
ratings_train, ratings_val = train_test_split(ratings, test_size=0.2, random_state=42)

In [8]:
# Distinct user / movie counts over all ratings, then over the training split only.
n_users, n_movies = (
    int(ratings[column].nunique()) for column in ('user_id', 'movie_id')
)
n_users_train, n_movies_train = (
    int(ratings_train[column].nunique()) for column in ('user_id', 'movie_id')
)
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1648


In [9]:
# Length of each latent-factor vector; used as the output size of the
# 'Movie-Embedding' and 'User-Embedding' layers.
n_latent_factors = 3

In [10]:
# Regulariser shared by the user factors and both bias tables. It is
# currently a zero penalty; alternative setting kept for reference:
#   l2_reg = l2(0.00025)
l2_reg = l2(0.00)

# ---- Movie branch: latent factors plus a scalar bias per movie ----
# Tables have n_movies + 1 rows because the movie indices start at 1.
movie_input = Input(shape=[1], name='Item')
movie_embedding = Embedding(
    input_dim=n_movies + 1,
    output_dim=n_latent_factors,
    embeddings_regularizer=l2(0.001),
    name='Movie-Embedding',
)(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)

m_biases = Flatten(name='movie_biases_flt')(
    Embedding(
        input_dim=n_movies + 1,
        output_dim=1,
        name="movie_biases",
        embeddings_regularizer=l2_reg,
    )(movie_input)
)

# ---- User branch: latent factors plus a scalar bias per user ----
user_input = Input(shape=[1], name='User')
user_vec = Flatten(name='FlattenUsers')(
    Embedding(
        input_dim=n_users + 1,
        output_dim=n_latent_factors,
        embeddings_regularizer=l2_reg,
        name='User-Embedding',
    )(user_input)
)
u_biases = Flatten(name='user_biases_flt')(
    Embedding(
        input_dim=n_users + 1,
        output_dim=1,
        name="user_biases",
        embeddings_regularizer=l2_reg,
    )(user_input)
)

In [11]:
# Prediction = dot(movie factors, user factors) + user bias + movie bias.
prod = Dot(axes=1, name='DotProduct')([movie_vec, user_vec])
out = Add()([prod, u_biases, m_biases])

# The model takes [user index, movie index] and outputs the predicted rating.
model = Model(inputs=[user_input, movie_input], outputs=out)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Item (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 User (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 Movie-Embedding (Embedding)    (None, 1, 3)         5049        ['Item[0][0]']                   
                                                                                                  
 User-Embedding (Embedding)     (None, 1, 3)         2832        ['User[0][0]']                   
                                                                                              

In [12]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between predictions and targets.

    Args:
        y_true: Target values.
        y_pred: Predicted values.

    Returns:
        The square root of the mean of ``(y_pred - y_true) ** 2``, computed
        with the Keras backend ops.
    """
    residual = y_pred - y_true
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)

In [13]:
# Optimise MSE with Adam and report RMSE as an extra metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=[root_mean_squared_error],
)

In [19]:
# Fit on the training split and evaluate on the validation split after each epoch.
# Inputs are passed as [user index, movie index], matching the model's input order.
history = model.fit(
    x=[ratings_train.user_id, ratings_train.movie_id],
    y=ratings_train.rating,
    batch_size=320,
    epochs=100,
    validation_data=(
        [ratings_val.user_id, ratings_val.movie_id],
        ratings_val.rating,
    ),
    #callbacks = [plot_metrics],
    verbose=1,
)

Epoch 1/100

<Figure size 640x480 with 0 Axes>

Epoch 2/100

<Figure size 640x480 with 0 Axes>

Epoch 3/100

<Figure size 640x480 with 0 Axes>

Epoch 4/100

<Figure size 640x480 with 0 Axes>

Epoch 5/100

<Figure size 640x480 with 0 Axes>

Epoch 6/100

<Figure size 640x480 with 0 Axes>

Epoch 7/100

<Figure size 640x480 with 0 Axes>

Epoch 8/100

<Figure size 640x480 with 0 Axes>

Epoch 9/100

<Figure size 640x480 with 0 Axes>

Epoch 10/100

<Figure size 640x480 with 0 Axes>

Epoch 11/100

<Figure size 640x480 with 0 Axes>

Epoch 12/100

<Figure size 640x480 with 0 Axes>

Epoch 13/100

<Figure size 640x480 with 0 Axes>

Epoch 14/100

<Figure size 640x480 with 0 Axes>

Epoch 15/100

<Figure size 640x480 with 0 Axes>

Epoch 16/100

<Figure size 640x480 with 0 Axes>

Epoch 17/100

<Figure size 640x480 with 0 Axes>

Epoch 18/100

<Figure size 640x480 with 0 Axes>

Epoch 19/100

<Figure size 640x480 with 0 Axes>

Epoch 20/100

<Figure size 640x480 with 0 Axes>

Epoch 21/100

<Figure size 640x480 with 0 Axes>

Epoch 22/100

<Figure size 640x480 with 0 Axes>

Epoch 23/100

<Figure size 640x480 with 0 Axes>

Epoch 24/100

<Figure size 640x480 with 0 Axes>

Epoch 25/100

<Figure size 640x480 with 0 Axes>

Epoch 26/100

<Figure size 640x480 with 0 Axes>

Epoch 27/100

<Figure size 640x480 with 0 Axes>

Epoch 28/100

<Figure size 640x480 with 0 Axes>

Epoch 29/100

<Figure size 640x480 with 0 Axes>

Epoch 30/100

<Figure size 640x480 with 0 Axes>

Epoch 31/100

<Figure size 640x480 with 0 Axes>

Epoch 32/100

<Figure size 640x480 with 0 Axes>

Epoch 33/100

<Figure size 640x480 with 0 Axes>

Epoch 34/100

<Figure size 640x480 with 0 Axes>

Epoch 35/100

KeyboardInterrupt: 

<Figure size 640x480 with 0 Axes>

In [11]:
# Score the model on the validation split; returns the loss followed by the compiled metrics.
model.evaluate(
    x=[ratings_val.user_id, ratings_val.movie_id],
    y=ratings_val.rating,
)



[0.852039635181427, 0.9004318118095398]

In [13]:
# Fetch the embedding layers by their explicit names instead of by position
# in model.layers: positional indices silently point at a different layer
# as soon as the model graph is changed.
movie_embeddings_layer = model.get_layer(name='Movie-Embedding')
user_embeddings_layer = model.get_layer(name='User-Embedding')

movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

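#### Hay una diferencia de 1 entre n_movies / n_users y el shape de las matrices de embeddings: los índices empiezan en 1, por lo que las capas se crean con n + 1 filas y la fila 0 queda sin usar.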

In [16]:
# Pull the first weight array out of each embedding layer.
movie_embeddings_matrix, user_embeddings_matrix = (
    layer.get_weights()[0]
    for layer in (movie_embeddings_layer, user_embeddings_layer)
)

movie_embeddings_matrix.shape, user_embeddings_matrix.shape, n_movies, n_users

((1683, 3), (944, 3), 1682, 943)

In [17]:
# Persist the embedding matrices.
np.save(file='../data/vector_db/movie_embeddings_matrix.npy', arr=movie_embeddings_matrix)
np.save(file='../data/vector_db/user_embeddings_matrix.npy', arr=user_embeddings_matrix)

# Persist the raw-id -> index dictionaries. np.save stores a dict as a pickled
# 0-d object array, so readers need np.load(..., allow_pickle=True).item().
np.save(file='../data/vector_db/user2Idx.npy', arr=user2Idx)
np.save(file='../data/vector_db/movie2Idx.npy', arr=movie2Idx)