In [1]:
import pandas as pd
import numpy as np
#import tensorflow as tf
#from tensorflow import keras
from keras import Model, Sequential
from keras.layers import Embedding, Input, Flatten, Dot, Add
from keras.regularizers import l2
from keras.optimizers import Adam
import keras.backend as K 

from datetime import datetime

2024-08-19 19:44:51.960759: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
ratings = pd.read_csv('../data/scores.csv')
df_users = pd.read_csv('../data/usuarios.csv')
df_movies = pd.read_csv('../data/peliculas.csv')

u_unique = ratings.user_id.unique()
user2Idx = {o:i+1 for i,o in enumerate(u_unique)}

m_unique = ratings.movie_id.unique()
movie2Idx = {o:i+1 for i,o in enumerate(m_unique)}

ratings.user_id = ratings.user_id.apply(lambda x: user2Idx[x])

ratings.movie_id = ratings.movie_id.apply(lambda x: movie2Idx[x])

ratings.head()

Unnamed: 0,id,user_id,movie_id,rating,Date
0,0,1,1,3,1997-12-04 15:55:49
1,1,2,2,3,1998-04-04 19:22:22
2,2,3,3,1,1997-11-07 07:18:36
3,3,4,4,2,1997-11-27 05:02:03
4,4,5,5,1,1998-02-02 05:33:16


In [3]:
from sklearn.model_selection import train_test_split
ratings_train, ratings_val = train_test_split(ratings, test_size=0.2)

In [4]:
n_users = int(ratings.user_id.nunique())
n_movies = int(ratings.movie_id.nunique())
n_users_train = int(ratings_train.user_id.nunique())
n_movies_train = int(ratings_train.movie_id.nunique())
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1642


In [5]:
n_latent_factors = 3

In [6]:
# l2_reg = l2(0.00025)
l2_reg = l2(0.00)
movie_input = Input(shape=[1], name='Item')
movie_embedding = Embedding(n_movies + 1, 
                            n_latent_factors, 
                            embeddings_regularizer = l2(0.001),
                            name='Movie-Embedding')(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)

m_biases = Flatten(name='movie_biases_flt')(Embedding(n_movies + 1, 1, name="movie_biases", embeddings_regularizer = l2_reg)(movie_input))

user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, n_latent_factors,embeddings_regularizer = l2_reg,name='User-Embedding')(user_input))
u_biases = Flatten(name='user_biases_flt')(Embedding(n_users + 1, 1, name="user_biases", embeddings_regularizer = l2_reg)(user_input))

In [7]:
prod = Dot(axes=1, name='DotProduct')([movie_vec, user_vec])
out = Add()([prod, u_biases, m_biases])
model = Model([user_input, movie_input], out)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Item (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 User (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 Movie-Embedding (Embedding)    (None, 1, 3)         5049        ['Item[0][0]']                   
                                                                                                  
 User-Embedding (Embedding)     (None, 1, 3)         2832        ['User[0][0]']                   
                                                                                              

In [8]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [9]:
model.compile(Adam(learning_rate=0.001), 'mean_squared_error', metrics=[root_mean_squared_error])

In [10]:
history = model.fit([ratings_train.user_id, ratings_train.movie_id], 
                    ratings_train.rating, 
                    batch_size=320,
                    validation_data=([ratings_val.user_id, ratings_val.movie_id], ratings_val.rating), 
                    epochs=100, 
                    #callbacks = [plot_losses],
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [11]:
model.evaluate([ratings_val.user_id, ratings_val.movie_id], ratings_val.rating)



[0.852039635181427, 0.9004318118095398]

In [13]:
movie_embeddings_layer = model.layers[2]
user_embeddings_layer = model.layers[3]

movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

#### Hay una diferencia de 1 entre n_movies, n_users y  el shape de las matrices de embeddigs

In [16]:
movie_embeddings_matrix = movie_embeddings_layer.get_weights()[0]
user_embeddings_matrix = user_embeddings_layer.get_weights()[0]

movie_embeddings_matrix.shape, user_embeddings_matrix.shape, n_movies, n_users

((1683, 3), (944, 3), 1682, 943)

In [17]:
np.save('../data/vector_db/movie_embeddings_matrix.npy', movie_embeddings_matrix)
np.save('../data/vector_db/user_embeddings_matrix.npy', user_embeddings_matrix)
np.save('../data/vector_db/user2Idx.npy', user2Idx)
np.save('../data/vector_db/movie2Idx.npy', movie2Idx)

In [2]:
movie_embeddings_matrix = np.load('../data/vector_db/movie_embeddings_matrix.npy')
user_embeddings_matrix = np.load('../data/vector_db/user_embeddings_matrix.npy')
user2Idx = np.load('../data/vector_db/user2Idx.npy', allow_pickle=True).item()
movie2Idx = np.load('../data/vector_db/movie2Idx.npy', allow_pickle=True).item()

In [5]:
df_users['userIdx'] = df_users['id'].apply(lambda x: user2Idx[x])
df_movies['movieIdx'] = df_movies['id'].apply(lambda x: movie2Idx[x])

In [6]:

#from config import client
from opensearchpy import OpenSearch
from opensearchpy import Field, Boolean, Float, Integer, Document, Keyword, Text, DenseVector, Nested, Date, Object


host = 'localhost'
port = 9200
auth = ('admin', 'J@r#o19b3')

client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
)


client.cluster.health()



{'cluster_name': 'docker-cluster',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'discovered_master': True,
 'discovered_cluster_manager': True,
 'active_primary_shards': 6,
 'active_shards': 6,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 2,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 75.0}

In [10]:
class KNNVector(Field):
    name = "knn_vector"
    def __init__(self, dimension, method, **kwargs):
        super(KNNVector, self).__init__(dimension=dimension, method=method, **kwargs)

method = {
    "name": "hnsw",
    "space_type": "cosinesimil",
    "engine": "nmslib"
}

index_name = 'movie'

#Ver función (dinámico y no dinámico)
client.indices.delete('movie')

class Movie(Document):
    movie_id = Keyword()
    url = Keyword()
    name = Text()
    vector = KNNVector(
        movie_embeddings_matrix.shape[1],
        method
    )
    created_at = Date()
    #terror = Boolean()

    class Index:
        name = index_name
        settings = {
                'index': {
                'knn': True
            }
        }

    def save(self, ** kwargs):
        self.meta.id = self.movie_id
        return super(Movie, self).save(** kwargs)

   
Movie.init(using=client)

client.indices.exists('movie')
client.indices.get('movie')



{'movie': {'aliases': {},
  'mappings': {'properties': {'created_at': {'type': 'date'},
    'movie_id': {'type': 'keyword'},
    'name': {'type': 'text'},
    'url': {'type': 'keyword'},
    'vector': {'type': 'knn_vector',
     'dimension': 3,
     'method': {'engine': 'nmslib',
      'space_type': 'cosinesimil',
      'name': 'hnsw',
      'parameters': {}}}}},
  'settings': {'index': {'replication': {'type': 'DOCUMENT'},
    'number_of_shards': '1',
    'provided_name': 'movie',
    'knn': 'true',
    'creation_date': '1724107801648',
    'number_of_replicas': '1',
    'uuid': 'xEmMB10nTLKmyX5wPSCgcA',
    'version': {'created': '136377827'}}}}}

In [48]:
df_movies.columns

Index(['id', 'Name', 'Release Date', 'IMDB URL', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'movieIdx'],
      dtype='object')

In [11]:
for i, row in df_movies.iterrows():
    try:

        mv = Movie(
            movie_id = row.id,
            url = row['IMDB URL'],
            name = row['Name'],
            vector = list(movie_embeddings_matrix[row.movieIdx]),
            created_at = datetime.now()
        )
        #print(list(movie_embeddings_matrix[row.movieIdx]))
        mv.save(using=client)
    except Exception as e:
        print(f"Error al guardar la película {row['Name']}: {e}")



Error al guardar la película unknown: RequestError(400, 'mapper_parsing_exception', 'failed to parse')




Error al guardar la película The Deadly Cure (1996): RequestError(400, 'mapper_parsing_exception', 'failed to parse')
Error al guardar la película Boys in Venice (1996): RequestError(400, 'mapper_parsing_exception', 'failed to parse')




In [12]:
#Movie.search(using=client).count()

movie_idx_to_search = 5

df_movies[df_movies['movieIdx'] == movie_idx_to_search]
# %%
movie_embeddings_matrix[movie_idx_to_search]

query = {
    "size": 5,
    "query": {
        "knn": {
        "vector": {
            "vector": movie_embeddings_matrix[movie_idx_to_search],
            "k" : 20
        }
        }
    }
}

response = client.search(index='movie', body=query)

for h in response['hits']['hits']:
  print(h)



{'_index': 'movie', '_id': '346', '_score': 1.0, '_source': {'movie_id': 346, 'url': 'http://us.imdb.com/M/title-exact?imdb-title-119396', 'name': 'Jackie Brown (1997)', 'vector': [0.0675366222858429, 0.09568929672241211, 0.22328245639801025], 'created_at': '2024-08-19T19:50:08.608149'}}
{'_index': 'movie', '_id': '987', '_score': 0.99915195, '_source': {'movie_id': 987, 'url': 'http://us.imdb.com/M/title-exact?Underworld%20(1997)', 'name': 'Underworld (1997)', 'vector': [0.000920130405575037, 0.0010935215977951884, 0.0028165383264422417], 'created_at': '2024-08-19T19:50:12.465129'}}
{'_index': 'movie', '_id': '285', '_score': 0.9991291, '_source': {'movie_id': 285, 'url': 'http://us.imdb.com/M/title-exact?Secrets%20&%20Lies%20(1996)', 'name': 'Secrets & Lies (1996)', 'vector': [0.06075748801231384, 0.08659423142671585, 0.22390972077846527], 'created_at': '2024-08-19T19:50:08.133847'}}
{'_index': 'movie', '_id': '1411', '_score': 0.99898773, '_source': {'movie_id': 1411, 'url': 'http:/