In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
movie_embeddings_matrix = np.load('../data/vector_db/movie_embeddings_matrix.npy')
user_embeddings_matrix = np.load('../data/vector_db/user_embeddings_matrix.npy')
user2Idx = np.load('../data/vector_db/user2Idx.npy', allow_pickle=True).item()
movie2Idx = np.load('../data/vector_db/movie2Idx.npy', allow_pickle=True).item()

In [4]:
ratings = pd.read_csv('../data/scores.csv')
df_users = pd.read_csv('../data/usuarios.csv')
df_movies = pd.read_csv('../data/peliculas.csv')
df_movies.loc[df_movies['IMDB URL'].isna(), 'IMDB URL'] = ''

user_movies = ratings[ratings['user_id'] == 196]
user_movies = user_movies.loc[user_movies['rating'].idxmax()]
user_movies

id                         1896
user_id                     196
movie_id                    655
rating                        5
Date        1997-12-04 16:09:53
Name: 1896, dtype: object

In [5]:
df_users['userIdx'] = df_users['id'].apply(lambda x: user2Idx[x])
df_movies['movieIdx'] = df_movies['id'].apply(lambda x: movie2Idx[x])


In [7]:
import sys
import os
notebook_dir = os.path.dirname(os.path.abspath('recommendation_system.ipynb'))
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(project_root)

#from config.config import client
from src.services.database.models import MovieV, UserV
from src.services.database.vector_db_service import VectorDbService
vdbs = VectorDbService()
client = vdbs.client
client.cluster.health()

{'cluster_name': 'docker-cluster',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'discovered_master': True,
 'discovered_cluster_manager': True,
 'active_primary_shards': 10,
 'active_shards': 10,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 6,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 62.5}

In [9]:
try:
    #client.indices.delete('movie')
    MovieV.init(using=client)
    print("Índice inicializado correctamente.")
except Exception as e:
    print(f"Error al inicializar el índice: {e}")

Índice inicializado correctamente.


In [11]:
try:
    #client.indices.delete('user')
    UserV.init(using=client)
    print("Índice inicializado correctamente.")
except Exception as e:
    print(f"Error al inicializar el índice: {e}")

Índice inicializado correctamente.


In [12]:
for i, row in df_movies.iterrows():
    try:

        mv = MovieV(
            movie_id = row.id,
            url = row['IMDB URL'],
            name = row['Name'],
            vector = list(movie_embeddings_matrix[row.movieIdx]),
            created_at = datetime.now()
        )
        mv.save(using=client)
        #Movie.search(using=client).count()
        #mv.save(using=client)
    except Exception as e:
        print(f"Error al guardar la película {row['Name']}: {e}")

In [13]:
for i, row in df_users.iterrows():
    try:

        uv = UserV(
            user_id = row.id,
            occupation = row['Occupation'],
            created_at = datetime.now(),
            active_since = row['Active Since'],
            vector = list(user_embeddings_matrix[row.userIdx])
        )
        uv.save(using=client)
    except Exception as e:
        print(f"Error al guardar el usuario {row.id}: {e}")

In [14]:
MovieV.search(using=client).count()

1682

In [15]:
UserV.search(using=client).count()

943

In [16]:
movie_idx_to_search = 5

df_movies[df_movies['movieIdx'] == movie_idx_to_search]
# %%
movie_embeddings_matrix[movie_idx_to_search]

array([-0.01547333,  0.1139739 , -0.06937604, -0.06899811,  0.03958286],
      dtype=float32)

In [17]:
user_idx_to_search = 5

df_users[df_users['userIdx'] == user_idx_to_search]
# %%
user_embeddings_matrix[user_idx_to_search]

array([-0.22776741, -0.02918367,  0.01204615, -0.00425285,  0.0612423 ],
      dtype=float32)

In [18]:
movie_idx_to_search = 5

df_movies[df_movies['movieIdx'] == movie_idx_to_search]
# %%
print(type(movie_embeddings_matrix[movie_idx_to_search]))

query = {
    "size": 5,
    "query": {
        "knn": {
        "vector": {
            "vector": movie_embeddings_matrix[movie_idx_to_search],
            "k" : 20
        }
        }
    }
}

response = client.search(index='movie', body=query)

for h in response['hits']['hits']:
  print(h)

<class 'numpy.ndarray'>
{'_index': 'movie', '_id': '346', '_score': 0.99999976, '_source': {'movie_id': 346, 'url': 'http://us.imdb.com/M/title-exact?imdb-title-119396', 'name': 'Jackie Brown (1997)', 'vector': [-0.015473331324756145, 0.11397390067577362, -0.06937604397535324, -0.06899810582399368, 0.03958285599946976], 'created_at': '2024-11-28T14:15:45.992197'}}
{'_index': 'movie', '_id': '276', '_score': 0.99130285, '_source': {'movie_id': 276, 'url': 'http://us.imdb.com/M/title-exact?Leaving%20Las%20Vegas%20(1995)', 'name': 'Leaving Las Vegas (1995)', 'vector': [-0.0052984063513576984, 0.09505090117454529, -0.04360811784863472, -0.05844560265541077, 0.024079328402876854], 'created_at': '2024-11-28T14:15:45.256478'}}
{'_index': 'movie', '_id': '475', '_score': 0.99048996, '_source': {'movie_id': 475, 'url': 'http://us.imdb.com/Title?Trainspotting+(1996)', 'name': 'Trainspotting (1996)', 'vector': [-0.01807694137096405, 0.1501435935497284, -0.069170281291008, -0.0681825578212738, 0.0