In [12]:
import numpy as np
import pandas as pd
from datetime import datetime

In [13]:
# Folder that holds the serialized embedding matrices and the id -> index tables.
VECTOR_DB_DIR = '../data/vector_db'

# Embedding matrices; later cells index them by movieIdx / userIdx.
movie_embeddings_matrix = np.load(f'{VECTOR_DB_DIR}/movie_embeddings_matrix_1.npy')
user_embeddings_matrix = np.load(f'{VECTOR_DB_DIR}/user_embeddings_matrix_1.npy')

# The lookup tables were saved as pickled Python objects, hence allow_pickle=True;
# .item() unwraps the single stored object from the loaded array.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
movie2Idx = np.load(f'{VECTOR_DB_DIR}/movie2Idx_1.npy', allow_pickle=True).item()
user2Idx = np.load(f'{VECTOR_DB_DIR}/user2Idx_1.npy', allow_pickle=True).item()

In [18]:
# Source tables: ratings given by users to movies, plus the movie and user catalogues.
ratings = pd.read_csv('../data/scores.csv')
df_movies = pd.read_csv('../data/peliculas.csv')
df_users = pd.read_csv('../data/usuarios.csv')

# Movies without an IMDB link get an empty string instead of NaN.
df_movies['IMDB URL'] = df_movies['IMDB URL'].fillna('')

# Spot check: the first highest-rated entry of user 196
# (idxmax returns the label of the first maximum).
top_rating_label = ratings.loc[ratings['user_id'] == 196, 'rating'].idxmax()
user_movies = ratings.loc[top_rating_label]
user_movies

id                         1896
user_id                     196
movie_id                    655
rating                        5
Date        1997-12-04 16:09:53
Name: 1896, dtype: object

In [19]:
# Attach to every user / movie the row index of its embedding, using the
# id -> index lookup tables. An id missing from a table raises KeyError.
df_users['userIdx'] = [user2Idx[user_id] for user_id in df_users['id']]
df_movies['movieIdx'] = [movie2Idx[movie_id] for movie_id in df_movies['id']]

In [21]:
import sys
import os

# The project root is taken to be the parent of the kernel's working directory
# (the original abspath('recommendation_system.ipynb') trick resolved against
# the working directory as well, so the result is the same).
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))

# Guarded so that re-running this cell does not keep appending duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local imports; these only resolve once project_root is on sys.path.
from src.services.database.models import MovieV, UserV
from src.services.database.vector_db_service import VectorDbService

vdbs = VectorDbService()
client = vdbs.client

# Displayed as the cell output: quick connectivity / cluster status check.
client.cluster.health()

{'cluster_name': 'docker-cluster',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'discovered_master': True,
 'discovered_cluster_manager': True,
 'active_primary_shards': 8,
 'active_shards': 8,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 4,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 66.66666666666666}

In [23]:
# Initialize the MovieV index through the shared client.
# To rebuild it from scratch, drop the existing index first
# (e.g. client.indices.delete('movie')) and re-run this cell.
try:
    MovieV.init(using=client)
except Exception as init_error:
    print(f"Error al inicializar el índice: {init_error}")
else:
    print("Índice inicializado correctamente.")

Índice inicializado correctamente.


In [25]:
# Initialize the UserV index through the shared client.
# To rebuild it from scratch, drop the existing index first
# (e.g. client.indices.delete('user')) and re-run this cell.
try:
    UserV.init(using=client)
except Exception as init_error:
    print(f"Error al inicializar el índice: {init_error}")
else:
    print("Índice inicializado correctamente.")

Índice inicializado correctamente.


In [26]:
# Store every movie together with its embedding vector.
# Failures are reported per row so one bad record does not abort the whole load.
for i, row in df_movies.iterrows():
    try:
        mv = MovieV(
            movie_id=row['id'],
            url=row['IMDB URL'],
            name=row['Name'],
            # .tolist() yields built-in Python floats; list(ndarray) would yield
            # numpy scalars, which plain JSON encoders cannot serialize.
            vector=movie_embeddings_matrix[row['movieIdx']].tolist(),
            created_at=datetime.now(),
        )
        mv.save(using=client)
    except Exception as e:
        print(f"Error al guardar la película {row['Name']}: {e}")

In [27]:
# Store every user together with its embedding vector.
# Failures are reported per row so one bad record does not abort the whole load.
for i, row in df_users.iterrows():
    try:
        uv = UserV(
            user_id=row['id'],
            occupation=row['Occupation'],
            created_at=datetime.now(),
            active_since=row['Active Since'],
            # .tolist() yields built-in Python floats; list(ndarray) would yield
            # numpy scalars, which plain JSON encoders cannot serialize.
            vector=user_embeddings_matrix[row['userIdx']].tolist(),
        )
        uv.save(using=client)
    except Exception as e:
        print(f"Error al guardar el usuario {row.id}: {e}")

In [28]:
# Sanity check: count the documents matched by a MovieV search with no query
# clauses, to compare against the number of movies processed by the save loop.
MovieV.search(using=client).count()

1682

In [29]:
# Sanity check: count the documents matched by a UserV search with no query
# clauses, to compare against the number of users processed by the save loop.
UserV.search(using=client).count()

943

In [30]:
movie_idx_to_search = 5

# Only the last expression of a cell is rendered automatically, so the
# catalogue lookup needs an explicit display() to be visible.
display(df_movies[df_movies['movieIdx'] == movie_idx_to_search])

# Embedding vector stored at that row index (shown as the cell output).
movie_embeddings_matrix[movie_idx_to_search]

array([ 0.04046524,  0.08258271, -0.01827831,  0.11976125,  0.01657145],
      dtype=float32)

In [35]:
user_idx_to_search = 5

# Only the last expression of a cell is rendered automatically, so the
# user lookup needs an explicit display() to be visible.
display(df_users[df_users['userIdx'] == user_idx_to_search])

# Embedding vector stored at that row index (shown as the cell output).
user_embeddings_matrix[user_idx_to_search]

array([-0.01620449, -0.02153726,  0.13703047,  0.07455067, -0.20351009],
      dtype=float32)

In [31]:
# Row index (movieIdx) of the movie whose neighbours we want to retrieve.
movie_idx_to_search = 5

print(type(movie_embeddings_matrix[movie_idx_to_search]))

# kNN query against the 'vector' field: ask for k=20 candidates and return the
# best 5 hits. The embedding is converted with .tolist() so the request body
# holds only built-in Python types and does not depend on the client's
# serializer knowing how to encode numpy arrays.
query = {
    "size": 5,
    "query": {
        "knn": {
            "vector": {
                "vector": movie_embeddings_matrix[movie_idx_to_search].tolist(),
                "k": 20
            }
        }
    }
}

response = client.search(index='movie', body=query)

# Each hit carries the stored document under '_source' and its similarity '_score'.
for h in response['hits']['hits']:
    print(h)

<class 'numpy.ndarray'>
{'_index': 'movie', '_id': '346', '_score': 1.0, '_source': {'movie_id': 346, 'url': 'http://us.imdb.com/M/title-exact?imdb-title-119396', 'name': 'Jackie Brown (1997)', 'vector': [0.04046523943543434, 0.08258271217346191, -0.01827831193804741, 0.11976125091314316, 0.016571447253227234], 'created_at': '2024-11-18T15:22:44.046736'}}
{'_index': 'movie', '_id': '387', '_score': 0.989446, '_source': {'movie_id': 387, 'url': 'http://us.imdb.com/M/title-exact?Age%20of%20Innocence,%20The%20(1993)', 'name': 'Age of Innocence, The (1993)', 'vector': [0.012251359410583973, 0.02205602265894413, -0.004896262194961309, 0.024609360843896866, 0.0036130468361079693], 'created_at': '2024-11-18T15:22:44.260518'}}
{'_index': 'movie', '_id': '644', '_score': 0.98052853, '_source': {'movie_id': 644, 'url': 'http://us.imdb.com/M/title-exact?Thin%20Blue%20Line,%20The%20(1988)', 'name': 'Thin Blue Line, The (1988)', 'vector': [0.04187830537557602, 0.08485984057188034, 0.008547393605113