In [1]:
#imports
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)


import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
from datetime import datetime

### I. Preprocesamiento

In [2]:
items_df = pd.read_csv('../data/items_202505280739.csv')
n_nan = items_df['description'].isna().sum()
n_iguales = (items_df['description'] == items_df['title']).sum()

#Eliminamos los items que no tienen descripción o que tienen el mismo título que la descripción
items_df = items_df[
    (~items_df['description'].isna()) & (items_df['description'] != items_df['title'])
]



#items_df['description'] = items_df['description'].fillna("")

In [6]:
items_df['description'].iloc[-1]

'Remera confeccionada en 100% algodón, posee cuello redondo y estampado frontal "U.S.A 1990". Su diseño simple pero con personalidad está inspirado en la estética noventera, ideal para quienes aman lo clásico con onda. Confeccionada en algodón suave y liviano, tiene un calce relajado perfecto para todos los días.\r\n\r\n✔ Estampa frontal estilo retro\r\n✔ Cuello redondo clásico\r\n✔ Tela liviana y cómoda\r\n✔ Corte unisex y relajado\r\n✔ Disponible en varios colores básicos\r\n\r\n<b> Cuidados </b>\r\n\r\n-Lavado con agua fría.\r\n-Lavar del revés.\r\n-Usar secado natural.'

### II. Extracción de features

In [3]:

# creating the tf-idf Vectorizer to analyze, at word level, unigrams and bigrams
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1)) 
 
# applying the vectorizer on the 'tags' column
tfidf_matrix = tf.fit_transform(items_df['description'])
 
# compute the cosine similarity between all the samples in the matrix
cosine_sim = cosine_similarity(tfidf_matrix) 
 
# saving the values in a DataFrame for better visualisation
cosine_sim_df = pd.DataFrame(cosine_sim, index=items_df['title'], columns=items_df['title']) 


### III. Reducción de dimensionalidad

In [4]:
svd = TruncatedSVD(n_components=839, random_state=42)
#svd.fit(tfidf_matrix)
# Varianza explicada acumulada
# var_acumulada = np.cumsum(svd.explained_variance_ratio_)

# # ¿Cuántos componentes necesito para retener el 90% de la varianza?
# n_optimo = np.argmax(var_acumulada >= 0.90) + 1
# print(f"Para retener el 90% de la varianza, necesito {n_optimo} componentes.")

reduced_matrix = svd.fit_transform(tfidf_matrix)


In [5]:
#Guardamos el vector reducido en el dataframe
items_df = items_df.reset_index(drop=True)
items_df['text_vector'] = list(reduced_matrix)

In [19]:
items_df.head()

Unnamed: 0,id,title,prototype_id,description,brief_description,height,width,depth,meta_title,meta_description,...,cuc_image_url,cuc_image_two,cuc_image_url_two,related_category_id,for_search,codigoProducto,fitType,last_unit_quantity,prop_desc,text_vector
0,8,VESTIDO GREGORIO,1,Vestido estampado con largo modular maxi. Tien...,VESTIDO GREGORIO,1.0,1.0,1.0,VESTIDO GREGORIO,VESTIDO GREGORIO,...,,False,,,True,sweet_3,,,,"[0.3082522658753316, -0.16165018154074312, -0...."
1,10,Legging Poly Blue,1,"Jean de tonalidad azul oscuro, chupin, tiro al...","Jean de tonalidad azul oscuro, chupin, tiro al...",1.0,1.0,1.0,LEGGING POLY BLUE,"Jean de tonalidad azul oscuro, chupin, tiro al...",...,,False,,,True,sweet_2,,,,"[0.27558201146934536, -0.038430255864428005, 0..."
2,11,LOLI HIGH GINETTE,1,"Jean de tonalidad oscura, de corte recto en la...",DENIM,1.0,1.0,1.0,LOLI HIGH GINETTE,"Jean de tonalidad oscura, de corte recto en la...",...,,False,,,True,sweet_2,,,,"[0.4157201322171712, -0.11173354366388447, 0.3..."
3,12,CLEAN HIGH SPIN COLORS,1,"Jean de color, skinny al cuerpo, tiro alto a l...",VINTAGE,1.0,1.0,1.0,CLEAN HIGH SPIN COLORS,"Jean de color, skinny al cuerpo, tiro alto a l...",...,,False,,,True,,,,,"[0.38736524751644047, -0.07456073404358475, 0...."
4,13,CLEAN HIGH JOLY,1,"Jean de tonalidad oscura, skinny al cuerpo, ti...",DENIM,1.0,1.0,1.0,CLEAN HIGH JOLY,"Jean de tonalidad oscura, skinny al cuerpo, ti...",...,,False,,,True,sweet_2,,,,"[0.35371749368947625, -0.03752419171415911, 0...."


### IV. Persistencia en BBDD vectorial

In [None]:

from src.services.database.vector_db_service import VectorDbService
from src.services.database.models import Item

vdbs = VectorDbService()
client = vdbs.client
client.cluster.health()

{'cluster_name': 'docker-cluster',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'discovered_master': True,
 'discovered_cluster_manager': True,
 'active_primary_shards': 6,
 'active_shards': 6,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 1,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 85.71428571428571}

In [23]:
try:
    #client.indices.delete('items')
    Item.init(using=client)
    print("Índice inicializado correctamente.")
except Exception as e:
    print(f"Error al inicializar el índice: {e}")

Índice inicializado correctamente.


In [24]:
for i, row in items_df.iterrows():
    try:

        it = Item(
            item_id = row.id,
            #url = row['IMDB URL'],
            title = row['title'],
            description = row['description'],
            text_vector = list(items_df['text_vector'][i]),
            created_at = datetime.now()
        )
        it.save(using=client)
        #Movie.search(using=client).count()
        #mv.save(using=client)
    except Exception as e:
        print(f"Error al guardar la película {row['title']}: {e}")

Error al guardar la película SWEATER OFIDA: RequestError(400, 'mapper_parsing_exception', "failed to parse field [text_vector] of type [knn_vector] in document with id '586'. Preview of field's value: 'null'")


In [None]:
# def get_recommendations(movie_id, similarity_df, movies_df, k=10):
 
# # partitions the matrix such that the indices are in the position they would be in a sorted array
# ix = similarity_df.loc[:,movie_id].to_numpy().argpartition(range(-1,-k,-1))
 
# # gets the corresponding movie titles for k+1 sorted indices    
# closest = similarity_df.columns[ix[-1:-(k+2):-1]] 
 
# # removes the queried title from the results
# closest = closest.drop(movie_id, errors='ignore')    
 
# # returns top k most similar movies
# return pd.DataFrame(closest).merge(movies_df).head(k) 