In [None]:
# 1- Load the Pinecone API key

from dotenv import load_dotenv
import os

# Load the variables declared in the .env file into the process environment.
load_dotenv()

# The key is read from the "BD_KEY" variable; the Pinecone client's standard
# "PINECONE_API_KEY" variable is accepted as a fallback so that setup keeps working.
PINECONE_API_KEY = os.getenv("BD_KEY") or os.getenv("PINECONE_API_KEY")

# Fail here, with an actionable message, instead of later inside the Pinecone
# client when the key turns out to be missing or empty.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Pinecone API key not found: define BD_KEY in the .env file "
        "or in the environment before running this notebook."
    )



In [2]:
# 2- Import the required libraries
from pinecone import Pinecone,ServerlessSpec,Index
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer




  from tqdm.autonotebook import tqdm


In [3]:
# 3- Load the PDF file
# The location can be overridden with the RESUME_PDF_PATH environment variable
# (relies on the `os` import from the first cell); when it is not set, the
# original local path is used so existing runs behave as before.
pdf_location = os.getenv(
    "RESUME_PDF_PATH",
    r'C:\Users\anben\Desktop\resume_Aida Benito.pdf',
)
loader = PyPDFLoader(pdf_location)
documents = loader.load()

# Split the document into small fragments: at most 500 characters each, with
# 50 characters of overlap between consecutive fragments.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_documents = text_splitter.split_documents(documents)

# Show the first fragments to check that the split looks right.
# NOTE(review): this prints raw document text (personal data in a resume);
# clear the cell output before sharing or committing the notebook.
print(f"Primeros 2 fragmentos: {split_documents[:2]}")



Primeros 2 fragmentos: [Document(page_content='AÍDABENITO\nanbenito@yahoo.com.ar\nQuilmes, BuenosAires, Argentina\nCell: (54911)6-497-6858-Skype: anbenito1975@gmail.com\nLinkedIn:https://www.linkedin.com/in/aida-benito/\nINFORMATIONSYSTEMSENGINEER\nExperiencedDataEngineer andBusiness IntelligenceConsultant. Possessesadvancedknowledgeof\ndatabases and various BI tools, as well as knowledge of programming languages, AWS and GCP\nEcosystems. Has strong communication skills. Passionate about Data Science, BigData, Internet of', metadata={'source': 'C:\\Users\\anben\\Desktop\\resume_Aida Benito.pdf', 'page': 0}), Document(page_content='Things, Artificial Intelligence, research and learning. Able to handle several projects simultaneously.\nHaspassedtheFirst CertificateinEnglishexam.\nCorecompetenciesinclude:\nFlexibility-Leadership-Teamwork-Conflict resolution-Analytical thinking-Decisionmaking\nCustomerService-Creativethinking\nPROFESSIONALEXPERIENCE\nMPSGroup-Client: DeltaAirlines\n● Desig

In [5]:
# 4- Initialize Pinecone
from pinecone import ServerlessSpec
import time

pc = Pinecone(api_key=PINECONE_API_KEY)
cloud = 'aws'
region = 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "resume-index"
# Vector size of the index. The embedding model used to fill this index must
# produce vectors of exactly this length.
index_dimension = 384
# Upper bound, in seconds, for each wait on the Pinecone control plane below.
index_wait_timeout = 120

# Always start from an empty index: drop any previous one with the same name.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("index {} borrado".format(index_name))

    # Poll until the name is no longer listed instead of relying on a fixed
    # one-second sleep, so the creation step below is not skipped by mistake.
    deadline = time.monotonic() + index_wait_timeout
    while index_name in pc.list_indexes().names():
        if time.monotonic() > deadline:
            raise TimeoutError(
                "Timed out waiting for index '{}' to be deleted".format(index_name)
            )
        time.sleep(1)

# Create the index if it does not exist.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=index_dimension,
        metric='cosine',
        spec=spec,
    )

# Wait until the index reports itself ready before connecting to it.
deadline = time.monotonic() + index_wait_timeout
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > deadline:
        raise TimeoutError(
            "Timed out waiting for index '{}' to become ready".format(index_name)
        )
    time.sleep(1)

# Connect to the index.
index = pc.Index(index_name)

# Display the index stats as the cell result.
index.describe_index_stats()


index resume-index borrado


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [8]:
# 5- Generate the embeddings and store them in Pinecone
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# One text per fragment produced by the splitting step.
texts = [doc.page_content for doc in split_documents]
if not texts:
    # Nothing to embed (e.g. the PDF produced no text); stop with a clear
    # error rather than sending an empty upsert request.
    raise ValueError("No text fragments to embed: split_documents is empty.")

# Generate one embedding per fragment with the model.
embeddings = embed_model.encode(texts)

# Build the records to upload: the id is the fragment position, and the
# original text is kept as metadata alongside the vector.
upsert_data_generator = [
    {
        'id': str(i),
        # Plain list of floats rather than a NumPy row.
        'values': embedding.tolist(),
        'metadata': {'text': texts[i]},
    }
    for i, embedding in enumerate(embeddings)
]

# Upload in batches so that a large document does not exceed the size limits
# of a single upsert request.
upsert_batch_size = 100
for batch_start in range(0, len(upsert_data_generator), upsert_batch_size):
    batch = upsert_data_generator[batch_start:batch_start + upsert_batch_size]
    index.upsert(vectors=batch)

print(f"Embeddings generados y almacenados en el índice '{index_name}' de Pinecone.")

Embeddings generados y almacenados en el índice 'resume-index' de Pinecone.
