# Ingestando embeddings a Pinecone

In [1]:
%%capture
!pip install pinecone
!pip install tiktoken

In [2]:
# import pinecone
from pinecone import Pinecone, ServerlessSpec
import os, getpass
import tiktoken

In [3]:
api_key = getpass.getpass("Ingresa tu API Key de Pinecone : ")

## Creando un index en Pinecone

In [4]:
index_name = "knowledge-base-eliminatorias"
dimension = 1536

# Conectarse con la única región permitida
pc = Pinecone(api_key=api_key)

if index_name not in [idx.name for idx in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",          # ✅ Solo AWS
            region="us-east-1"    # ✅ Solo us-east-1
        )
    )
    print(f"✅ Índice '{index_name}' creado correctamente.")
else:
    print(f"ℹ️ El índice '{index_name}' ya existe.")


✅ Índice 'knowledge-base-eliminatorias' creado correctamente.


## Generando fragmentos de sitios web

![Imagen](https://imgmedia.larepublica.pe/640x371/larepublica/original/2023/10/17/652f24917388f967a923a805.webp)

In [8]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.12.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp<4.0.0,>=3.8.3->langchain-community)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp<4.0.0,>=3.8.3->langchain-community)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting fro

In [9]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter

loader = WebBaseLoader(
    [
    "https://www.marca.com/co/2023/10/17/652e070f22601d73648b4585.html", 
    "https://hiraoka.com.pe/blog/post/eliminatorias-sudamericanas-mundial-2026-calendario-partidos-y-fechas"
    ]
)
data = loader.load()

#Genera varios fragmentos de 400 tokens
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 500, 
    chunk_overlap = 20
)

docs = text_splitter.split_documents(data)

USER_AGENT environment variable not set, consider setting it to identify your requests.
Created a chunk of size 1077, which is longer than the specified 500
Created a chunk of size 1371, which is longer than the specified 500
Created a chunk of size 3405, which is longer than the specified 500


In [10]:
len(docs), docs

(28,
 [Document(metadata={'source': 'https://www.marca.com/co/2023/10/17/652e070f22601d73648b4585.html', 'title': 'Tabla de posiciones Eliminatorias Mundial 2026: Clasificación y resultados de la sexta fecha en Sudamérica | Marca', 'description': 'Tras unos cuantos meses de parón de selecciones, y con el 21 de abril como última fecha de las Eliminatorias rumbo al Mundial 2026, la selección nacional regresa a los terrenos de ', 'language': 'es-co'}, page_content='Tabla de posiciones Eliminatorias Mundial 2026: Clasificación y resultados de la sexta fecha en Sudamérica | Marca \n\n\n \n\n \n\n\n \n \n\n\n  \n\n\n \n\nEs noticia: Próximo partido Real MadridClasificación Grupo HNorma ocho segundosReal MadridPaíses Bajos - EspańaAsencioJoan GarcíaLamine YamalLakersAl Ain - Juventus dónde verReal Madrid dónde verAlcaraz Munar donde verJuan LebrónBodegaPartidos hoy Mundial de ClubesPC FútbolFerrariPrecio luz hoyClasificación LigaMundial de ClubesFútbol hoy \n\nPortada de Marca\n\n\nEdición se

In [11]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Ingrese la API Key de OpenAI : ")

## Cargando datos a Pinecone

In [13]:
!pip install openai

Collecting openai
  Downloading openai-1.88.0-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting tqdm>4 (from openai)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading openai-1.88.0-py3-none-any.whl (734 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m734.3/734.3 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (352 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, jiter, distro, openai
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [openai]2m3/4[0m [openai]
[1A[2KSuccessfully installed distro-1.9.0 jiter-0.

In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone as PineconeClient
from uuid import uuid4

# Inicializa Pinecone v3
api_key = getpass.getpass("Ingresa tu API Key de Pinecone : ")
index_name = "knowledge-base-eliminatorias"
pc = PineconeClient(api_key=api_key)
index = pc.Index(index_name)

# Embeddings
embedder = OpenAIEmbeddings()
texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs]

# Embed y subir
vectors = []
for i, text in enumerate(texts):
    embedding = embedder.embed_query(text)
    vectors.append({
        "id": str(uuid4()),
        "values": embedding,
        "metadata": metadatas[i] if i < len(metadatas) else {}
    })

index.upsert(vectors=vectors)
print("✅ Documentos insertados en Pinecone.")


✅ Documentos insertados en Pinecone.
