In [1]:
import os
import json
import PyPDF2
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_aws import BedrockLLM
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate

In [2]:
# Parámetros Qdrant
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333))
COLLECTION_NAME = "documentos"

# Parámetros Amazon Bedrock
BEDROCK_PROFILE = os.getenv("BEDROCK_PROFILE", "bedrock-admin")
BEDROCK_MODEL_ID = os.getenv("BEDROCK_MODEL_ID", "amazon.titan-text-express-v1")
# Ajustes de generación
LLM_MAX_TOKENS = 256
LLM_STOP_SEQUENCES = ["\n\n"]  # corta generación tras doble salto

In [3]:
# Lee todo el texto de un PDF
def read_pdf(file_path: str) -> str:
    text = ""
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text() or ""
            text += page_text + "\n"
    return text

# Divide texto en chunks con solapamiento
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> list[str]:
    chunks, start = [], 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

In [4]:
# 3.1 Iniciar cliente Qdrant y el embebedor
qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 3.2 Leer y generar chunks
pdf_path = "documento.pdf"
print("Leyendo PDF…")
document_text = read_pdf(pdf_path)
print("Chunking…")
chunks = chunk_text(document_text, chunk_size=1000, overlap=100)
print(f"{len(chunks)} chunks generados.")

# 3.3 Detectar dimensión del embedding
try:
    dimension = embedder.client.get_sentence_embedding_dimension()
except AttributeError:
    dimension = len(embedder.embed_query("prueba"))  # fallback

# 3.4 Crear colección Qdrant si es necesario
existing = [col.name for col in qdrant_client.get_collections().collections]
if COLLECTION_NAME not in existing:
    print("Creando colección Qdrant…")
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=dimension, distance="Cosine")
    )

# 3.5 Calcular embeddings y subirlos
print("Calculando embeddings…")
vectors = embedder.embed_documents(chunks)
points = [PointStruct(id=i, vector=v, payload={"text": c})
          for i, (c, v) in enumerate(zip(chunks, vectors))]
print("Subiendo a Qdrant…")
qdrant_client.upsert(collection_name=COLLECTION_NAME, points=points)
print("Embeddings listos.")

Leyendo PDF…
Chunking…
478 chunks generados.
Calculando embeddings…
Subiendo a Qdrant…
Embeddings listos.


In [5]:
# 4.1 Instanciar vectorstore y MMR retriever
tlr_vectorstore = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embedder
)
retriever = tlr_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"fetch_k": 10, "k": 5, "lambda_mult": 0.5}
)

# 4.2 LLM de Bedrock con controles de generación
tlr_llm = BedrockLLM(
    credentials_profile_name=BEDROCK_PROFILE,
    model_id=BEDROCK_MODEL_ID,
    max_tokens=LLM_MAX_TOKENS,
    stop_sequences=LLM_STOP_SEQUENCES
)

# 4.3 Prompt anti-repetición y cadena refine
prompt = ChatPromptTemplate.from_template(f"Información relevante:{context} Pregunta: {question} Por favor, responde de forma concisa en un único párrafo, sin repetir ideas ni frases."
)
tlr_qa_agent = RetrievalQA.from_chain_type(
    llm=tlr_llm,
    chain_type="refine",
    retriever=retriever,
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt}
)

ValidationError: 1 validation error for BedrockLLM
stop_sequences
  Extra inputs are not permitted [type=extra_forbidden, input_value=['\n\n'], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/extra_forbidden

In [None]:
# Procesa un JSON de preguntas y escribe otro con respuestas
def process_questions(input_json: str, output_json: str) -> None:
    with open(input_json, 'r', encoding='utf-8') as f:
        qa_list = json.load(f)

    results = []
    for item in qa_list:
        q = item.get("Q") or item.get("question")
        expected = item.get("A") or item.get("expected_answer")
        print(f"-> Pregunta: {q}")
        out = tlr_qa_agent.invoke({"query": q})
        gen = out.get("result", "").strip()
        results.append({"question": q, "expected_answer": expected, "generated_answer": gen})

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Resultados guardados en {output_json}")

# Ejecutar para ambos conjuntos si se invoca el notebook como script
if __name__ == "__main__":
    process_questions("Expert-questions.json", "Expert-questions-output.json")
    process_questions("Not-expert-questions.json", "Not-expert-questions-output.json")