In [1]:
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
%pip install -qU "langchain[cohere]" langchain-postgres
%pip install -qU boto3 pypdf
%pip install -qU langchain-cohere

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import getpass
import hashlib
import os
import tempfile

import boto3
from langchain import hub
from langchain.chat_models import init_chat_model
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

# Configuración

In [2]:
# Turn on LangSmith tracing for the LangChain / LangGraph calls made below.
os.environ["LANGSMITH_TRACING"] = "true"

# Prompt only for secrets that are not already present in the environment, so
# re-running this cell (or running with the variables exported beforehand)
# does not ask for them again. getpass keeps the typed values out of the
# notebook's saved output.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API Key: ")
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass.getpass("Enter API key for Cohere: ")

# The connection string is cached under POSTGRES_URI in the environment for the
# same reason; the prompt text hints at the expected driver prefix.
if not os.environ.get("POSTGRES_URI"):
    os.environ["POSTGRES_URI"] = getpass.getpass("PostgreSQL URI (postgresql+psycopg://...): ")
POSTGRES_URI = os.environ["POSTGRES_URI"]

LangSmith API Key:  ········
Enter API key for Cohere:  ········
PostgreSQL URI (postgresql+psycopg://...):  ········


In [3]:
# Embedding model handed to the vector store created in the next cell.
embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")

# Chat model that writes the final answer, built through LangChain's
# provider-agnostic factory.
llm = init_chat_model(model="command-r-plus", model_provider="cohere")

In [4]:
# Name of the collection under which this notebook's chunks are stored.
COLLECTION_NAME = "docs_cohere"

# Vector store reached through POSTGRES_URI; texts are embedded with the
# Cohere embeddings object defined earlier in the notebook.
vector_store = PGVector(
    connection=POSTGRES_URI,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
)

# Carga de datos

In [5]:
bucket_name = "docsragusa"
prefix = ""  # empty prefix: scan the whole bucket

s3 = boto3.client("s3")

# A single list_objects_v2 call returns a limited page of keys, so iterate over
# every page instead of silently ignoring anything after the first one.
paginator = s3.get_paginator("list_objects_v2")

all_docs = []

# Downloads go into a throw-away directory that is removed when the `with`
# block exits, so repeated runs do not leave orphaned PDF files on disk.
with tempfile.TemporaryDirectory() as tmp_dir:
    local_pdf = os.path.join(tmp_dir, "download.pdf")
    for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in listing_page.get("Contents", []):
            key = obj["Key"]
            # Case-insensitive match so keys ending in ".PDF" are loaded too.
            if not key.lower().endswith(".pdf"):
                continue

            # Close the file before parsing so the loader sees fully written bytes.
            with open(local_pdf, "wb") as pdf_file:
                s3.download_fileobj(bucket_name, key, pdf_file)

            docs = PyPDFLoader(local_pdf).load()
            for doc in docs:
                # Record the real origin instead of the temporary path, which
                # no longer exists once this cell finishes.
                doc.metadata["source"] = f"s3://{bucket_name}/{key}"
            all_docs.extend(docs)

# NOTE(review): this counts the Document objects returned by the loader, which
# presumably means one per PDF page rather than one per file - confirm.
print(f"Cargados {len(all_docs)} documentos PDF desde S3.")

Cargados 27 documentos PDF desde S3.


In [6]:
# Chunking parameters: target size of each chunk and how much neighbouring
# chunks overlap.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
# Break every loaded document into chunks ready for embedding.
splits = splitter.split_documents(all_docs)

In [7]:
# Derive a deterministic id from each chunk's text. Without explicit ids every
# run stores the same chunks again under fresh random ids, so re-running this
# cell keeps growing the persistent collection with duplicates. With stable ids
# a re-run overwrites the existing rows instead.
# Keying the dict by id also collapses chunks with identical text, which would
# otherwise collide with each other inside a single insert batch.
chunks_by_id = {
    hashlib.sha256(chunk.page_content.encode("utf-8")).hexdigest(): chunk
    for chunk in splits
}

# Skip the round trip entirely when there is nothing to store.
if chunks_by_id:
    _ = vector_store.add_documents(
        list(chunks_by_id.values()),
        ids=list(chunks_by_id.keys()),
    )

# Recuperación de información

In [8]:
# Prompt template fetched from the LangChain Hub; generate() fills it in with
# the question and the retrieved context.
RAG_PROMPT_ID = "rlm/rag-prompt"
prompt = hub.pull(RAG_PROMPT_ID)

class State(TypedDict):
    """State shared by the nodes of the retrieval/generation graph."""
    # The user's question; read by both retrieve() and generate().
    question: str
    # Documents written by retrieve() and consumed by generate().
    context: List[Document]
    # Text of the model's reply, written by generate().
    answer: str

def retrieve(state: State):
    """Fetch the stored chunks most similar to the question.

    Reads ``state["question"]`` and returns a partial state update that
    carries only the ``context`` key.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question)}

def generate(state: State):
    """Answer the question from the retrieved context, asking for Spanish.

    Reads ``state["question"]`` and ``state["context"]``, renders the hub
    prompt, calls the chat model and returns a partial state update that
    carries only the ``answer`` key.
    """
    # Retrieved chunks are concatenated with a blank line between them.
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])

    # The rlm/rag-prompt template only substitutes `question` and `context`, so
    # a separate "instruction" key is never rendered. Attach the language
    # instruction to the question text so that it actually reaches the model.
    instruction = "Por favor responde en español de forma clara y precisa."
    messages = prompt.invoke(
        {
            "question": f"{state['question']}\n\n{instruction}",
            "context": docs_content,
        }
    )
    response = llm.invoke(messages)
    return {"answer": response.content}

# Wire the two nodes in order (retrieve, then generate), enter the graph at
# "retrieve", and compile it into a runnable.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [9]:
# Words that end the chat session (compared case-insensitively).
EXIT_COMMANDS = ("salir", "exit", "quit")

while True:
    # Trim surrounding whitespace so inputs such as " salir " still end the session.
    user_question = input("💗 Por favor escribe tu pregunta (o 'salir' para terminar): ").strip()
    if user_question.lower() in EXIT_COMMANDS:
        print("👋 ¡Gracias por usar el chat! Hasta luego.")
        break
    if not user_question:
        # Nothing was typed: ask again instead of spending a retrieval and a
        # model call on an empty question.
        continue
    response = graph.invoke({"question": user_question})
    print("\nBOT🤖: ")
    print(response["answer"])
    print("\n" + "-"*40 + "\n")


💗 Por favor escribe tu pregunta (o 'salir' para terminar):  ¿quien escribio el cuadernillo?



BOT🤖: 
Pilar Gaspar, Silvia Gonzalez, Mariela Helman y Ravia Zuberman escribieron el cuadernillo.

----------------------------------------



💗 Por favor escribe tu pregunta (o 'salir' para terminar):  salir


👋 ¡Gracias por usar el chat! Hasta luego.
