In [13]:
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
%pip install -qU langchain-ollama langchain-postgres
%pip install -qU boto3 pypdf

In [2]:
import boto3
import tempfile
from langchain_community.document_loaders import PyPDFLoader
import os
import getpass
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.outputs import ChatResult, ChatGeneration
import requests
from langchain_core.embeddings import Embeddings
from langchain_postgres import PGVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain import hub
from langgraph.graph import START, StateGraph, END
from typing_extensions import List, TypedDict
from langchain_core.runnables import Runnable
from langchain_core.documents import Document

# Configuración

In [14]:
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API Key: ")
POSTGRES_URI = getpass.getpass("PostgreSQL URI (postgresql+psycopg://...): ")

LangSmith API Key: ··········
PostgreSQL URI (postgresql+psycopg://...): ··········


In [21]:
ACCESS_KEY = getpass.getpass("Digite Access Key: ")
SECRET_KEY = getpass.getpass("Digite Secret Key: ")
SESSION_TOKEN = getpass.getpass("Digite Session Token: ")

Digite Access Key: ··········
Digite Secret Key: ··········
Digite Session Token: ··········


In [22]:
session = boto3.Session(
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    aws_session_token=SESSION_TOKEN
)

In [17]:
class CustomNgrokChat(BaseChatModel):
    def _llm_type(self) -> str:
        return "custom-ngrok-chat"

    def _generate(self, messages, stop=None, **kwargs) -> ChatResult:

        prompt = "\n".join([m.content for m in messages if isinstance(m, HumanMessage)])

        response = requests.post(
            "https://4dae-34-16-212-190.ngrok-free.app/generate",
            json={
                "model": "llama3.2",
                "prompt": prompt,
                "stream": False
            }
        )
        answer = response.json()["response"]
        return ChatResult(generations=[ChatGeneration(message=AIMessage(content=answer))])

llm = CustomNgrokChat()

In [18]:
class RemoteOllamaEmbeddings(Embeddings):
    def __init__(self, endpoint: str, model: str = "nomic-embed-text"):
        self.endpoint = endpoint
        self.model = model

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        results = []
        for text in texts:
            res = requests.post(
                f"{self.endpoint}/api/embeddings",
                json={"model": self.model, "prompt": text}
            )
            res.raise_for_status()
            results.append(res.json()["embedding"])
        return results

    def embed_query(self, text: str) -> list[float]:
        return self.embed_documents([text])[0]

embeddings = RemoteOllamaEmbeddings(endpoint="http://ec2-54-234-198-62.compute-1.amazonaws.com:11434")


In [19]:
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="pdf_docs",
    connection=POSTGRES_URI,
)

# Carga de datos

In [24]:
bucket_name = "docsragusa"
prefix = ""

s3 = session.client("s3")
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

all_docs = []

for obj in response.get("Contents", []):
    key = obj["Key"]
    if key.endswith(".pdf"):
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            s3.download_fileobj(bucket_name, key, tmp)
            tmp.flush()
            loader = PyPDFLoader(tmp.name)
            docs = loader.load()
            all_docs.extend(docs)

print(f"Cargados {len(all_docs)} documentos PDF desde S3.")

Cargados 56 documentos PDF desde S3.


In [25]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = splitter.split_documents(all_docs)

In [26]:
_ = vector_store.add_documents(splits)

# Recuperación de información

In [27]:
prompt = hub.pull("rlm/rag-prompt")

class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

def retrieve(state: State):
    print(state["question"])
    retrieved_docs = vector_store.similarity_search(state["question"])
    print(retrieved_docs)
    return {"context": retrieved_docs}

def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    chat_prompt_value = prompt.invoke({
        "question": state["question"],
        "context": docs_content,
        "instruction": "Answer clearly and concisely."
    })

    # Usa directamente la lista de mensajes para la llamada al LLM
    response = llm.invoke(chat_prompt_value.messages)

    return {"answer": response.content}

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [28]:
def corregir_codificacion(texto):
    try:
        # Intenta re-interpretar la cadena
        return texto.encode('latin1').decode('utf8')
    except:
        return texto

In [29]:
while True:
    user_question = input("Por favor escribe tu pregunta (o 'salir' para terminar): ")
    if user_question.lower() in ("salir", "exit", "quit"):
        print("👋 ¡Gracias por usar el chat! Hasta luego.")
        break
    response = graph.invoke({"question": user_question})
    answer = response["answer"]
    answer_corregida = corregir_codificacion(answer)
    print("\nBOT🤖: ")
    print(answer_corregida)
    print("\n" + "-"*100 + "\n")

Por favor escribe tu pregunta (o 'salir' para terminar): what is the class about?
what is the class about?
[Document(id='7f0d4b5e-d848-4daa-82d8-81b6856b4326', metadata={'page': 1, 'source': '/tmp/tmpfhiaz1n9.pdf', 'creator': 'PyPDF', 'moddate': '2025-05-29T16:35:44+00:00', 'producer': 'iLovePDF', 'page_label': '2', 'total_pages': 4, 'creationdate': ''}, page_content="Student : [Inaudible].  \nInstructor (Andrew Ng) : Endo —  \nStudent : [Inaudible].  \nInstructor (Andrew Ng) : Oh, I see, industry. Okay. Cool. Great, great. So as you can \ntell from a cross-section of this class, I think we're a very diverse audience in this room, \nand that's one of the things that makes this class fun to teach and fun to be in, I think."), Document(id='10895945-41d3-4fe7-9f81-c06ef4c27180', metadata={'page': 1, 'source': '/tmp/tmp7aat61w5.pdf', 'creator': 'PyPDF', 'moddate': '2025-05-29T16:35:44+00:00', 'producer': 'iLovePDF', 'page_label': '2', 'total_pages': 4, 'creationdate': ''}, page_content="St