In [None]:
# !pip install -qU langchain_experimental langchain_openai langchain_community langchain ragas faiss-cpu tiktoken


In [None]:
import os
import getpass

# Prompt for the OpenAI API key with getpass and store the entered value in the
# OPENAI_API_KEY environment variable of the current process.
os.environ.update(OPENAI_API_KEY=getpass.getpass("Enter your OpenAI API Key:"))


In [None]:
wget https://gutenberg.org/cache/epub/14586/pg14586.txt -O the_brain.txt


In [None]:
# Read the whole downloaded book into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale (assumes the Gutenberg file is UTF-8 — confirm if
# the source URL changes).
with open("./the_brain.txt", encoding="utf-8") as f:
    the_brain = f.read()


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Baseline ("naive") chunking: a recursive character splitter configured with
# a chunk size of 600 (measured with len, i.e. in characters) and no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=0,
    chunk_size=600,
)
naive_chunks = text_splitter.split_text(the_brain)

# Show a sample of the resulting chunks (indices 40 through 54), each one
# followed by a blank line.
for chunk in naive_chunks[slice(40, 55)]:
    print(f"{chunk}\n")


In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Semantic splitter configured with OpenAI's "text-embedding-3-large"
# embedding model and the "percentile" breakpoint threshold type.
semantic_chunker = SemanticChunker(
    embeddings=OpenAIEmbeddings(model="text-embedding-3-large"),
    breakpoint_threshold_type="percentile",
)


In [None]:
# Split the full book into documents with the semantic chunker.
semantic_chunks = semantic_chunker.create_documents([the_brain])

# Inspect every chunk that contains the reference sentence: print its text
# followed by its length in characters.
for semantic_chunk in semantic_chunks:
    chunk_text = semantic_chunk.page_content
    if "MDT is associated with the basic" not in chunk_text:
        continue
    print(chunk_text)
    print(len(chunk_text))


In [None]:
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the semantic chunks, embedded with OpenAI's
# "text-embedding-3-large" model.
semantic_chunk_vectorstore = FAISS.from_documents(
    documents=semantic_chunks,
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
)

# The retriever is limited to k=1 to demonstrate the power of the semantic
# chunking strategy while keeping the token count similar between the
# semantically retrieved context and the naively retrieved context.
semantic_chunk_retriever = semantic_chunk_vectorstore.as_retriever(
    search_kwargs=dict(k=1)
)

# Query the retriever directly; as the last expression of the cell, the
# result is displayed as the cell output.
semantic_chunk_retriever.invoke("What is MDT?")


In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Pull the RAG prompt from LangChain Hub.
# NOTE(review): the id was "lm/rag-prompt", which looks like a typo for the
# public "rlm/rag-prompt" prompt (inputs: "context" and "question") — confirm.
prompt = hub.pull("rlm/rag-prompt")

# Generation: ChatOpenAI with its default settings keeps the example simple.
llm = ChatOpenAI()

# LCEL RAG chain.
# The input mapping must be piped into the prompt, the model and the output
# parser as ONE expression assigned to `semantic_rag_chain`. Previously the
# name was bound to the plain dict (which has no `.invoke`) and the
# `prompt | llm | StrOutputParser()` pipeline was built and then discarded.
#   - "context":  the retriever's result for the incoming question
#   - "question": the incoming question, passed through unchanged
semantic_rag_chain = (
    {"context": semantic_chunk_retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [None]:
# Run the RAG chain on the same question that was sent directly to the
# retriever earlier; the result is displayed as the cell output.
semantic_rag_chain.invoke("What is MDT?")
