In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [2]:
from dotenv import load_dotenv
import os

In [3]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Read the key once and fail fast with an actionable message; otherwise a
# missing key only surfaces later as an error raised by the OpenAI client.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# Load the models (embeddings and LLM).
# Both clients receive the same explicitly-read key so they cannot diverge.
embeddings_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
# NOTE(review): max_tokens=50 caps the length of every answer; longer answers
# will be cut off. Confirm this limit is intentional.
llm = ChatOpenAI(model_name="gpt-4o-mini", max_tokens=50, openai_api_key=openai_api_key)

In [4]:
# Load the PDF

pdf_link = "assets/teste.pdf"

loader = PyPDFLoader(pdf_link, extract_images=False)
# Parse the PDF a single time. The previous version also called
# loader.load_and_split() first, but its result was overwritten immediately,
# so the file was parsed twice for nothing. Chunking is handled explicitly by
# the text splitter further down.
pages = loader.load()

print(f"Pages: {len(pages)}")

Pages: 3


In [5]:
# Split the loaded pages into chunks (pieces of the document)

# Splitter configuration kept in one place so it is easy to tune.
splitter_settings = {
    "chunk_size": 500,
    "chunk_overlap": 40,
    "length_function": len,
    "add_start_index": True,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunks = text_splitter.split_documents(pages)
print(f"Chunks: {len(chunks)}")

Chunks: 21


In [6]:
# Path of the directory used for the vector DB
CHROMA_PATH = "generated_vector_db"

In [7]:
# Save the chunks to the vector DB (Chroma).
# Each chunk gets a deterministic id (source path + position in the chunk list)
# so that re-running this cell overwrites the stored entries instead of
# appending a fresh copy of every chunk under a new random id, which would make
# the retriever return duplicated results.
# NOTE: entries written by earlier runs (with random ids) are not removed;
# delete the CHROMA_PATH directory once to start from a clean store.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}:{index}"
    for index, chunk in enumerate(chunks)
]
db = Chroma.from_documents(
    chunks,
    embedding=embeddings_model,
    ids=chunk_ids,
    persist_directory=CHROMA_PATH,
)

In [8]:
# Open the vector DB stored under CHROMA_PATH
vector_db = Chroma(
    embedding_function=embeddings_model,
    persist_directory=CHROMA_PATH,
)

# Build the retriever; k=3 asks for three results per query
retriever_search_options = {"k": 3}
retriever = vector_db.as_retriever(search_kwargs=retriever_search_options)

In [9]:
# Define the prompt template: a system message carrying the retrieved context
# and a user message carrying the question.
system_message = (
    "system",
    "You are a helpful assistant that answers questions about documents, based on the context: \n\n{context}",
)
user_message = ("user", "Please answer the question: {question}")

prompt_template = ChatPromptTemplate.from_messages([system_message, user_message])

In [10]:
# Create the question-answering chain from the LLM and the prompt template
chain = create_stuff_documents_chain(llm=llm, prompt=prompt_template)

In [11]:
# Helper that answers a user question from the most relevant chunks
def ask(question):
    """Answer `question` using documents fetched by the retriever.

    The question is sent to `retriever.invoke` to obtain the context, and
    that context is passed together with the question to `chain.invoke`.

    Args:
        question: The user's question about the document.

    Returns:
        Whatever `chain.invoke` returns for the given context and question.
    """
    return chain.invoke(
        {
            "context": retriever.invoke(question),
            "question": question,
        }
    )

In [12]:
# Answer the user: prompt for a question, run it through ask(), and print the reply
user_question = input("Digite sua pergunta relacionada ao documento: ")
answer = ask(user_question)
print("\nResposta: ", answer)

Digite sua pergunta relacionada ao documento:  Qual é o titulo do documento?



Resposta:  O título do documento é "Proposições legislativas".
