In [1]:
import dotenv

# Load settings from a local .env file into the process environment
# (presumably the OpenAI API key used by the cells below — confirm).
dotenv.load_dotenv()

True

In [48]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

"""
NVIDIA annual report 2024 : https://s201.q4cdn.com/141608511/files/doc_financials/2024/ar/NVIDIA-2024-Annual-Report.pdf
"""

# Load the annual report PDF from disk.
# NOTE: use load() rather than load_and_split(). load_and_split() already
# chunks the text with the loader's default splitter, which would leave the
# explicitly configured splitter below with nothing left to do.
raw_documents = PyPDFLoader("data/NVIDIA-2024-Annual-Report.pdf").load()

# Chunk the loaded documents with the explicitly configured splitter so that
# chunk_size / chunk_overlap actually control the chunking.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5120, chunk_overlap=200)
documents = text_splitter.split_documents(raw_documents)

# Embed every chunk and index the resulting vectors in a FAISS vector store.
# OpenAIEmbeddings() is constructed with its defaults (presumably reading the
# API key from the environment — confirm).
embedding_model = OpenAIEmbeddings()

db = FAISS.from_documents(documents=documents, embedding=embedding_model)

# Expose the vector store as a retriever using its default search settings.
retriever = db.as_retriever()

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Chat model constructed with library defaults.
model = ChatOpenAI()

# Prompt text: the model is told to answer using only the supplied context.
# {context} and {question} are the two placeholders filled in by the chain.
template = (
    "Answer the question based on the following context only:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Return the ``page_content`` of every item in *docs*, joined by blank lines."""
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)


# Inputs for the prompt: "context" is the retriever output passed through
# format_docs; "question" is the chain input passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: build inputs -> fill prompt -> call chat model -> parse to str.
chain = chain_inputs | prompt | model | StrOutputParser()

# Ask a question about the report; as the last expression of the cell, the
# returned answer is displayed as the cell output.
chain.invoke("How much does Nvidia income tax expense has in 2023 and 2024 ?")


'In 2023, Nvidia had an income tax benefit of $187 million, and in 2024, Nvidia had an income tax expense of $4,058 million.'