In [25]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI credentials used by the
# embedding and chat clients further down — confirm.
load_dotenv()

True

# Get the data

In [2]:

# One-time download of the PDF that the loading cell reads as "nvidia.pdf".
# Left commented out; uncomment on a machine where nvidia.pdf is not present yet.
# !wget https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf -O nvidia.pdf

# Load in the data and chunk it

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Parse the PDF into LangChain documents.
loader = PyMuPDFLoader("nvidia.pdf")
documents = loader.load()

# Re-split the loaded documents into chunks (chunk_size=1000) that overlap by
# 200, so text near a chunk boundary also appears in the neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(documents)

In [4]:
# Sanity check: number of chunks produced by the splitter.
len(documents)

494

# Set up the embeddings, then create the vector store and the retriever

In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model used to vectorise every chunk.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed all chunks and index them in an in-memory FAISS store.
vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)

# Retriever with the store's default search settings.
retriever = vector_store.as_retriever()

# Retrieved documents for question 1

In [18]:
# First probe question: inspect what the plain vector-store retriever returns
# for it before any LLM is involved.
question_1 = "Who is the E-VP, Operations - and how old are they?"
retrieved_documents = retriever.invoke(question_1)
# Bare expression so the notebook displays the retrieved documents.
retrieved_documents

[Document(page_content="Executive Vice President and Chief Financial Officer\nAjay K. Puri\n69\nExecutive Vice President, Worldwide Field Operations\nDebora Shoquist\n69\nExecutive Vice President, Operations\nTimothy S. Teter\n57\nExecutive Vice President and General Counsel\nJen-Hsun Huang co-founded NVIDIA in 1993 and has served as our President, Chief Executive Officer, and a member of the Board of Directors since our\ninception. From 1985 to 1993, Mr. Huang was employed at LSI Logic Corporation, a computer chip manufacturer, where he held a variety of positions including\nas Director of Coreware, the business unit responsible for LSI's SOC. From 1983 to 1985, Mr. Huang was a microprocessor designer for AMD, a semiconductor\ncompany. Mr. Huang holds a B.S.E.E. degree from Oregon State University and an M.S.E.E. degree from Stanford University.\nColette M. Kress joined NVIDIA in 2013 as Executive Vice President and Chief Financial Officer. Prior to NVIDIA, Ms. Kress most recently ser

# Retrieved documents for question 2

In [19]:
# Second probe question: a lookup of a single figure for a specific date.
question_2 = "What is the gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023?"
# Note: this reuses the name `retrieved_documents`, replacing the question-1 results.
retrieved_documents = retriever.invoke(question_2)
# Bare expression so the notebook displays the retrieved documents.
retrieved_documents

[Document(page_content='Table of Contents\nNVIDIA Corporation and Subsidiaries\nNotes to the Consolidated Financial Statements\n(Continued)\nNote 7 - Amortizable Intangible Assets\nThe components of our amortizable intangible assets are as follows:\n \nJan 28, 2024\nJan 29, 2023\n \nGross\nCarrying\nAmount\nAccumulated\nAmortization\nNet \nCarrying\nAmount\nGross\nCarrying\nAmount\nAccumulated\nAmortization\nNet \nCarrying\nAmount\n \n(In millions)\nAcquisition-related intangible\nassets (1)\n$\n2,642 \n$\n(1,720)\n$\n922 \n$\n3,093 \n$\n(1,614)\n$\n1,479 \nPatents and licensed technology\n449 \n(259)\n190 \n446 \n(249)\n197 \nTotal intangible assets\n$\n3,091 \n$\n(1,979)\n$\n1,112 \n$\n3,539 \n$\n(1,863)\n$\n1,676 \n(1)    During the first quarter of fiscal year 2023, we commenced amortization of a $630 million in-process research and development intangible asset related to our acquisition of\nMellanox.\nAmortization expense associated with intangible assets for fiscal years 2024, 20

# Use the GPT-3.5 Turbo model from OpenAI

In [11]:
from langchain_openai import ChatOpenAI

# Chat model shared by the multi-query retriever and the answer-generation chain.
# temperature=0 asks for the least random sampling the API offers.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

# Create retrieval chain

In [20]:
from langchain import hub
from langchain.retrievers import MultiQueryRetriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Prompt template for retrieval QA, pulled from the LangChain Hub.
retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Wrap the base retriever in a MultiQueryRetriever driven by the same LLM.
advanced_retriever = MultiQueryRetriever.from_llm(llm=llm, retriever=retriever)

# Chain that feeds the retrieved documents plus the question to the LLM.
document_chain = create_stuff_documents_chain(llm=llm, prompt=retrieval_qa_prompt)

# End-to-end chain: retrieve with the multi-query retriever, then answer.
retrieval_chain = create_retrieval_chain(
    retriever=advanced_retriever,
    combine_docs_chain=document_chain,
)


In [21]:
# Run the full retrieval chain on question 1; the chain takes the question
# under the "input" key and exposes the generated text under "answer".
response = retrieval_chain.invoke({"input": question_1})
print(response["answer"])

The Executive Vice President of Operations is Debora Shoquist, and she is 69 years old.


In [35]:
# Run the full retrieval chain on question 2 (overwrites the previous `response`).
response = retrieval_chain.invoke({"input": question_2})
print(response["answer"])

The gross carrying amount of Total Amortizable Intangible Assets for Jan 29, 2023, is $3,539 million.


# Evaluate the RAG pipeline with Ragas

In [28]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Ragas test-set generator configured with its OpenAI defaults.
generator = TestsetGenerator.with_openai()

# Synthesise 10 evaluation questions from the chunked documents, mixing
# question types: 25% simple, 25% reasoning, 50% multi-context.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions={simple: 0.25, reasoning: 0.25, multi_context: 0.5},
    # NOTE(review): presumably lets generation continue when an individual
    # sample fails instead of raising — confirm against the Ragas docs.
    raise_exceptions=False,
)

Filename and doc_id are the same for all nodes.                   
Generating: 100%|██████████| 10/10 [01:19<00:00,  7.92s/it]


In [29]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Metrics handed to the Ragas evaluation, in the order used for scoring.
metrics = [
    faithfulness, answer_relevancy,      # scored on the generated answer
    context_recall, context_precision,   # scored on the retrieved contexts
    answer_correctness,                  # answer compared with the ground truth
]

In [30]:
# Convert the generated test set to a DataFrame so individual columns
# (question, ground_truth, ...) can be pulled out below.
test_df = testset.to_pandas()
# Bare expression so the notebook renders the table.
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,How could the failure of third-party publisher...,"[developers, customers, and other third partie...",The failure of third-party publishers to make ...,simple,True
1,What are some factors that can impact the calc...,[adjustments to income taxes upon finalization...,adjustments to income taxes upon finalization ...,simple,True
2,Did the derivative financial instruments desig...,[The table below presents the notional value o...,"No, the derivative financial instruments desig...",reasoning,True
3,What could be the impact on business and finan...,[enabling or facilitating AI and may in the fu...,Concerns about AI misuse and restrictions on p...,reasoning,True
4,What are the potential financial impacts of an...,"[damages or fines, or an injunction stopping u...",An unfavorable outcome or settlement in litiga...,multi_context,True
5,What are the potential negative outcomes for o...,[adjustments to income taxes upon finalization...,Our business is exposed to the risks associate...,multi_context,True
6,What was the % change in R&D expenses between ...,[Provisions for inventory and excess inventory...,18%,multi_context,True
7,"""How are non-monetary assets and liabilities r...",[Foreign Currency Remeasurement\nWe use the U....,Non-monetary assets and liabilities such as pr...,multi_context,True
8,"""What expertise and knowledge do our sales tea...",[Members of our sales team have technical expe...,Members of our sales team have technical exper...,multi_context,True
9,What frameworks does the company's information...,[Item 1B. Unresolved Staff Comments\nNot appli...,The company's information security management ...,reasoning,True


In [31]:
# Plain Python lists of the generated questions and their reference answers.
test_questions = test_df["question"].tolist()
test_groundtruths = test_df["ground_truth"].tolist()

In [36]:
# Run every generated question through the retrieval chain and keep, per
# question, the generated answer and the text of each retrieved document.
answers, contexts = [], []

for question in test_questions:
    response = retrieval_chain.invoke({"input": question})
    answers.append(response["answer"])
    contexts.append([doc.page_content for doc in response["context"]])

In [37]:
from datasets import Dataset

# Bundle questions, generated answers, retrieved contexts and reference
# answers into one dataset; the four lists are aligned by position.
response_dataset = Dataset.from_dict(
    {
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": test_groundtruths,
    }
)

In [38]:
# Score the collected responses on every metric in `metrics`.
results = evaluate(
    dataset=response_dataset,
    metrics=metrics,
)

Evaluating: 100%|██████████| 50/50 [01:01<00:00,  1.24s/it]


In [39]:
# Display the aggregate score for each metric.
results

{'faithfulness': 1.0000, 'answer_relevancy': 0.8662, 'context_recall': 1.0000, 'context_precision': 0.9197, 'answer_correctness': 0.7175}