# LangChain QA

## Installation

In [1]:
!pip install langchain openai chromadb==0.5.3 tiktoken pypdf python-dotenv

Collecting langchain-core<0.2,>=0.1.7 (from langchain)
  Using cached langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.1.0,>=0.0.77 (from langchain)
  Using cached langsmith-0.0.92-py3-none-any.whl.metadata (9.9 kB)
INFO: pip is looking at multiple versions of langchain-core to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-core<0.2,>=0.1.7 (from langchain)
  Using cached langchain_core-0.1.51-py3-none-any.whl.metadata (5.9 kB)
  Using cached langchain_core-0.1.50-py3-none-any.whl.metadata (5.9 kB)
  Using cached langchain_core-0.1.49-py3-none-any.whl.metadata (5.9 kB)
  Using cached langchain_core-0.1.48-py3-none-any.whl.metadata (5.9 kB)
  Using cached langchain_core-0.1.47-py3-none-any.whl.metadata (5.9 kB)
  Using cached langchain_core-0.1.46-py3-none-any.whl.metadata (5.9 kB)
  Using cached langchain_core-0.1.45-py3-none-any.whl.metadata (5.9 kB)
INFO: pip is still looking at multiple vers

In [2]:
!pip install -U langchain-chroma

Collecting langchain-core<0.3,>=0.1.40 (from langchain-chroma)
  Using cached langchain_core-0.2.24-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.75 (from langchain-core<0.3,>=0.1.40->langchain-chroma)
  Using cached langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Using cached langchain_core-0.2.24-py3-none-any.whl (377 kB)
Using cached langsmith-0.1.93-py3-none-any.whl (139 kB)
[0mInstalling collected packages: langsmith, langchain-core
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.0.87
    Uninstalling langsmith-0.0.87:
      Successfully uninstalled langsmith-0.0.87
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.1.23
    Uninstalling langchain-core-0.1.23:
      Successfully uninstalled langchain-core-0.1.23
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts

In [3]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment. The OpenAI clients
# created in later cells presumably pick up OPENAI_API_KEY from there (TODO confirm).
# The value displayed under this cell is load_dotenv's return value.
load_dotenv() # OPENAI_API_KEY

True

## Imports

In [5]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

## Define LLM

In [8]:
# NOTE(review): gpt-4o-mini is documented as a chat model (see link), while `OpenAI`
# is imported from langchain.llms — confirm this pairing is intended.
llm = OpenAI(model_name="gpt-4o-mini") # https://platform.openai.com/docs/models/gpt-4o-mini
# Use `invoke` rather than calling the model object directly; the direct-call form
# is deprecated in LangChain.
print(llm.invoke("tell me a joke"))



Why did the scarecrow win an award?

Because he was outstanding in his field!


## Loading Documents

In [17]:
# Read the example PDF. The recorded output of the following cell shows a list of
# Document objects carrying `source` and `page` metadata.
loader = PyPDFLoader("materials/example.pdf")
documents = loader.load()

# To load several documents, build a list of loaders and concatenate their results:
# loaders = [....]
# documents = []
# for loader in loaders:
#     documents.extend(loader.load())

In [18]:
# Preview only the first two loaded pages instead of dumping the whole list.
documents[:2]

[Document(metadata={'source': 'materials/example.pdf', 'page': 0}, page_content='Chapter 1 Preview\n 1\nArtificial Intelligence\nIndex Report 2023\nArtificial Intelligence\nIndex Report 2023\nCHAPTER 1:  \nResearch and \nDevelopment'),
 Document(metadata={'source': 'materials/example.pdf', 'page': 1}, page_content='Chapter 1 Preview\n 2\nArtificial Intelligence\nIndex Report 2023Overview   3\nChapter Highlights   4\n1.1 Publications    5\nOverview   5\n Total Number of AI Publications   5\n By Type of Publication   6\n By Field of Study   7\n By Sector   8\n Cross-Country Collaboration  10\n Cross-Sector Collaboration  12\nAI Journal Publications  13\n Overview  13\n By Region  14\n By Geographic Area  15\n Citations  16\nAI Conference Publications  17\n Overview  17\n By Region  18\n By Geographic Area  19\n Citations 20\nAI Repositories 21\n Overview 21\n By Region 22\n By Geographic Area 23\n Citations 24\n Narrative Highlight:   \nTop Publishing Institutions 25\n All Fields 25 Comp

## Define a QA Chain

In [21]:
from langchain.chains.question_answering import load_qa_chain

# Build a map-reduce question-answering chain over the loaded pages.
chain = load_qa_chain(llm=OpenAI(), chain_type="map_reduce")
query = "The multimodal model released by which company?"
# `Chain.run` is deprecated; `invoke` takes the inputs as a dict and returns a dict
# whose "output_text" entry holds the answer string that `run` used to return.
chain.invoke({"input_documents": documents, "question": query})["output_text"]

' It is not mentioned in the document which specific company released the multimodal model.'

## RetrievalQA

In [22]:
# Load document
loader = PyPDFLoader("materials/example.pdf")
documents = loader.load()

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

# create the vectorstore
# A dedicated collection name keeps this store separate from other in-memory Chroma
# stores created with the default name in the same kernel, and deterministic ids make
# re-running the cell overwrite the existing chunks instead of appending duplicates.
db = Chroma.from_documents(
    texts,
    embeddings,
    ids=[f"chunk-{i}" for i in range(len(texts))],
    collection_name="retrieval_qa",
)

# expose this index in a retriever interface (top-2 most similar chunks per query)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})

# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)
query = "What is the total number of publications?"
# `invoke` replaces the deprecated direct call `qa({...})`; the returned dict is the same.
result = qa.invoke({"query": query})

In [23]:
# Display the full response dict; the recorded output shows the keys
# 'query', 'result' and 'source_documents'.
result

{'query': 'What is the total number of publications?',
 'result': ' The total number of publications is approximately 500,000 in 2021.',
 'source_documents': [Document(metadata={'page': 4, 'source': 'materials/example.pdf'}, page_content='Chapter 1 Preview 5\n496.01\n2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 20210100200300400500Number of AI Publications (in Thousands)Number of AI Publications in the World, 2010–21 \nSource: Center for Security and Emerging Technology, 2022 | Chart: 2023 AI Index Report Artificial Intelligence\nIndex Report 2023\nOverview\nThe figures below capture the total number \nof English-language and Chinese-language AI \npublications globally from 2010 to 2021—by type, \naffiliation, cross-country collaboration, and cross-\nindustry collaboration. The section also breaks down 1.1 Publications\npublication and citation data by region for AI journal \narticles, conference papers, repositories, and patents.\nTotal Number of AI Publications\nFigure 1.1.

In [24]:
# Inspect which chunks the retriever returns for the query.
# `invoke` replaces the deprecated `get_relevant_documents`.
retriever.invoke(query)

[Document(metadata={'page': 4, 'source': 'materials/example.pdf'}, page_content='Chapter 1 Preview 5\n496.01\n2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 20210100200300400500Number of AI Publications (in Thousands)Number of AI Publications in the World, 2010–21 \nSource: Center for Security and Emerging Technology, 2022 | Chart: 2023 AI Index Report Artificial Intelligence\nIndex Report 2023\nOverview\nThe figures below capture the total number \nof English-language and Chinese-language AI \npublications globally from 2010 to 2021—by type, \naffiliation, cross-country collaboration, and cross-\nindustry collaboration. The section also breaks down 1.1 Publications\npublication and citation data by region for AI journal \narticles, conference papers, repositories, and patents.\nTotal Number of AI Publications\nFigure 1.1.1 shows the number of AI publications in \nthe world. From 2010 to 2021, the total number of \nAI publications more than doubled, growing from \n200,000 in 20

## VectorstoreIndexCreator

In [25]:
# Build the splitter -> embeddings -> vector store pipeline in one step.
# `vectorstore_kwargs` is forwarded to the vector store constructor; a dedicated
# collection name keeps this index separate from other in-memory Chroma stores
# created with the default collection name in the same kernel.
index = VectorstoreIndexCreator(
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
    embedding=OpenAIEmbeddings(),
    vectorstore_cls=Chroma,
    vectorstore_kwargs={"collection_name": "index_creator"},
).from_loaders([loader])

query = "What is the total number of AI publications?"
index.query(llm=OpenAI(), question=query, chain_type="stuff")

' The total number of AI publications is almost 500,000 in 2021.'

## ConversationalRetrievalChain

In [26]:
from langchain.chains import ConversationalRetrievalChain

In [27]:
# Load document
loader = PyPDFLoader("materials/example.pdf")
documents = loader.load()

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

# create the vectorstore
# A dedicated collection name keeps this store separate from other in-memory Chroma
# stores created with the default name in the same kernel, and deterministic ids make
# re-running the cell overwrite the existing chunks instead of appending duplicates.
db = Chroma.from_documents(
    texts,
    embeddings,
    ids=[f"chunk-{i}" for i in range(len(texts))],
    collection_name="conversational_qa",
)

# expose this index in a retriever interface (top-2 most similar chunks per query)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})

# create a chain to answer questions
qa = ConversationalRetrievalChain.from_llm(OpenAI(), retriever)

# First turn: no prior conversation yet.
chat_history = []
query = "What is the total number of AI publications?"
# `invoke` replaces the deprecated direct call `qa({...})`; the returned dict is the same.
result = qa.invoke({"question": query, "chat_history": chat_history})

In [28]:
# Show only the answer text from the chain's response dict.
result["answer"]

' Almost 500,000 in 2021.'

In [29]:
# Record the previous turn as a (question, answer) pair and pass it along with the
# follow-up question.
chat_history = [(query, result["answer"])]
query = "What is this number divided by 2?"
# `invoke` replaces the deprecated direct call `qa({...})`; the returned dict is the same.
result = qa.invoke({"question": query, "chat_history": chat_history})

In [30]:
# Display the history that was passed to the chain: a list of (question, answer) tuples.
chat_history

[('What is the total number of AI publications?', ' Almost 500,000 in 2021.')]

In [31]:
# Show the answer text for the follow-up question.
result["answer"]

' No, I cannot divide the total number of AI publications by 2 as it is not provided in the given context.'