In [13]:
# Import necessary libraries
import openai
import tiktoken
import dotenv
import os
from PyPDF2 import PdfReader
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain


In [None]:

# Configure the Azure OpenAI API from environment variables.
# Load variables from a .env file (if one is found) into the process
# environment first; variables that are already set are not overridden.
dotenv.load_dotenv()

AZURE_OPENAI_API_TYPE = os.environ.get("AZURE_OPENAI_API_TYPE")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_CHAT_MODEL = os.environ.get("AZURE_OPENAI_CHAT_MODEL")
AZURE_OPENAI_EMBEDDING_MODEL = os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL")

# Name any unset (or empty) setting up front, so a misconfiguration is
# visible here instead of surfacing as an opaque API error in a later cell.
_missing_settings = [
    name
    for name, value in {
        "AZURE_OPENAI_API_TYPE": AZURE_OPENAI_API_TYPE,
        "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
        "AZURE_OPENAI_API_KEY": AZURE_OPENAI_API_KEY,
        "AZURE_OPENAI_API_VERSION": AZURE_OPENAI_API_VERSION,
        "AZURE_OPENAI_CHAT_MODEL": AZURE_OPENAI_CHAT_MODEL,
        "AZURE_OPENAI_EMBEDDING_MODEL": AZURE_OPENAI_EMBEDDING_MODEL,
    }.items()
    if not value
]
if _missing_settings:
    print("Warning: missing environment variables: " + ", ".join(_missing_settings))

# Mirror the settings onto the openai module's global configuration.
openai.api_type = AZURE_OPENAI_API_TYPE
openai.api_version = AZURE_OPENAI_API_VERSION
openai.api_base = AZURE_OPENAI_ENDPOINT
openai.api_key = AZURE_OPENAI_API_KEY

# Show which endpoint is in use (the API key is deliberately not printed).
print(AZURE_OPENAI_ENDPOINT)




In [15]:
# Open the sample PDF that the rest of the notebook works on.
# The path is relative, so it is resolved against the current working directory.
pdf_path = '../../Sample-Data/LLM-AI-Agents.pdf'
pdfReader = PdfReader(pdf_path)



In [16]:
# Extract the text from the PDF file.
# Pages for which extract_text() returns nothing are skipped; the remaining
# page texts are concatenated in page order, with no separator between pages.
page_texts = (page.extract_text() for page in pdfReader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [None]:
# Preview the beginning of the extracted text (first 1000 characters).
preview_length = 1000
raw_text[:preview_length]

In [18]:
# Break the extracted text into overlapping chunks, splitting on newlines.
# Sizes are measured with len(), i.e. in characters.
chunk_size = 1000
chunk_overlap = 200
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)
pdfTexts = text_splitter.split_text(raw_text)


In [19]:
# Report the number of text chunks produced by the splitter.
chunk_count = len(pdfTexts)
chunk_count

98

In [20]:
# Configure the Azure OpenAI embedding model client. No embeddings are
# generated in this cell; the client is used when the vector index is built.
embeddings = OpenAIEmbeddings(
    openai_api_base=AZURE_OPENAI_ENDPOINT,
    openai_api_type=AZURE_OPENAI_API_TYPE,
    # Pass the configured API version explicitly, matching the chat model,
    # instead of leaving the client to fall back to its own default.
    openai_api_version=AZURE_OPENAI_API_VERSION,
    deployment=AZURE_OPENAI_EMBEDDING_MODEL,
    openai_api_key=AZURE_OPENAI_API_KEY,
    # NOTE(review): chunk_size=1 presumably keeps each embedding request to a
    # single text to satisfy an Azure deployment limit - confirm whether a
    # larger batch size is allowed for this deployment.
    chunk_size=1,
)


In [21]:
# Build a FAISS vector index over the text chunks using the embedding client,
# so the chunks can be queried by similarity search.
# https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/faiss.html
pdfDocSearch = FAISS.from_texts(texts=pdfTexts, embedding=embeddings)

In [22]:
# Build a "stuff" question-answering chain on top of the Azure OpenAI chat model.
# https://docs.langchain.com/docs/components/chains/index_related_chains
# LangChain Documents in Memory - https://python.langchain.com/docs/use_cases/question_answering/how_to/question_answering#the-stuff-chain
chat_model = AzureChatOpenAI(
    openai_api_base=AZURE_OPENAI_ENDPOINT,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    openai_api_key=AZURE_OPENAI_API_KEY,
    deployment_name=AZURE_OPENAI_CHAT_MODEL,
    model_name=AZURE_OPENAI_CHAT_MODEL,
)
chain = load_qa_chain(chat_model, chain_type="stuff")

print(chain)

memory=None callbacks=None callback_manager=None verbose=False tags=None metadata=None input_key='input_documents' output_key='output_text' llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=ChatPromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], output_parser=None, partial_variables={}, template="Use the following pieces of context to answer the users question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}", template_format='f-string', validate_template=True), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], output_parser=None, partial_variables={}, template='{question}', template_format='f-string', validate_template=True), additional_kwargs={})]), llm=Azure

In [23]:
# First sample question: retrieve the chunks most similar to the question,
# then let the chain answer using those chunks as context.
inquiry = "Please tell me the key summary of this book."
docs = pdfDocSearch.similarity_search(query=inquiry)
chain.run(question=inquiry, input_documents=docs)

'The key summary of this book is that AI agents should be able to summarize the interaction history and provide a concise and easy-to-understand answer for users. Techniques such as chain-of-thought (CoT) and vector databases can be used to achieve this. The book also discusses the design of AI agents, with a focus on task planning and tool usage as the core competencies. However, there are weaknesses identified in LLM-based agents, particularly in understanding output formats. Overall, the book aims to enhance the abilities of AI agents in task planning and tool usage.'