## Install necessary libraries

In [2]:
!pip install sentence-transformers langchain faiss-cpu PyPDF2 requests
!pip install -U langchain-community
!pip install tiktoken
!pip install openai sentence-transformers langchain faiss-cpu PyPDF2 requests
!pip install langchain_google_genai

from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, LLMChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
import faiss
from PyPDF2 import PdfReader
import requests
import tiktoken







  from tqdm.autonotebook import tqdm, trange


## Load the PDF file

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [51]:
# Open the source PDF with PyPDF2's PdfReader and record how many pages it has.
# The path is relative, so the file is looked up from the working directory.
file_path = 'constitution.pdf'
pdf = PdfReader(file_path)
total_pages = len(pdf.pages)

## Extract text from the entire PDF

In [52]:
# Concatenate the text of every page, in page order, into one string.
# str.join assembles the result in a single pass instead of re-copying the
# growing string on each `+=`; `or ""` guards against a page for which
# extract_text() yields nothing, so one such page cannot break the join.
text = "".join(page.extract_text() or "" for page in pdf.pages)

In [53]:
# Show only the beginning of the extracted text as a sanity check; printing the
# whole document floods the notebook output and buries the cells that follow.
print(text[:1000])

 
NATIONAL  CONSTITUTION  CENTER   
   
 
 
 
 
  
 
THE  
CONSTITUTION  
of the United  States 
 
 
 
 
 
  
 
  
 
   
 
  
 
  
 
  
 
  
 
  
 
   
 C O N S T I T U T I O N O F T H E U N I T E D S T A T E S   
 
 
 
We the People of the United States, in Order to form a 
more perfect Union, establish Justice, insure domestic 
Tranquility, provide for the common defence, promote 
the general  Welfare, and secure the Blessings of Liberty to 
ourselves  and our Posterity,  do ordain  and establish  this 
Constitution for the United States of America  
 
 
Article.   I. 
SECTION.  1 
All legislative Powers herein granted shall be vested in a 
Congress of the United States, which shall consist of a Sen-  
ate and House of Representatives. 
SECTI ON. 2 
The House of Representatives shall be composed of Mem-  
bers chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifi-  
cations requisite for Electors of the most numerous Branc

## Count the tokens

In [54]:
# Count the tokens in the extracted text using tiktoken's "gpt2" encoding.
# NOTE(review): presumably this is only a rough size estimate -- the LLM
# configured later in this notebook is Gemini, not a GPT-2 model; confirm.
encoding = tiktoken.get_encoding("gpt2") 
tokens = encoding.encode(text)
num_tokens = len(tokens)

print(f"Total number of tokens in the PDF: {num_tokens}")

Total number of tokens in the PDF: 14543


## Split text into smaller chunks for embedding

In [55]:
def split_text(text: str, chunk_size: int = 500, overlap: int = 0) -> list:
    """Split ``text`` into chunks of whitespace-separated words.

    Args:
        text: The text to split. Runs of any whitespace act as separators and
            are collapsed to single spaces in the output.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < chunk_size``. The default of 0
            yields back-to-back, non-overlapping chunks.

    Returns:
        A list of chunk strings in document order; empty if ``text`` contains
        no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out of
            range.
    """
    # A non-positive chunk_size would otherwise either crash inside range()
    # (zero step) or silently return no chunks at all (negative step).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must be in [0, chunk_size), got {overlap} for chunk_size {chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already emitted (relevant when overlap > 0).
        if start + chunk_size >= len(words):
            break
    return chunks

# Break the extracted PDF text into chunks of at most 500 words each.
chunks = split_text(text, chunk_size=500)

## Generate embeddings for each chunk


In [56]:
# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed every chunk in one call. The result is later read via `.shape[1]`,
# i.e. it is treated as a 2-D array with one row per chunk.
embeddings = model.encode(chunks)



## Create documents for the chunks

In [57]:
# Wrap each chunk in a LangChain Document, keyed by its position in `chunks`
# rendered as a string ("0", "1", ...).
documents = dict(
    (str(position), Document(page_content=chunk_text))
    for position, chunk_text in enumerate(chunks)
)
print(f"You now have {len(documents)} docs intead of 1 piece of text")

You now have 19 docs intead of 1 piece of text


## Initialize the FAISS index and add embeddings

In [58]:
# Create a FAISS IndexFlatL2 whose dimensionality is the embedding width
# (second axis of `embeddings`), then add all chunk embeddings to it.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

## Create an in-memory docstore & Map index

In [59]:
# Hold the chunk documents in LangChain's in-memory docstore.
docstore = InMemoryDocstore(documents)

# Index position -> docstore key: position n maps to the string key str(n).
index_to_docstore_id = dict((row, str(row)) for row in range(len(documents)))

## Initialize FAISS vector store

In [60]:
# Assemble the LangChain FAISS vector store from the pre-built index, the
# docstore, and the index-position -> docstore-ID mapping.
vector_store = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    # NOTE(review): passing a bare callable makes LangChain warn that an
    # Embeddings object is expected and that function support will be removed.
    embedding_function=model.encode
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


## Initialize Google API key and LLM Model

In [61]:
import os
from getpass import getpass

from langchain_google_genai import ChatGoogleGenerativeAI

# Credentials must never be committed in a notebook. Read the key from the
# GOOGLE_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed or stored in the notebook.
# SECURITY: a literal key was previously hardcoded in this cell; treat that key
# as leaked and revoke/rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

## Create a prompt template for the QA task

In [62]:
# Prompt skeleton for the QA task: the question, then the supporting context,
# then an open "Answer:" slot for the model to complete. The leading and
# trailing newlines are part of the template text.
qa_template = (
    "\n"
    "Question: {question}\n"
    "Context: {context}\n"
    "Answer:\n"
)

prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=qa_template,
)

## Create a text splitter to handle splitting of documents

In [63]:
# Character-based splitter configured with chunk_size=500 and chunk_overlap=50.
# NOTE(review): no cell visible in this notebook passes `text_splitter` to
# anything -- confirm whether it is still needed.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

## Create the LLMChain for the question-answering task

In [64]:
# Chain that pairs the QA prompt (question + context) with the LLM.
qa_chain = LLMChain(llm=llm, prompt=prompt)

## Define a reduce_documents_chain using StuffDocumentsChain

In [65]:
# Reduce step: reuses qa_chain, with the documents supplied through the
# prompt's "context" variable.
reduce_documents_chain = StuffDocumentsChain(llm_chain=qa_chain, document_variable_name="context")

## Create the MapReduceDocumentsChain using the QA chain and the reduce chain

In [66]:
# Map-reduce chain built from qa_chain plus the reduce chain defined earlier.
# The same qa_chain is used here and inside reduce_documents_chain, and in both
# places document text is fed through the prompt's "context" variable.
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=qa_chain,
    reduce_documents_chain=reduce_documents_chain,
    document_variable_name="context"
)

## Initialize RetrievalQA with the vector store and the map_reduce chain

In [67]:
# End-to-end QA chain: documents come from the vector store's retriever and are
# combined into an answer by map_reduce_chain.
# NOTE: despite its name, `retriever` holds the RetrievalQA chain itself, not a
# retriever object.
retriever = RetrievalQA(
    retriever=vector_store.as_retriever(),
    combine_documents_chain=map_reduce_chain
)


## Function to ask questions about the PDF

In [68]:
def ask_question(question):
    """Run ``question`` through the notebook's ``retriever`` QA chain.

    Args:
        question: The question text to pass to the chain.

    Returns:
        Whatever ``retriever.run`` returns for the question.
    """
    return retriever.run(question)

## Example

In [70]:
# Demo: ask one question about the PDF and show it alongside the answer.
question = "describe section 2 as im 10 years old"
answer = ask_question(question)
print(f"Question: {question}\nAnswer: {answer}")

Question: describe section 2 as im 10 years old
Answer: Section 2 of the Constitution is like a recipe for making a group of people called the House of Representatives. These people, called Representatives, are chosen by the people in each state and get to vote every two years. To be a Representative, you have to be at least 25 years old, have lived in the United States for at least seven years, and live in the state where you're chosen.

The Representatives have a special job to do: they make the laws for our country. They can also decide if the President or other important people in the government have done something wrong.
