In [1]:
!pip install PyPDF2
!pip install langchain
!pip install -qU langchain-google-genai
!pip install faiss-cpu
!pip install -U langchain-community

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting langchain
  Downloading langchain-0.2.9-py3-none-any.whl (987 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.7/987.7 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.20 (from langchain)
  Downloading langchain_core-0.2.21-py3-none-any.whl (372 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m372.0/372.0 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.90-py3-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

## Prerequisites

*   Specify Google API key
*   Download PDF




In [2]:
import os
# Specify Google API key
os.environ["GOOGLE_API_KEY"] = ""

!curl -o AssignmentSupportDocument.pdf -L 'https://docs.google.com/uc?export=download&id=1euhsXby_G-vDekfvg6Rim5VBU19lQfIX'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1450k  100 1450k    0     0   373k      0  0:00:03  0:00:03 --:--:--  561k


## Extract text from PDF

In [3]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

text = extract_text_from_pdf('AssignmentSupportDocument.pdf')

## Split text into chunks using RecursiveCharacterTextSplitter

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(text, size):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=20,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    return chunks

chunks = split_text(text, 500)

## Embeddings

*   Generate embeddings using GoogleGenerativeAIEmbeddings
*   Store embedding into FAISS







In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import FAISS

def build_docs(chunks):
    docs = []
    for i, chunk in enumerate(chunks):
        doc = Document(page_content=chunk, metadata={"source": i})
        docs.append(doc)
    return docs


def generate_and_store_embeddings(chunks):
    embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
    docs = build_docs(chunks)
    vector_store = FAISS.from_documents(docs, embedding=embeddings)
    return vector_store

vector_store = generate_and_store_embeddings(chunks)
vector_store.save_local("faiss_index")

## Retrieving results

*   Query LLM using vector_store(FAISS) as retriever through RetrievalQAWithSourcesChain
*   Using ConversationBufferMemory for maintaing context
*   Using GoogleGenerativeAI as LLM




In [6]:
from langchain_google_genai import GoogleGenerativeAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQAWithSourcesChain

def get_answer():
  llm = GoogleGenerativeAI(
      model="gemini-1.5-flash",
      temperature=0.7,
      max_output_tokens=1024
    )
  retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":7})
  memory = ConversationBufferMemory(
      llm=llm,
      input_key='question',
      output_key='answer',
      max_token_limit=5
  )
  chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory
  )
  result = chain({"question": query}, return_only_outputs=True)
  final_answer = result['answer']
  final_answer = final_answer.replace("FINAL ANSWER: ", "")
  final_answer = final_answer.strip()
  return final_answer


while True:
  query = input("Enter your search query (or type 'quit' to stop): ")
  if query.lower() == 'quit':
      break

  answer = get_answer()
  print('\033[91m')
  print(answer)
  print('\033[0m')

Enter your search query (or type 'quit' to stop): What is the name of speaker?


  warn_deprecated(


[91m
The speaker's name is Nirmala Sitharaman.
[0m
Enter your search query (or type 'quit' to stop): When was this made?
[91m
This document was made on February 1, 2024.
[0m
Enter your search query (or type 'quit' to stop): Which country is talked about?
[91m
The country talked about is India.
[0m
Enter your search query (or type 'quit' to stop): What is this document about?
[91m
This document is the speech delivered by the Indian Finance Minister Nirmala Sitharaman on February 1, 2024, outlining the Interim Budget for 2024-2025.
[0m
Enter your search query (or type 'quit' to stop): Summarize the document
[91m
This document outlines the Indian government's plan for future economic development. It covers various aspects, including the current state of the economy, global context, financial sector strengthening, inflation management, urbanization, green energy, and societal changes. It also emphasizes the importance of "Viksit Bharat" (developed India) and the government's commi

## Calculating F1 score of model

In [8]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

mock_questions = [
    "When was speech given",
    "Who was the speaker?",
    "Summarize the document",
    "What are the plans in infrastructure of india?",
    "Under PM Awas Yojana how many houses are they making?"
]

mock_answers = [
    "February 1, 2024",
    "The speaker was Nirmala Sitharaman, Minister of Finance.",
    "The document discusses the government's success in various areas, including governance, development, and economic performance.  It highlights the government's commitment to  transparency, accountability, and citizen-centric approach. The document also mentions specific policies aimed at promoting the growth of Micro, Small and Medium Enterprises (MSMEs) and achieving net-zero emissions by 2070.",
    "The provided text mentions plans for expanding existing airports and developing new ones, implementing three major economic railway corridor programmes, and launching a scheme to help the middle class buy or build their own houses.",
    "The text states that they are close to achieving the target of three crore houses under PM Awas Yojana (Grameen) and that two crore more houses will be taken up in the next five years."
]

answers = []
for query, org_answer in zip(mock_questions, mock_answers):
  answer = get_answer()
  answers.append(answer)

mlb = MultiLabelBinarizer()
tokenized_generated = mlb.fit_transform(answers)
tokenized_ideal = mlb.transform(mock_answers)


f1 = f1_score(tokenized_ideal, tokenized_generated, average='micro')
print("F1 Score:", f1)

F1 Score: 0.9


