In [None]:

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
import os
os.environ["OPENAI_API_KEY"] = "Your API key"
os.environ["PINECONE_API_KEY"] = "Your API key"
os.environ["PINECONE_REGION"] = "Your Pinecone VectorDB region"

In [4]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [5]:
doc = read_doc('documents/')
len(doc)

58

In [6]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(docs)
    return docs


In [7]:
documents = chunk_data(doc)
documents 

[Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'documents\\budget_speech.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'),
 Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': 'documents\\budget_speech.pdf', 'total_pages': 58, 'page': 2, 'page_label': '3'}, page_content='CONTENTS \nPART-A \n Page No. \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \

In [8]:
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
embeddings 

  embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001810F1DD510>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001810F2D05B0>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [9]:
vectors = embeddings.embed_query("Hi, I'm Zorain")
vectors 

[-0.009474454224027673,
 -0.008864257275475854,
 -0.017229860241884002,
 -0.000681960589921785,
 -0.010379907430119146,
 0.03427600407174481,
 -0.025890718534509253,
 -0.025746370989431087,
 -0.0016944445269758057,
 -0.019618156834379608,
 0.018581479349299167,
 -0.00031124959780160286,
 0.008404969970708075,
 -0.01862084635359909,
 -0.0052424439854486925,
 -0.01280101172217646,
 0.029604389564886736,
 -0.0180040873063414,
 0.016468753649548144,
 0.003362643918757051,
 2.6962662832695896e-05,
 -0.021376573907495984,
 0.01915886952961183,
 -0.005084973639942609,
 0.001356539750122015,
 0.00877240000078681,
 0.011751210039684271,
 0.003533236599029777,
 0.015261483018533702,
 -0.04668990039904747,
 -0.00024297153294828175,
 0.01711175736633912,
 0.024158546130926162,
 -0.023738625830458306,
 -0.005619716232263685,
 -0.015235237417677865,
 0.009402280451488592,
 -0.014907174391899055,
 0.007230504710948958,
 -0.0013713026095651252,
 0.03561450084439333,
 -0.00811627394922919,
 -0.01467096

In [10]:
len(vectors)

1536

## Vector Search Database in Pinecone

In [19]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

In [27]:
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    region=os.getenv("PINECONE_REGION")
)

In [29]:
index = pc.Index("langchainvector")

In [30]:
vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="text"
)

In [31]:
vectorstore.add_documents(doc)

['59c44234-3bac-4956-8e35-d786b5eff750',
 'fe344052-9487-4a67-81ea-2a41fc3efa6d',
 '3aa7bfc8-6413-4ce2-90a3-9641f8bc4775',
 '7d0d5c75-019a-42b7-878c-35416c7ab388',
 '277d136b-795e-4047-ae5f-34fb58a9e701',
 'f425eb90-5c94-4ebb-8e05-54b763aec566',
 '9d4de6d6-12f0-45eb-9879-39483cfc820f',
 '46fd3b50-8901-46e3-a776-48d6c0b45358',
 '50424eca-88cc-41d8-90df-eac19299556b',
 '3d2d04f4-4466-4fbe-9f9a-76615cd1aeac',
 '5b1a265a-3b49-4115-bf63-3b4ca98036c7',
 'f2e7543f-1d22-4ce3-b732-afcee1318473',
 '8915c165-d7e2-4d0c-9b86-2b69e9f9134c',
 '7c2f5879-fb8c-40e4-a191-724735dd8432',
 '979979dc-721d-4e7b-9bff-6873956db0c3',
 'e812487a-59d6-4902-b96b-3ca9c46a8c5c',
 'cc86bd44-d40a-4a27-a0d7-259fd4cabd0d',
 '527f5964-9c45-405d-adba-964af6adf600',
 '9a48392a-43b3-4666-b281-9d601aad7903',
 '3e410276-b3f5-4f53-a673-72694975030c',
 '66f4b761-b3d2-435e-b1c7-d3aaac6b0816',
 'cde628b6-19b1-45e4-82c4-62be43d41549',
 'e6bcdb2e-5291-48fc-9c02-ad28104faa7a',
 '81cc57c0-a4ce-4b79-a497-9cc66fd665a5',
 '9c275b18-d7ce-

## Applying Cosine Similarity to retrieve results

In [32]:
def retrieve_query(query, k=2):
    matching_results = vectorstore.similarity_search(query, k=k)
    return matching_results

## Loading our QnA model

In [39]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import ChatOpenAI

In [40]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.3
)
chain = load_qa_chain(llm, chain_type="stuff")


## Search Answers from VectorDB

In [43]:
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    print("Documents retrieved: ", doc_search)
    response = chain.run(input_documents=doc_search, question=query)
    return response

In [44]:
my_query = "How much the agriculture target will be increased by how many crores?"
answer = retrieve_answers(my_query)
print(answer)

Documents retrieved:  [Document(id='5b1a265a-3b49-4115-bf63-3b4ca98036c7', metadata={'creationdate': '2023-02-01T05:28:04+05:30', 'creator': 'Adobe Acrobat Pro 10.1.16', 'moddate': '2023-02-01T08:28:21+05:30', 'page': 10.0, 'page_label': '11', 'producer': 'Adobe Acrobat Pro 10.1.16', 'source': 'documents\\budget_speech.pdf', 'title': '', 'total_pages': 58.0}, page_content="7 \n \n \n \nfarmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.  \n22. Now to make India a global hub for ' Shree Anna', the Indian Institute \nof Millet Research, Hyderabad will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to