## Setup & Importing packages 

In [31]:
# Importing required libraries

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.llms import OpenAI
from dotenv import load_dotenv
import langchain
import pinecone
import openai
import os

In [21]:
# Importing the required keys from our .env file
load_dotenv()

True

## Reading the PDF
Using the PyPDFDirectoryLoader

In [6]:
# Defining the function to read the PDF from our directory using the PyPDFDirectoryLoader package
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [7]:
doc=read_doc('documents/')
len(doc)

32

## Dividing the PDF into Text Chunks
Using the RecursiveCharacterTextSplitter

In [9]:
## Now, we will divide the pdfs that we read into text chunks
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

In [10]:
documents=chunk_data(docs=doc)
len(documents)

32

## Adding OpenAI Embeddings to text chunks

In [17]:
# Adding the Open AI API Key to use OpenAI Embeddings
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x12752eb10>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x12ee606d0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None)

In [18]:
vectors=embeddings.embed_query("How are you?")
len(vectors)

1536

## Initialize Pinecone

In [62]:
import os
from pinecone import Pinecone as pc

# Create a Pinecone instance
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY")
)

index = pc.Index("langchainvector")

In [66]:
index=PineconeVectorStore.from_documents(doc,embeddings,index_name=index_name)

## Apply Similarity Search for Document Retrieval (Cosine Similarity)

In [68]:
## Cosine Similarity Retreive Results from VectorDB
def retrieve_query(query,k=2):
    matching_results=index.similarity_search(query,k=k)
    return matching_results

In [69]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

## Select Open AI model GPT-3.5-Turbo

In [73]:
llm=OpenAI(model_name="gpt-3.5-turbo-instruct",temperature=0.5)
chain=load_qa_chain(llm,chain_type="stuff")

## Perform Search (User)

In [74]:
## Search answers from VectorDB
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response


In [77]:
our_query = "Give me the top 3 takaways from the budget"
answer = retrieve_answers(our_query)
print(answer)

[Document(page_content='28 \n rupees ( ₹ 10,000 ) for financial years 2010 -11 to 2014 -15. This is \nexpected to benefit about a crore tax-payers.  \nEconomy – Then and Now  \n94. In 2014 when our Government assumed the reins, the \nresponsibility to mend the economy step by step and to put the \ngovernance systems in order was enormous. The need of the \nhour was to give hope to the people, to attract investments, and \nto build support for the much -needed reforms. The Government \ndid that successfully following our strong belief of ‘nation -first’.   \n95. The crisis of those years has been overcome, and the \neconomy has been put firmly on a high sustainable growth path \nwith all -round development. It is now appropri ate to look at \nwhere we were then till 2014 and where we are now, only for \nthe purpose of drawing lessons from the mismanagement of \nthose years. The Government will lay a White Paper on table of \nthe House.  \n96. The exemplary track record of governance, de