In [16]:
## import libraries
import os
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Credentials are taken from the process environment. `setdefault` keeps a key
# that was already exported in the shell instead of overwriting it with the
# empty placeholder; the placeholder only guarantees the variable exists so the
# later `os.environ[...]` lookups do not raise KeyError.
# Never commit a real key in this cell.
os.environ.setdefault('OPENAI_API_KEY', '')
os.environ.setdefault('PINECONE_API_KEY', '')

In [2]:
def read_doc(directory):
    """Load the documents found under ``directory``.

    Args:
        directory: Path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns
        (the loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [3]:
# Load everything under the relative `documents/` folder via read_doc().
doc = read_doc('documents/')
# Bare expression: the notebook displays how many documents were loaded.
len(doc)

32

In [4]:
## Divide the document into chunks
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    """Split ``docs`` into smaller chunks.

    Args:
        docs: Documents to split.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [7]:
# Split the loaded documents using chunk_data()'s default chunk size and overlap.
docs = chunk_data(docs=doc)

In [None]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
docs[:3]

In [5]:
## Embedding Technique of OpenAI

# Build the OpenAI embedding client; the API key is read from the
# OPENAI_API_KEY environment variable.
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
# Bare expression: the notebook displays the client's configuration.
embeddings


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001FEE3FE8D30>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001FEE3AFAA90>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [9]:
# Embed a sample sentence as a quick check that the embedding client works.
# NOTE(review): len(vectors) should equal the `dimension` used when the
# Pinecone index is created — confirm.
vectors = embeddings.embed_query("How are you?")

In [10]:
import time  # needed for the readiness polling loop below

from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key from the environment.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

index_name = "langchainvectors"  # change if desired

# Names of the indexes that already exist for this API key.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only when it is missing, so re-running the cell is safe.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`
        # (text-embedding-ada-002 yields 1536-dimensional vectors).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the index reports ready.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Handle to the (existing or newly created) index.
index = pc.Index(index_name)

In [12]:
# Bare expression: display the index name that the following cells will use.
index_name

'langchainvectors'

In [13]:
## Vector search DB in Pinecone
## Create vector DB in Pinecone

# index_name = "langchain-index"
# Build a LangChain vector store over the Pinecone index `index_name` from the
# chunked documents, using `embeddings` to embed them.
# NOTE(review): re-running this cell presumably uploads the same chunks again —
# confirm whether duplicates are acceptable.
vectorstore_from_docs = PineconeVectorStore.from_documents(
        docs,
        index_name=index_name,
        embedding=embeddings
    )

In [14]:
# Cosine similarity retrieval from vector DB

def retrieve_query(query,k=2):
    """Return the ``k`` stored chunks most similar to ``query``.

    Args:
        query: Text to search for in the vector store.
        k: Number of matches to return (default 2). Previously this
            argument was accepted but ignored because the call hard-coded 2.

    Returns:
        The result of ``vectorstore_from_docs.similarity_search``.
    """
    matching_results = vectorstore_from_docs.similarity_search(query,k=k)
    return matching_results

In [24]:
from langchain.chains.question_answering import load_qa_chain

# OpenAI LLM used to generate the answers; temperature=0.5 allows some
# variation in the generated text between runs.
llm = OpenAI(model='gpt-3.5-turbo-instruct',temperature=0.5)

In [25]:
# Question-answering chain over `llm`. The 'stuff' chain type puts all supplied
# documents into a single prompt.
chain = load_qa_chain(llm,chain_type='stuff')

In [26]:
## Search answers from vector DB

def retrieve_answers(query):
    """Answer ``query`` from the documents retrieved for it.

    Fetches the matching chunks with ``retrieve_query``, prints them for
    inspection, and passes them with the question to ``chain``.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chain.run``.
    """
    matches = retrieve_query(query)
    print(matches)
    return chain.run(input_documents=matches,question=query)

In [27]:
# Ask a sample question. retrieve_answers() also prints the retrieved chunks,
# so they appear in the output before the answer itself.
answer = retrieve_answers('How much the agriculture target will be increased by how many crores?')
print(answer)

[Document(page_content='11.8 crore farmers, including marginal and small farmers.  Crop \ninsurance is given to  4 crore farmers under PM Fasal Bima \nYojana. These, besides several other programmes, are assisting \n‘Annadata’  in producing food for the country and the world.  \n15. Electroni c National Agriculture Market has integrated  \n1361 mandis, and is providing services to 1.8 crore farmers with \ntrading volume of ` 3 lakh crore.  \n16. The sector is poised for inclusive, balanced, higher growth \nand productivity. These are facilitated from farme r-centric \npolicies, income support, coverage of risks through price and', metadata={'page': 8.0, 'source': 'documents\\budget_speech.pdf'}), Document(page_content='16 \n doubled.  Implementation of Pradhan Mantri Matsya Sampada \nYojana (PMMSY) will be stepped up to:  \n(1) enhance aquaculture productivity from existing 3 to  \n5 tons per hectare,  \n(2) double exports to ` 1 lakh crore and  \n(3) generate  55 lakh employment oppor