In [60]:
!pip install langchain pinecone-client pypdf2
!pip install -U langchain-community
!pip install pypdf  tiktoken
!pip install openai==0.28
!pip freeze -l > requirements.txt 



In [62]:
# import Libraries

import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
import os
from pinecone import Pinecone, ServerlessSpec

OPENAI_API_KEY='your_key'
PINECONE_API_KEY="your_key"
openai.api_key = OPENAI_API_KEY


In [65]:
## Lets Read the document
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

doc=read_doc('/content/sample_data/documents/')
len(doc)

58

In [66]:
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

documents=chunk_data(docs=doc)
print("len(documents)=",len(documents))
doc0 = documents[0]
print("type(doc0)=",type(doc0))
print("doc0=",doc0)
print("text=",doc0.page_content)

len(documents)= 58
type(doc0)= <class 'langchain_core.documents.base.Document'>
doc0= page_content='GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023' metadata={'source': '/content/sample_data/documents/budget_speech.pdf', 'page': 0}
text= GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023


In [None]:
## Embedding Technique Of OPENAI
embeddings=OpenAIEmbeddings(api_key=OPENAI_API_KEY)

print("embeddings=",embeddings)

In [70]:
vectors=embeddings.embed_query("How are you?")
len(vectors)

1536

In [71]:
## Vector Search DB In Pinecone
pc = Pinecone(
        api_key=PINECONE_API_KEY
    )

index_name = 'langchainvector' # needs to be created first
# Now do stuff
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )


index = pc.Index(index_name)
print("==============",index.describe_index_stats())

 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 59}},
 'total_vector_count': 59}


In [72]:

def get_embedding(text):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response['data'][0]['embedding']

#populate the vector DB
for i,doc in enumerate(documents): # This takes 1-2 minutes
    embedding = get_embedding(doc.page_content)
    # index.upsert([(str(i), embedding)])
    index.upsert([{
        'id': str(i),
        'values': embedding,
        'metadata': {'text': doc.page_content}  # Include document text as metadata
    }])


print("Documents inserted into vector DB")


Documents inserted into vector DB


In [73]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
import os


# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Correct model initialization
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
chain = load_qa_chain(llm, chain_type="stuff")

def retrieve_query(query,k=2):
    matching_results = []
    query_embedding = get_embedding(query)

    query_results = index.query(
    vector=query_embedding,
    top_k=k,
    include_metadata=True  # Include metadata in the results to retrieve the text
    )

    print("query_results=",query_results)

    # Loop through matches to extract text and metadata
    for match in query_results['matches']:
        # Ensure 'metadata' contains the expected fields
        page_content = match['metadata'].get('text', "No content found")
        metadata = match['metadata']

        # Create a structured result with text and metadata
        result = {
            "page_content": page_content,
            "metadata": metadata  # All other metadata fields can be accessed
        }

        # Append each match to the list
        matching_results.append(result)

    print("matching_results=",matching_results)
    return matching_results

## Search answers from VectorDB
def retrieve_answers(query):

    matching_results = retrieve_query(query)
    # Convert to Document format
    input_docs = [
        Document(page_content=doc['page_content'], metadata=doc['metadata'])
        for doc in matching_results
    ]

    # Run the chain
    response = chain.run(input_documents=input_docs, question=query)
    return response

# Example query
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)



query_results= {'matches': [{'id': '10',
              'metadata': {'text': '7 \n'
                                   ' \n'
                                   ' \n'
                                   ' farmers in contributing to the health of '
                                   'fellow citizens by growing these \n'
                                   '‘Shree Anna’.   \n'
                                   "22. Now to make India a global hub for ' "
                                   "Shree Anna' , the Indian Institute \n"
                                   'of Millet Research, Hyderabad  will be '
                                   'supported as the Centre of Excellence \n'
                                   'for sharing best practices, research and '
                                   'technologies at the international \n'
                                   'level.    \n'
                                   'Agriculture Credit  \n'
                                   '23. The agricultur