In [23]:
# Importing Libraries
from tqdm.autonotebook import tqdm
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.google_palm import GooglePalmEmbeddings
from langchain.prompts import PromptTemplate

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI


from langchain_community.vectorstores import Pinecone as PineconeStore
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import GooglePalm
from langchain.schema import Document  # Import the Document class

import pinecone
import os
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# GOOGLE_API_KEY = os.getenv("google_api_key")

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 

In [None]:
# Read the source documents from disk
def read_doc(directory):
    """Load the PDF documents found in ``directory``.

    Args:
        directory: Path handed straight to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns
        (presumably a list of LangChain ``Document`` objects — confirm).
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Load everything under ./documents (path is relative to the notebook's working directory)
doc = read_doc("./documents")
len(doc)  # number of loaded documents

In [None]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into smaller chunks for embedding.

    Args:
        docs: Documents to split; passed to ``split_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [None]:
# Split the loaded documents with chunk_data's defaults (chunk_size=800, chunk_overlap=50)
documents= chunk_data(docs=doc)
len(documents)  # number of chunks produced

In [43]:
# Embedding model used for both the document chunks and the user queries.
# No API key is passed here; presumably it is picked up from the environment — TODO confirm.
embeddings = GooglePalmEmbeddings()
# Alternative embedding model, left disabled:
#embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
embeddings  # display the configured embedding client

GooglePalmEmbeddings(client=<module 'google.generativeai' from 'c:\\Users\\Aman\\anaconda3\\envs\\budgetspeech\\lib\\site-packages\\google\\generativeai\\__init__.py'>, google_api_key=None, model_name='models/embedding-gecko-001', show_progress_bar=False)

In [None]:
# Smoke-test the embedding model on a short query
vectors = embeddings.embed_query("How are you?")
len(vectors) # Embedding length; this is the dimension needed when creating the vector database

In [2]:
# Pinecone client. No api_key is passed, so the client presumably reads it from the
# environment (e.g. a PINECONE_API_KEY loaded by load_dotenv) — TODO confirm.
pc = pinecone.Pinecone()

In [None]:
# Handle to the Pinecone index that stores the chunk embeddings
# (assumes an index with this name already exists — confirm)
index_name = "langchainvector"
index = pc.Index(index_name)
index  # display the index handle

In [None]:
# Fetch statistics for the vector index stored on Pinecone cloud
# (dimension, fullness, per-namespace and total vector counts)
index.describe_index_stats()

In [None]:
# Embed every chunk and store the vectors in the Pinecone index named `index_name`.
# The result is bound to `vector_store` rather than `index`, so the raw Pinecone index
# handle used by the query cells is not replaced by a LangChain vector-store wrapper.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm
# before re-running against a populated index.
vector_store = PineconeStore.from_documents(documents=documents, embedding=embeddings, index_name=index_name)

In [44]:
# Re-acquire the raw Pinecone index handle (the retrieval code calls index.query on it)
# and check how many vectors the index currently holds
index_name = "langchainvector"
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 141}},
 'total_vector_count': 141}

In [45]:
# Manual retrieval check: embed a sample question and fetch its top 2 matches with metadata
query_vector = embeddings.embed_query("how much the agriculture target will be increased by how much crore?")
search_result = index.query(vector = query_vector, top_k=2,include_metadata=True)

In [46]:
# Inspect the raw query embedding.
# NOTE(review): this displays the entire vector; `query_vector[:5]` or `len(query_vector)`
# would keep the cell output short.
query_vector

[0.006078688,
 0.0017665706,
 0.0003790817,
 0.03249741,
 0.103810534,
 -0.027183257,
 0.004922847,
 0.0004954607,
 -0.029062878,
 0.007545811,
 -0.04512708,
 -0.0049411417,
 0.02523555,
 -0.027901761,
 -0.011042778,
 0.01157099,
 0.00801514,
 -0.01690539,
 0.02643768,
 0.020566847,
 -0.04771639,
 0.01637682,
 0.0064143795,
 0.0055529024,
 -0.011392463,
 -0.068457745,
 0.020885037,
 0.005138944,
 -0.030387208,
 0.010428449,
 0.039354164,
 -0.0034410153,
 -0.041932,
 -0.03626201,
 -0.0048282607,
 0.03344457,
 -0.05373177,
 0.06313796,
 -0.015881725,
 0.025343807,
 0.010718729,
 -0.049377076,
 -0.00809193,
 0.012722447,
 -0.039871506,
 -0.047587525,
 -0.028588329,
 -0.0074825482,
 -0.02861813,
 -0.012538949,
 0.0061135306,
 -0.03682207,
 0.0019326686,
 0.0477439,
 0.043617263,
 0.0004087937,
 -0.01650837,
 0.034773648,
 -0.023658857,
 -0.044570766,
 0.02994892,
 -0.008498539,
 0.032886676,
 0.027244652,
 -0.06337474,
 0.042269737,
 -0.0131263,
 0.020280503,
 0.0075821504,
 0.022232706,
 

In [47]:
# Similarity search: retrieve the closest stored chunks from the Pinecone vector DB
# (the similarity metric is whatever the index was created with — cosine presumed)
def retrive_query(query, k=5):
    """Return the ``k`` best Pinecone matches for a text query.

    Args:
        query: Text to embed with the notebook-level ``embeddings`` object.
        k: Number of matches requested (passed as ``top_k``).

    Returns:
        The raw response of ``index.query`` with metadata included.
    """
    embedded_query = embeddings.embed_query(query)
    return index.query(vector=embedded_query, top_k=k, include_metadata=True)

In [51]:
def get_conversational_chain():
    """Build the question-answering chain used to answer from retrieved chunks.

    The chain is a "stuff" QA chain driven by a GooglePalm LLM (temperature 0.5)
    and a prompt with two variables, ``context`` and ``question``.
    A disabled alternative in the original cell was
    ``ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.7)``.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    qa_template = """
    You are a knowledgeable assistant. You have access to the following information:

    {context}

    Based on the above information, answer the following question:
    {question}

    Answer:
    """
    llm = GooglePalm(temperature=0.5)
    qa_prompt = PromptTemplate(
        template=qa_template,
        input_variables=["context", "question"],
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)

In [52]:
# Answer a question from the chunks retrieved out of the vector DB
def retrive_answer(query):
    """Retrieve matching chunks for ``query`` and let the QA chain answer from them.

    Args:
        query: The user's question.

    Returns:
        The chain's output dict (``return_only_outputs=True``) when at least one
        match carries chunk text, otherwise the string
        ``"No relevant documents found."``.
    """
    matches = retrive_query(query=query)['matches'] or []

    # Keep only the matches that actually carry the chunk text in their metadata.
    input_documents = [
        Document(page_content=match['metadata']['text'], metadata={'source': 'Pinecone'})
        for match in matches
        if 'metadata' in match and 'text' in match['metadata']
    ]

    # Check AFTER filtering: matches without usable text must not reach the LLM
    # as an empty context.
    if not input_documents:
        return "No relevant documents found."

    chain = get_conversational_chain()

    # The "stuff" chain fills the prompt's {context} from input_documents itself,
    # so no separately built context string is passed.
    return chain({"input_documents": input_documents, "question": query}, return_only_outputs=True)

In [53]:
# End-to-end check: retrieve matching chunks for a sample question and let the LLM answer from them
query = "Whose speech is this and who is it and what is this speech about?"
answer = retrive_answer (query)

In [54]:
# Show what retrive_answer returned (the chain's output dict, or its "no documents" message)
answer

{'output_text': 'This is the Budget Speech 2023-2024 presented by Nirmala Sitharaman, the Minister of Finance of India.'}