In [None]:
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.output_parsers import StrOutputParser
from uuid import uuid4

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import GradientLLM
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_community.embeddings import HuggingFaceEmbeddings


In [None]:
import os
from getpass import getpass

# Never commit credentials to the notebook: take the key from the environment,
# and only prompt (without echoing) when it has not been exported already.
# NOTE: any key that was previously hard-coded here should be revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter GOOGLE_API_KEY: ")

# Hand the same key to the google.generativeai client.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# Callback manager wrapping a single StreamingStdOutCallbackHandler.
# NOTE(review): `callback_manager` is not referenced again in the visible cells.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder


In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder loaded from the local ../bge-reranker-base directory; used by
# rerank_docs() to score (query, passage) pairs.
# max_length=512 is presumably the truncation limit for inputs — TODO confirm.
reranker_model = CrossEncoder(model_name="../bge-reranker-base", max_length=512)

In [None]:
def rerank_docs(query, retrieved_docs):
    """Order retrieved documents by their cross-encoder score against ``query``.

    Args:
        query: The search string every document is scored against.
        retrieved_docs: Iterable of objects exposing a ``page_content``
            attribute. May be empty or a one-shot iterator.

    Returns:
        A list of ``(document, score)`` tuples sorted by score, highest
        first. Empty input yields an empty list.
    """
    # Materialize once so generators/iterators can be both scored and zipped.
    docs = list(retrieved_docs)
    if not docs:
        # Nothing to score: skip the model call instead of handing it an
        # empty batch.
        return []

    query_and_docs = [(query, d.page_content) for d in docs]
    scores = reranker_model.predict(query_and_docs)
    return sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

In [None]:
# Gemini chat model shared by the chains in this notebook.
# NOTE(review): confirm that `stream=True` is a keyword this client version
# actually honours; it is passed through unchanged here.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.3,
    stream=True,
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [None]:
# Alternative embedding backend, kept for reference:
# embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
# Active choice: HuggingFace embedding model loaded from the relative path below.
embeddings = HuggingFaceEmbeddings(model_name = "../all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name = "../all-MiniLM-L6-v2")


In [None]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt for the answer-generation step. `{context}` receives the retrieved
# report excerpts and `{question}` the user's question.
template = """You are a financial expert with access to the annual report of the company.
When answering questions about the company's financial performance, prioritize information from the Financial Statements section. Considering the user's question, provide clear and concise answers from given context.
{context}

Question: {question}
Answer:
"""
# template = """You are a financial expert with access to the annual report of the company.
# When answering questions about the company's financial performance, prioritize information from the Financial Statements section.

# {context}

# **Question:** {question}

# **Answer:**

# If you can find relevant information in the annual report, please provide a clear and concise answer based on the facts.
# If you're unable to find an answer in the report, you can respond by saying:

# * "I couldn't find information related to '{question}' in the annual report."
# * "The annual report doesn't provide sufficient details to answer this question definitively."

# """


# Build the chat prompt from `template`; its placeholders are {context} and {question}.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import fitz
from langchain_core.documents import Document

# Splitter applied to the text of each PDF page before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    # Chunks of up to 2000 units with an overlap of up to 200 between
    # neighbours; length_function=len means the unit is characters.
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
)


In [None]:
# Extract the report page by page, flatten newlines to spaces, split each
# page's text into chunks, and wrap every chunk in a Document tagged with its
# 1-based page number.
pages = []
# The context manager closes the PDF handle once extraction is finished.
with fitz.open("../Reports/jio.pdf") as doc:
    for page_no in range(doc.page_count):
        text = doc[page_no].get_text().replace("\n", " ")
        for chunk in text_splitter.split_text(text=text):
            pages.append(
                Document(page_content=chunk, metadata={"page": page_no + 1})
            )

In [None]:
# Embed every chunk in `pages` with `embeddings` and build the FAISS index from them.
VectorStore = FAISS.from_documents(pages, embedding=embeddings)

In [None]:
# Base retriever over the FAISS index; "k": 10 asks for 10 candidates per query,
# which the reranking retrievers built on top of it then narrow down.
retriever = VectorStore.as_retriever(
    search_kwargs={"k": 10}
)

In [None]:

# Reranking retriever #1: a cross-encoder loaded from ../bge-reranker-base
# reranks the base retriever's candidates, configured with top_n=4.
model = HuggingFaceCrossEncoder(model_name="../bge-reranker-base")
Bcompressor = CrossEncoderReranker(model=model, top_n=4)
Bge_compression_retriever = ContextualCompressionRetriever(
    base_compressor=Bcompressor, base_retriever=retriever
)


In [None]:

# Reranking retriever #2: FlashrankRerank (top_n=4) over the same base
# retriever. This is the retriever wired into `chain`.
Fcompressor = FlashrankRerank(top_n=4)
Flash_compression_retriever = ContextualCompressionRetriever(
    base_compressor=Fcompressor, base_retriever=retriever
)

In [None]:
from langchain.chains import ConversationalRetrievalChain
# Conversational retrieval chain: documents come from the Flashrank reranking
# retriever, answers are written with the custom `prompt`, and the supporting
# source documents are returned alongside the answer.
chain = ConversationalRetrievalChain.from_llm(
    llm,
    Flash_compression_retriever,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": prompt},
)

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain


In [None]:
# Retrieval-free pipeline: fill the prompt, call the LLM, parse the reply to a string.
# Because `prompt` has {context} and {question} placeholders, the caller supplies both.
expression_language_chain = prompt | llm | StrOutputParser()

In [None]:
# Conversation history passed to the chain under "chat_history"; starts empty.
chat_history = []

In [None]:
# First test question put to the chain.
query="Give information about company chairman?"

In [None]:
# Run the conversational retrieval chain on the current query. Later cells
# read `result["source_documents"]` and display `result` as a dict with
# "question", "chat_history", "answer" and "source_documents", so `result`
# must be the chain's output rather than a bare LLM reply.
result = chain.invoke({"question": query, "chat_history": chat_history})

In [None]:
# Restrict the chain's output to its "answer" key and stream it.
stream_Chain = chain.pick("answer")
for chunk in stream_Chain.stream({"question": query, "chat_history": chat_history}):
    # "|" marks chunk boundaries; end="" keeps all chunks on one line.
    print(f"{chunk}|", end="", flush=True)

Akash M. Ambani is the Chairman of the company.|

In [None]:
# Show the documents the chain returned as support for its answer.
result["source_documents"]

[Document(metadata={'id': 9, 'relevance_score': 0.9983312, 'page': 33}, page_content='Reliance Jio Infocomm Limited 32 Corporate Governance Report Mr. Ranjit V. Pandit Independent  Director DIN: 00782296 Citizen of USA Profile: Mr. Ranjit V. Pandit holds an M.B.A. degree from the Wharton School at the University of  Pennsylvania and a B.E. degree in Electrical Engineering from VJTI, University of Bombay, India.  Mr. Pandit served as a Managing Director at General Atlantic, LLC between September 2007  and December 2012 and headed the India office where he served as head of the firm’s growth  investment activities across India. He served as an Advisory Director of General Atlantic LLC  in 2013. Prior to General Atlantic he served as Managing Director and Chairman of McKinsey  & Company in India. Mr. Pandit joined McKinsey & Co. in August 1980 and as a Co-Founder of  McKinsey’s India office was transferred from New York to Mumbai in January 1993. As Managing  Director - India for McKinsey

In [None]:
# Show the full chain output (question, chat history, answer, source documents).
result

{'question': 'Give information about company chairman and it holding in company',
 'chat_history': [],
 'answer': 'Mr. Akash M. Ambani is the Chairman of Reliance Jio Infocomm Limited. The provided document does not mention his shareholding in the company.',
 'source_documents': [Document(metadata={'id': 9, 'relevance_score': 0.9983312, 'page': 33}, page_content='Reliance Jio Infocomm Limited 32 Corporate Governance Report Mr. Ranjit V. Pandit Independent  Director DIN: 00782296 Citizen of USA Profile: Mr. Ranjit V. Pandit holds an M.B.A. degree from the Wharton School at the University of  Pennsylvania and a B.E. degree in Electrical Engineering from VJTI, University of Bombay, India.  Mr. Pandit served as a Managing Director at General Atlantic, LLC between September 2007  and December 2012 and headed the India office where he served as head of the firm’s growth  investment activities across India. He served as an Advisory Director of General Atlantic LLC  in 2013. Prior to General A

In [None]:
# Broader follow-up question; rebinds `query` for the retriever comparisons below.
query="Give information about company chairman and it holding in company"

In [None]:
# Documents returned by the Flashrank reranking retriever for `query`.
Flash_compression_retriever.invoke(query)

[Document(metadata={'id': 9, 'relevance_score': 0.9983312, 'page': 33}, page_content='Reliance Jio Infocomm Limited 32 Corporate Governance Report Mr. Ranjit V. Pandit Independent  Director DIN: 00782296 Citizen of USA Profile: Mr. Ranjit V. Pandit holds an M.B.A. degree from the Wharton School at the University of  Pennsylvania and a B.E. degree in Electrical Engineering from VJTI, University of Bombay, India.  Mr. Pandit served as a Managing Director at General Atlantic, LLC between September 2007  and December 2012 and headed the India office where he served as head of the firm’s growth  investment activities across India. He served as an Advisory Director of General Atlantic LLC  in 2013. Prior to General Atlantic he served as Managing Director and Chairman of McKinsey  & Company in India. Mr. Pandit joined McKinsey & Co. in August 1980 and as a Co-Founder of  McKinsey’s India office was transferred from New York to Mumbai in January 1993. As Managing  Director - India for McKinsey

In [None]:
# Same query through the BGE cross-encoder reranking retriever, for comparison.
Bge_compression_retriever.invoke(query)

Batches: 100%|██████████| 1/1 [00:36<00:00, 36.11s/it]


[Document(metadata={'page': 50}, page_content='Annual Report 2022-23 49 Corporate Governance Report Disclosures in relation to the Sexual Harassment of Women at Workplace (Prevention, Prohibition and Redressal)  Act, 2013  The Company is committed to provide a work environment which ensures that every employee is treated with dignity,  respect and afforded equal treatment. There were no cases/ complaints filed during the year, under the Sexual Harassment  of Women at Workplace (Prevention, Prohibition and Redressal) Act, 2013 (“under the said Act”). Further, the Company has constituted Internal Committee as required under the said Act. Details of loans and advances in the nature of loans to firms/companies in which directors are interested  The Company has not given any loans or advances to any firm / company in which its directors are interested. Details of material subsidiaries of the listed entity; including the date and place of incorporation and the name and  date of appointment o

In [None]:
# Raw top-20 similarity search straight from the FAISS index (no reranking).
# NOTE(review): this rebinds `doc`, which earlier held the opened PDF.
doc=VectorStore.similarity_search(query,k=20)

In [None]:
# Print the page number stored in each hit's metadata, in result order.
for page in doc:
    print(page.metadata["page"])

In [None]:
import uuid
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The storage layer for the parent documents
store = InMemoryByteStore()
# Metadata key under which each child chunk records its parent's id
id_key = "doc_id"

# The retriever (empty to start)
# NOTE(review): this rebinds `retriever`, which earlier named the plain FAISS
# retriever, and it reuses the FAISS index already built from `pages` —
# confirm that sharing one index between parent and child chunks is intended.
retriever = MultiVectorRetriever(
    vectorstore=VectorStore,
    byte_store=store,
    id_key=id_key,
)

# One random UUID string per parent chunk, positionally aligned with `pages`
doc_ids = [str(uuid.uuid4()) for _ in pages]

In [None]:
# The splitter to use to create smaller chunks
# Splitter that carves each parent chunk into smaller child chunks.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Flat list of child chunks, each stamped with its parent's id under `id_key`.
sub_docs = []
for i, doc in enumerate(pages):
    _id = doc_ids[i]
    for _doc in child_text_splitter.split_documents([doc]):
        _doc.metadata[id_key] = _id
        sub_docs.append(_doc)

In [None]:
# Add the child chunks to the retriever's vector store, then store each parent
# chunk in the docstore under its generated id.
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, pages)))

In [None]:
# Top-10 similarity hits from the retriever's vector store for `query`.
# The trailing [0:] slice selects every element, i.e. the whole result list.
retriever.vectorstore.similarity_search(query, k=10)[0:]


[Document(metadata={'page': 113, 'doc_id': '89e2b91f-b814-4da0-b629-2ecc03c8623f'}, page_content='Personnel  of the Ultimate Holding  Company are able to  exercise significant  influence  0   0  Sir HN Hospital Trust Enterprise over which  Key Managerial Personnel  of the Ultimate Holding  Company are able to  exercise significant  influence  1   1  5 Other Income Reliance Industries Limited Ultimate Holding Company  0   -  Jio Platforms Limited Holding Company  0   -  Cover Story Clothing'),
 Document(metadata={'page': 172, 'doc_id': 'fd5d7614-7347-4a9b-a08a-6b923070a5fb'}, page_content='Personnel  of the Ultimate Holding  Company are able to  exercise significant  influence  0   0  Sir HN Hospital Trust Enterprise over which  Key Managerial Personnel  of the Ultimate Holding  Company are able to  exercise significant  influence  1   1  5 Other Income Reliance Industries Limited  Ultimate Holding Company  0   -  Jio Platforms Limited  Parent Company  0   -'),
 Document(metadata={'page