In [None]:
!pip install langchain -q
!pip install pdfminer.six -q
!pip install unstructured -q
!pip install pinecone-client -q
!pip install pdf2image -q
!pip install pytesseract -q
!pip install tiktoken -q
!pip install langchain-community
!pip install --quiet langchain-google-genai
!pip install langchain-pinecone

In [14]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Pinecone

In [3]:
from pinecone import Pinecone
import os
from google.colab import userdata
pc = Pinecone(api_key=userdata.get('pinecone_key'))
index = pc.Index("data")

# Load Documents

In [None]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader('q3-2324.txt', encoding='utf8')
data = loader.load()

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
text_splitter =  CharacterTextSplitter(separator = ".",chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

# Load Google Gemini

In [4]:
from langchain import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [5]:
os.environ['GOOGLE_API_KEY'] = userdata.get('gemini_key')

In [6]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",temperature=0.8)

In [7]:
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [8]:
from langchain_pinecone import PineconeVectorStore
vector_store = PineconeVectorStore(index=index, embedding=gemini_embeddings)

# Summary Generation

In [None]:
doc_prompt = PromptTemplate.from_template("{page_content}")
llm_prompt_template = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
llm_prompt = PromptTemplate.from_template(llm_prompt_template)
print(llm_prompt)

input_variables=['text'] template='Write a concise summary of the following:\n"{text}"\nCONCISE SUMMARY:'


In [None]:
stuff_chain = (

    {
        "text": lambda texts: "\n\n".join(
            format_document(doc, doc_prompt) for doc in texts
        )
    }
    | llm_prompt
    | llm
    | StrOutputParser()
)

In [None]:
summary=stuff_chain.invoke(texts)

In [None]:
summary

"## TCS Q3 FY2024 Earnings Call Summary:\n\n**Resilient Performance Amidst Macro Uncertainty**\n\nTCS reported strong Q3 FY2024 results despite global economic headwinds, highlighting the company's resilience and robust demand for its services. \n\n**Key Highlights:**\n\n* **Revenue Growth:** 4.0% YoY in Rupee terms, 1.7% in constant currency terms, and 2.9% in Dollar terms.\n* **Strong Deal Wins:** TCV of $8.1 billion, book-to-bill ratio of 1.1, and 11.3% YoY growth in trailing 12-month order book.\n* **Margin Expansion:** Operating margin at 25%, a sequential expansion of 75 basis points. Net income margin at 19.4%.\n* **Attrition Down:** LTM attrition in IT services at 13.3%, down 1.6% sequentially, within the company's comfort range.\n* **Dividend:** Board recommended an interim dividend of ₹27 per share, including a special dividend of ₹18 per share.\n* **Gen AI Focus:** TCS is investing heavily in Gen AI capabilities, building offerings, and integrating the technology into its pr

# Upload summary to pinecone

**Document IDs**

docid-1 = Transcript of the Q1 2024-25 Earnings Conference Call held on Jul 11, 2024

docid-2 = Transcript of the Q4 2023-24 Earnings Conference Call held on April 12, 2024

docid-3 = Transcript of the Q3 2023-24 Earnings Conference Call held on January 11, 2023

In [None]:
from langchain_core.documents import Document
document_1 = Document(page_content=summary, metadata={"source": "docid-3"})

In [None]:
vector_store.add_documents(documents=[document_1],namespace="summary_embeddings")

['be357e75-67a7-43e9-a20f-98b1e187b94f']

In [None]:
query = "TCS Q4 2024 Earnings"
vector_store.similarity_search(
    query,  # our search query
    k=3 , # return 3 most relevant docs
    namespace="summary_embeddings"
)

[Document(metadata={'source': 'docid-2'}, page_content='## Concise Summary of TCS Q4 FY24 Earnings Conference Call:\n\n**Key Highlights:**\n\n* **Strong Q4 performance:** TCS reported the strongest sequential revenue growth in many quarters, an all-time high TCV (Total Contract Value) of $13.2 billion, and an operating margin of 26% (highest in the last 12 quarters).\n* **Resilient FY24 growth:** Revenue grew 6.8% in rupee terms, 3.4% in constant currency terms, and 4.1% in dollar terms. Operating margin for the year was 24.6%.\n* **Strong deal momentum:**  Record Q4 TCV and double-digit growth in last 12 months TCV reflect deepening partnerships with clients.\n* **Focus on customer experience and cost optimization:** Clients are prioritizing projects with high and immediate ROI, leading to increased focus on cost optimization initiatives like vendor consolidation, cloud transformation, and AI enablement.\n* **Gen AI a key driver:**  TCS is actively pursuing Gen AI opportunities, inves

# Upload passages to pinecone

In [None]:
documents = []
for text in texts:
    documents.append(Document(page_content=text.page_content, metadata={"source": "docid-3"}))

In [None]:
vector_store.add_documents(documents=documents,namespace="passages_embeddings")

['4d51d979-afcb-44a6-9719-e06180847cbe',
 'f7ef89ab-adca-41ff-9b54-77c23458048d',
 '50e8257b-c1e7-44ed-bc8c-556c051cd37e',
 '0ac44b13-a570-4c52-ade4-b50c43a4ec04',
 '2f339a99-c682-4c82-9e0e-70ab252c0280',
 '843d3619-8051-4371-8cc1-432cc9ae9ac2',
 '76b61329-b47b-46a8-8c5c-84c3eac9e2d1',
 '44b2c57e-44e1-422e-8597-086f967b9eb0',
 'cc1c6880-9337-4ff9-8600-62d3566d8406',
 'e1c0f43c-8f85-4b80-ab96-8b29d22ee325',
 'd7c69c4b-d2cb-42f3-b379-2dad7dd69ab1',
 'ff0f3702-16d9-44f7-ae91-51dd4cdde8dc',
 'b1e12c70-e106-438a-8719-b8646becff74',
 'b4aee653-86b7-4f40-a1d6-eb0168def2ba',
 'f3ff801f-b785-4509-b6a4-d7ccde36312c',
 '529f7f02-4de8-42f7-af43-66aad8e03b01',
 '78ac967d-aa68-42ac-8be9-366ea261a605',
 '34d47bc5-f90b-4408-a4ba-3e3758e1a999',
 'b7f784c3-3302-4fb4-b560-84149ca4c19b',
 '46a6222a-37bd-423b-b512-3d94877b735c',
 'b4aa4857-92b6-402e-8df1-6c685f827d31',
 '88e32953-30bf-4c9d-85a3-f817d09e4c0d',
 '5faae227-9822-4876-a4aa-8bfacbcfa22b',
 'fcc24745-3793-4bba-8122-0056042b1315',
 'ce2f4566-6be6-

# User Query Classification

In [9]:
user_query = "TCS Q3 2024 Revenue"

In [None]:
llm_completion_select_route_chain = (
PromptTemplate.from_template("""
Given the user question below, classify whether its a general query requiring text generation tasks or a document retrieval query requiring a text extraction task for answering the user question.

General: If the question involves generating new text, such as creating, summarizing, or explaining content that isn't directly tied to specific documents.

Document: If the question requires extracting specific information from an existing document or dataset, indicating that the answer must be pulled directly from a particular source.

<question>
{question}
</question>

The response must contain only the classified category - General or Document.

<question>
{question}
</question>

Classification:""")
| llm
| StrOutputParser()
)
response=llm_completion_select_route_chain.invoke({"question": user_query})
print(response.strip())

# Document selection step using user query and summaries

In [None]:
matched_summaries=vector_store.similarity_search(
    user_query,  # our search query
    k=3 , # return 3 most relevant docs
    namespace="summary_embeddings"
)

In [None]:
summaries=[]
for passage in matched_summaries:
  summaries.append(passage.metadata["source"]+": "+passage.page_content)
summary_list=""
for i in range(0,len(summaries)):
    summary_list=summary_list+summaries[i]+"###########\n\n"
print(summary_list)

In [None]:
document_selection_prompt = f"""You are an AI assistant that can identify which documents have the probable answer to the user questions. You will be given passages each containing the summary of the of the document. Based on the user question you have to reply with a python list containing the document ids. The document ids will be like docid-1,docid-2,docid-3,docid-4 and docid-5. Only single document can have the answer or multiple documents can also have the answer. If none of the documents contain the answer reply [].
Return only the number strictly and nothing else.
Examples:
["docid-1"]
["docid-1","docid-2"]
["docid-1","docid-2","docid-3"]
["docid-2"]
[]

Document Summaries:

{summary_list}
"""

In [None]:
messages = [
    (
        "system",
       document_selection_prompt,
    ),
    ("human", user_query),
]
response = llm.invoke(messages)
print(response)

In [None]:
import json
selected_doc=json.loads(response.content.strip())
selected_doc

['docid-3']

# Document Selection Complete -> Moving to Passage selection from a basket of passages based on docid:

In [None]:
matched_passages=vector_store.similarity_search(
    user_query,  # our search query
    k=7 , # return 5 most relevant docs
    namespace="passages_embeddings",
    filter={"source":{"$in": selected_doc}}
)
matched_passages

[Document(metadata={'source': 'docid-3'}, page_content="Tata Consultancy Services Limited\nQ3 and Nine-month Ended 31st December 2023 Earnings Conference Call\nJanuary 11, 2024, 19:30 hrs IST (09:00 hrs US ET)\n\nModerator:\n\nLadies and gentlemen, good day, and welcome to the TCS Earnings\nConference Call. As a reminder, all participant lines will be in the listen-only\nmode, and there will be an opportunity for you to ask questions after the\npresentation concludes. Should you need assistance during the conference\ncall, please signal an operator by pressing star, then zero on your touchtone\nphone. Please note that this conference is being recorded. I now hand the\nconference over to Ms. Nehal Shah from the Investor Relations team at TCS.\nThank you, and over to you.\n\nNehal Shah:\n\nThank you, Operator. Good evening, and welcome, everyone. Thank you for\njoining us today to discuss TCS' financial results for the third quarter of fiscal\nyear 2024 that ended December 31, 2023"),
 D

In [None]:
paragraphs_list=""
for i in range(0,7):
    paragraphs_list+=matched_passages[i].page_content+"\n\n"

In [None]:
final_answer_generation_prompt="""You are an AI assistant that can provide helpful information to the user. You are given the following extracted parts of a long document and a question. Answer the question with the help of the supporting texts. Also mention the document to the user from which the answer was generated so that he can refer to it."""
user_prompt = f"""
Question: {user_query}
======
Supporting texts:
{paragraphs_list}

Reffered Document:
{str(selected_doc)}
======

Answer:
"""

In [None]:
messages = [
    (
        "system",
       final_answer_generation_prompt,
    ),
    ("human", user_prompt),
]
response = llm.invoke(messages)
print(response.content)

The revenue for TCS in Q3 2024 was ₹60,583 crores, which is a 4.0% growth year-on-year. In dollar terms, the revenue was $7.28 billion, a 2.9% growth year-on-year. 

This information was extracted from the "Tata Consultancy Services Earnings Conference Call" document. 

