In [19]:
import glob
import os
from pathlib import Path

import fitz  # PyMuPDF
import pandas as pd
from dotenv import load_dotenv
from groq import Groq
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

Note: the error raised by the import cell above is due to the environment using Python 3.14.

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Groq API key as found in the environment (None when the variable is unset).
GROQ_KEY = os.getenv("GROQ_API_KEY")

# Identifier of the chat model handed to ChatGroq further down.
model_name = "meta-llama/llama-4-scout-17b-16e-instruct"

In [40]:
# Groq SDK client used by the judge. No key is passed explicitly; the SDK is
# expected to pick up GROQ_API_KEY from the environment.
client = Groq()

# System prompt for the judge model: it must answer with one integer (0-100)
# stating how close model_ans is to real_ans.
judge_sys_prompt = " ".join(
    [
        "You are an unbiased judge.",
        "You will be given two answers: real_ans and model_ans.",
        "Return ONLY a single integer percentage (0-100)",
        "representing how semantically similar model_ans is to real_ans.",
    ]
)

def judge_llm(judge_prompt=None, *, real_ans=None, model_ans=None):
    """Ask the judge model how similar a model answer is to the reference.

    Two call styles are supported:

    * ``judge_llm(prompt)`` -- a ready-made user prompt.
    * ``judge_llm(real_ans=..., model_ans=...)`` -- the prompt is assembled
      here from the two answers.

    When ``judge_prompt`` is given it takes precedence and the keyword
    answers are ignored.

    Args:
        judge_prompt: Full user prompt containing both answers.
        real_ans: Reference answer (only used when ``judge_prompt`` is None).
        model_ans: Answer to be graded (only used when ``judge_prompt`` is None).

    Returns:
        The message object of the first completion choice; its ``.content``
        holds the judge's reply as text (not a float).

    Raises:
        ValueError: If neither a prompt nor both answers are supplied.
    """
    if judge_prompt is None:
        if real_ans is None or model_ans is None:
            raise ValueError(
                "judge_llm needs either judge_prompt or both real_ans and model_ans"
            )
        judge_prompt = (
            f"real_ans:\n{real_ans}\n\n"
            f"model_ans:\n{model_ans}\n\n"
            "Return only a number from 0 to 100."
        )

    completion = client.chat.completions.create(
        model="moonshotai/kimi-k2-instruct-0905",
        messages=[
            {"role": "system", "content": judge_sys_prompt},
            {"role": "user", "content": str(judge_prompt)},
        ],
        temperature=0,
        # The reply is only a number from 0 to 100, so a tiny budget suffices.
        max_completion_tokens=3,
        top_p=1,
        stream=False,
    )

    return completion.choices[0].message

In [32]:
real = "Python supports asynchronous programming using asyncio."
model = "Python allows async programming via the asyncio library."

# judge_llm takes one prompt string as its positional argument, so both
# answers are folded into a single prompt before the call.
demo_prompt = f"""
real_ans:
{real}

model_ans:
{model}

Return only a number from 0 to 100.
"""
score = judge_llm(demo_prompt)

# judge_llm returns a message object; the judge's reply is in .content.
print(score.content)  # e.g., "92"


100


In [4]:
# Folder holding the source PDFs, and the folder receiving one .txt per PDF.
PDF_DIR, OUT_DIR = Path("docs"), Path("extracted_docs")

# Create the output folder up front; an already existing folder is fine.
OUT_DIR.mkdir(exist_ok=True)


def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the plain text of a PDF, page by page.

    Pages whose text is empty or whitespace-only are skipped. Every kept page
    is prefixed with a ``--- Page N ---`` marker (N is 1-based) and the pages
    are joined with newlines.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The concatenated page texts, or an empty string when no page
        contains text.
    """
    pages = []

    # Open the document as a context manager so the file handle is released
    # even when text extraction fails on one of the pages.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text("text")
            if text.strip():
                pages.append(f"\n--- Page {page_number} ---\n{text}")

    return "\n".join(pages)


def process_pdfs(pdf_dir: Path, out_dir: Path):
    """Extract every ``*.pdf`` in ``pdf_dir`` into a ``.txt`` file in ``out_dir``.

    The output file is named after the PDF's stem and written as UTF-8.
    Only the top level of ``pdf_dir`` is scanned.

    Args:
        pdf_dir: Directory that contains the PDF files.
        out_dir: Directory receiving the text files; created (including
            missing parents) when it does not exist yet.
    """
    # Do not rely on the caller having created the output folder.
    out_dir.mkdir(parents=True, exist_ok=True)

    # sorted() makes the processing order independent of the filesystem.
    for pdf_file in sorted(pdf_dir.glob("*.pdf")):
        print(f"Extracting: {pdf_file.name}")
        text = extract_text_from_pdf(pdf_file)

        output_file = out_dir / f"{pdf_file.stem}.txt"
        output_file.write_text(text, encoding="utf-8")

# Convert every PDF found in PDF_DIR into a text file inside OUT_DIR.
process_pdfs(PDF_DIR, OUT_DIR)

Extracting: axis_investor_presentation_q2fy26.pdf
Extracting: Axis_investments_oct-dec-2025.pdf
Extracting: VisionDecember2025.pdf
Extracting: rtgs_bank_details.pdf


In [5]:
def make_documents(text: str, source: str):
    """Wrap one text in a single-element list of ``Document``.

    Args:
        text: Content stored as the document's ``page_content``.
        source: Value stored under the ``"source"`` metadata key.

    Returns:
        A list holding exactly one ``Document``.
    """
    document = Document(page_content=text, metadata={"source": source})
    return [document]


def chunk_documents(docs, chunk_size: int = 2000, chunk_overlap: int = 200):
    """Split documents into overlapping chunks for embedding.

    The splitter tries the separators in order: blank line, newline, full
    stop, space, and finally the empty string.

    Args:
        docs: Documents to split.
        chunk_size: Chunk size handed to the splitter. Defaults to 2000, the
            value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 200, the value previously hard-coded here.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return splitter.split_documents(docs)


def get_embeddings():
    """Create the embedding model used for indexing and for scoring answers.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    return embedding_model


def build_vectorstore(chunks, embeddings):
    """Embed ``chunks`` and store them in the persisted ``docs_rag`` collection.

    Each chunk gets a deterministic id of the form ``<source>::<n>``, where
    ``n`` counts the chunks of that source in the order received. Without
    explicit ids every run adds the same chunks again under fresh random
    ids, so re-running the notebook fills ``chroma_db`` with duplicates that
    crowd the top-k retrieval results. With stable ids a re-run is expected
    to overwrite the existing entries instead.

    NOTE(review): a collection already filled by earlier runs keeps its old
    randomly-keyed entries; delete ``chroma_db`` once to get a clean index.
    If a source later yields fewer chunks, its old trailing ids remain.

    Args:
        chunks: Documents to index; the ``"source"`` metadata key is used for
            the id when present.
        embeddings: Embedding model passed to Chroma.

    Returns:
        The Chroma vector store created by ``Chroma.from_documents``.
    """
    chunks_seen_per_source = {}
    chunk_ids = []
    for chunk in chunks:
        source = chunk.metadata.get("source", "unknown")
        position = chunks_seen_per_source.get(source, 0)
        chunks_seen_per_source[source] = position + 1
        chunk_ids.append(f"{source}::{position}")

    return Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory="chroma_db",
        collection_name="docs_rag",
    )


def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents in input order; empty for no documents.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [6]:
# this takes time!

all_docs = []

# The extracted files are written as UTF-8, so read them back as UTF-8
# explicitly instead of relying on the platform default encoding.
# sorted() keeps the document order stable across runs and machines.
for txt_path in sorted(glob.glob("extracted_docs/*.txt")):
    with open(txt_path, "r", encoding="utf-8") as txt:
        docs = make_documents(txt.read(), txt_path)
    all_docs.extend(docs)

chunks = chunk_documents(all_docs)
embeddings = get_embeddings()
vectorstore = build_vectorstore(chunks, embeddings)

In [7]:
# Re-open the "docs_rag" collection persisted under chroma_db, using the same
# embedding model that was used to build it.
vectorstore = Chroma(
    collection_name="docs_rag",
    embedding_function=embeddings,
    persist_directory="chroma_db",
)

# Plain similarity search, asking for 6 chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 6},
    search_type="similarity",
)

  vectorstore = Chroma(


In [8]:
# Chat model that answers the RAG questions; temperature is pinned to 0.0.
llm = ChatGroq(temperature=0.0, model=model_name)

In [9]:
# Prompt template with two placeholders, {context} and {question}. It tells
# the model to answer from the context only and to say "I don't know"
# otherwise.
rag_template = """
You are a factual assistant.
Answer ONLY using the provided context.
If the answer is not in the context, say "I don't know".

Context:
{context}

Question:
{question}
"""
prompt = ChatPromptTemplate.from_template(rag_template)


In [10]:
# Pipeline: the incoming question is passed through unchanged and is also
# sent to the retriever, whose documents are flattened into the context text.
# Both values fill the prompt, which is then sent to the chat model.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | llm

rag_question = "tell me about Bank of Tokyo Mitsubishi Ltd."

# rag_question is already a str, so it is handed to the chain as is.
response = rag_chain.invoke(rag_question)
print(f"\n{rag_question}")
print("RAG reply: ", response.content)


tell me about Bank of Tokyo Mitsubishi Ltd.
RAG reply:  I don't know.


In [11]:
# Show the raw documents the retriever returns for the same question, to
# check what context the chain was given.
retriever.invoke(rag_question)

[Document(metadata={'source': 'extracted_docs/rtgs_bank_details.txt'}, page_content='Mumbai 400 021.\nLaxmi Vilas Bank\nLAVB0000883\nInvestment Cell\n25-31 ABAN HOUSE, 4th floor, ROPEWALK\nSTREET, RAMPARTROW, P.B.NO.783, FORT,\nMUMBAI-400001\nLord Krishna Bank\nLKBL000AOFF\nKochi\nRegd and Admn Office, Second Floor, Indian Express\nBuildings, Kaloor, Kochi - Kerala 682017\nMaharashtra State Co-operative\nBank Ltd.\nMSCI0082002\nMumbai\nThe Maharashtra State Co-op Bank Ltd., 120 Shahid\nBhagat Singh Marg, Fort, Mumbai - 400 001\nMANDAVI CO-OPERATIVE BANK MCBL0CARNAC\nCARNAC BUNDER\n1st Floor, Vyapar Bhavan ,49, P.D\'Mello Road,Carnac\nBunder, Mumbai - 400 009.\nMASHREQ BANK\nMSHQ0000001\nMUMBAI\n602/603, Tulsiani Chambers, Nariman Point, Mumbai\n- 400 021\nMizuho Corporate Bank Ltd.\nMHCB0000532\nMumbai\n Maker Chamber III, 1st Floor, Jamnalal Bajaj  Road,\nNariman Point, Mumbai 400 021\nNEW INDIA CO-OP BANK LTD\nNICB0000001\nFort\n241/243 P D\'MELLO Road, Fort, Mumbai 400 001\nNorth Ka

In [12]:
# Evaluation set read from qna.csv; the questions are later taken from its
# 'qn' column. pandas is imported in the notebook's top import cell.
df = pd.read_csv('qna.csv')
df.head()

Unnamed: 0,slno,qn,real_ans
0,1,what is the IFSC code and address of Allahabad...,"IFSC Code: ALLA0888888, Address: Allahabad Ba..."
1,2,what is the branch name of UTI Bank,Branch name is not mentioned
2,3,tell me about Bank of Tokyo Mitsubishi Ltd.,"IFSC: BOTM0003611, Branch Name: Mumbai, Addres..."
3,4,tell the name of bank having branch in 'Star H...,Bank of India
4,5,name the bank having IFSC code PUNB0244200 and...,No bank has IFSC code PUNB0244200 and branch n...


In [None]:
# Run every question through the RAG chain, one call per row, and keep the
# reply text as the new 'model_ans' column.
model_answers = []
for qn in df['qn']:
    reply = rag_chain.invoke(str(qn))
    model_answers.append(reply.content)
df['model_ans'] = model_answers

In [13]:
# Write the frame to ans.csv without the index column.
df.to_csv("ans.csv", index=False)

In [None]:
# Load ans.csv back into a frame so the scoring cells below read from the
# file on disk.
with_ans_df = pd.read_csv('ans.csv')
with_ans_df

Unnamed: 0,slno,qn,real_ans,model_ans
0,1,what is the IFSC code and address of Allahabad...,"IFSC Code: ALLA0888888, Address: Allahabad Ba...",The IFSC code of Allahabad Bank is ALLA0888888...
1,2,what is the branch name of UTI Bank,Branch name is not mentioned,"Maker Towers ' F' 11th Floor, Cuffe Parade, M..."
2,3,tell me about Bank of Tokyo Mitsubishi Ltd.,"IFSC: BOTM0003611, Branch Name: Mumbai, Addres...",I don't know.
3,4,tell the name of bank having branch in 'Star H...,Bank of India,I don't know.
4,5,name the bank having IFSC code PUNB0244200 and...,No bank has IFSC code PUNB0244200 and branch n...,I don't know.
5,6,Which bank operates in the Information Technol...,IDBI Bank,I don't know.
6,7,Is Axis Bank listed in RTGS document?,No,I don't know.
7,8,Which all Global ESG Benchmarks were taken for...,"S&P ESG Score, FTSE4Good Index , MSCI ESG Rati...",The Global ESG Benchmarks that were taken for ...
8,9,Name the subsidiaries of axis bank?,"Axis Mutual Fund, Axis Capital, Axis Finance, ...",The subsidiaries of Axis Bank mentioned in the...
9,10,which bank won the digital cx awards 2024?,"Axis Bank (expected ans is I don't know, as th...",I don't know.


In [14]:
# Look at the untruncated reference answer of the row labelled 0.
with_ans_df.loc[0, 'real_ans']

' IFSC Code: ALLA0888888, Address: Allahabad Bank, 3rd Floor, Allahabad Bank Building,\n37, Mumbai Samachar Marg, Fort ,Mumbai -23'

In [None]:
ans_similarities, judge_scores = [], []

for real_ans, model_ans in zip(with_ans_df['real_ans'], with_ans_df['model_ans']):

    # Score 1: cosine similarity between the embeddings of the two answers,
    # expressed as a percentage.
    real_ans_embedding = embeddings.embed_query(real_ans)
    model_ans_embeddings = embeddings.embed_query(model_ans)
    sim = cosine_similarity([real_ans_embedding], [model_ans_embeddings])[0][0]
    # Scale first and round last: rounding before multiplying by 100 leaves
    # float artifacts such as 13.200000000000001.
    ans_similarities.append(round(float(sim) * 100, 2))

    # Score 2: the judge model's own 0-100 similarity rating.
    judge_prompt = f"""
    real_ans:
    {real_ans}

    model_ans:
    {model_ans}

    Return only a number from 0 to 100.
    """
    message = judge_llm(judge_prompt)
    # Store the rating as a number, not as raw reply text. Surrounding
    # whitespace and a trailing '%' are tolerated; anything else becomes
    # None so a malformed reply is visible instead of silently kept.
    raw_score = (message.content or "").strip().rstrip("%").strip()
    judge_scores.append(int(raw_score) if raw_score.isdecimal() else None)

[91.02,
 49.2,
 6.67,
 13.200000000000001,
 7.33,
 17.27,
 19.84,
 65.9,
 66.62,
 9.28,
 55.15,
 45.94,
 13.52,
 9.86,
 99.33999999999999,
 92.27,
 21.46,
 3.93,
 62.31,
 36.65,
 36.24,
 83.77,
 80.19,
 68.55,
 100.0]

In [None]:
# Attach both score lists to the answers frame as new columns, then display it.
for column_name, column_values in (
    ("ans_cos_sim", ans_similarities),
    ("judge_scores", judge_scores),
):
    with_ans_df[column_name] = column_values

with_ans_df

Unnamed: 0,slno,qn,real_ans,model_ans,ans_cos_sim
0,1,what is the IFSC code and address of Allahabad...,"IFSC Code: ALLA0888888, Address: Allahabad Ba...",The IFSC code of Allahabad Bank is ALLA0888888...,91.02
1,2,what is the branch name of UTI Bank,Branch name is not mentioned,"Maker Towers ' F' 11th Floor, Cuffe Parade, M...",49.2
2,3,tell me about Bank of Tokyo Mitsubishi Ltd.,"IFSC: BOTM0003611, Branch Name: Mumbai, Addres...",I don't know.,6.67
3,4,tell the name of bank having branch in 'Star H...,Bank of India,I don't know.,13.2
4,5,name the bank having IFSC code PUNB0244200 and...,No bank has IFSC code PUNB0244200 and branch n...,I don't know.,7.33
5,6,Which bank operates in the Information Technol...,IDBI Bank,I don't know.,17.27
6,7,Is Axis Bank listed in RTGS document?,No,I don't know.,19.84
7,8,Which all Global ESG Benchmarks were taken for...,"S&P ESG Score, FTSE4Good Index , MSCI ESG Rati...",The Global ESG Benchmarks that were taken for ...,65.9
8,9,Name the subsidiaries of axis bank?,"Axis Mutual Fund, Axis Capital, Axis Finance, ...",The subsidiaries of Axis Bank mentioned in the...,66.62
9,10,which bank won the digital cx awards 2024?,"Axis Bank (expected ans is I don't know, as th...",I don't know.,9.28


In [50]:
# Write the scored frame to ans_with_machine_scores.csv without the index column.
with_ans_df.to_csv("ans_with_machine_scores.csv", index=False)