In [1]:
import pandas as pd
df = pd.read_csv('/home/bipin/Documents/projects/legal_assistant/data/bns_testset.csv')
df.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is BNSS Classification in relation to fal...,['BNS Section 233 BNSS Classification The same...,BNSS Classification refers to the classificati...,single_hop_specifc_query_synthesizer
1,What is the law regarding theft on property in...,['BNS Section 307 Illustrations: A commits the...,BNS Section 307 Illustrations states that if A...,single_hop_specifc_query_synthesizer
2,What BNS means in law?,['BNS Section 168 BNSS Classification Imprison...,BNS Section 168 BNSS Classification Imprisonme...,single_hop_specifc_query_synthesizer
3,What is BNSS Classification in BNS Section 146?,['BNS Section 146 BNSS Classification Imprison...,BNS Section 146 BNSS Classification Imprisonme...,single_hop_specifc_query_synthesizer
4,What is the amount of fine specified in the BN...,['BNS Section 276 BNSS Classification Imprison...,"The fine specified is 5,000 rupees.",single_hop_specifc_query_synthesizer


In [21]:
len(df)

30

In [2]:
import os
import re
import pandas as pd

from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, FieldCondition, MatchAny

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
from langchain_core.prompts import ChatPromptTemplate

from ragas import EvaluationDataset

# 1. Load env and testset
load_dotenv()
df = pd.read_csv('/home/bipin/Documents/projects/legal_assistant/data/bns_testset.csv')

# 2. Init Qdrant, embeddings, LLM, vectorstore
client = QdrantClient(
    url=os.getenv("QDRANT_CLOUD_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
    prefer_grpc=True
)

dense_embedding  = OpenAIEmbeddings(model="text-embedding-3-large")
sparse_embedding = FastEmbedSparse(model_name="Qdrant/bm25")
llm              = ChatOpenAI(model="gpt-4.1-nano", temperature=0)

vectorstore = QdrantVectorStore(
    client=client,
    collection_name="bns_sections_hybrid",
    embedding=dense_embedding,
    sparse_embedding=sparse_embedding,
    retrieval_mode=RetrievalMode.HYBRID,
    vector_name="dense",
    sparse_vector_name="sparse",
)


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

arabic.txt:   0%|          | 0.00/6.35k [00:00<?, ?B/s]

danish.txt:   0%|          | 0.00/424 [00:00<?, ?B/s]

catalan.txt:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

basque.txt:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

bengali.txt:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

chinese.txt:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

azerbaijani.txt:   0%|          | 0.00/967 [00:00<?, ?B/s]

french.txt:   0%|          | 0.00/813 [00:00<?, ?B/s]

german.txt:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

hinglish.txt:   0%|          | 0.00/5.96k [00:00<?, ?B/s]

hebrew.txt:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

dutch.txt:   0%|          | 0.00/453 [00:00<?, ?B/s]

greek.txt:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

finnish.txt:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

norwegian.txt:   0%|          | 0.00/851 [00:00<?, ?B/s]

italian.txt:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

portuguese.txt:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

english.txt:   0%|          | 0.00/936 [00:00<?, ?B/s]

hungarian.txt:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

nepali.txt:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

kazakh.txt:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

indonesian.txt:   0%|          | 0.00/6.45k [00:00<?, ?B/s]

russian.txt:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

turkish.txt:   0%|          | 0.00/260 [00:00<?, ?B/s]

swedish.txt:   0%|          | 0.00/559 [00:00<?, ?B/s]

slovene.txt:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

romanian.txt:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

tajik.txt:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

spanish.txt:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

In [3]:
def extract_section_numbers(query: str):
    return [int(n) for n in re.findall(r"\b(\d+)\b", query)]

# reuse your same prompt template
TEMPLATE = """
You are a helpful legal assistant. Use the following context to answer the question.
If the answer cannot be found in the context, say so.

Context:
{context}

Question:
{question}

Answer:
"""
prompt_template = ChatPromptTemplate.from_template(TEMPLATE)

def retrieve(query: str):
    secs = extract_section_numbers(query)
    if secs:
        flt = Filter(
            must=[ FieldCondition(key="metadata.sections", match=MatchAny(any=secs)) ]
        )
        docs = vectorstore.similarity_search(query="", filter=flt)
    else:
        docs = vectorstore.similarity_search(query, k=5)
    return docs

def docs_to_text(docs):
    # either return a big string or a list of page_contents
    return [d.page_content for d in docs]

In [4]:
dataset = []
for _, row in df.iterrows():
    query = row['user_input']
    ref_ctxs = row['reference_contexts']   # already in your CSV
    reference = row['reference']

    # 4.1 retrieve + call LLM
    docs = retrieve(query)
    retrieved_ctxs = docs_to_text(docs)

    resp = (prompt_template | llm).invoke({
        "context": "\n\n".join(retrieved_ctxs),
        "question": query
    })

    # 4.2 append
    dataset.append({
        "user_input":         query,
        "reference_contexts": ref_ctxs,
        "retrieved_contexts": retrieved_ctxs,
        "response":           resp.content,
        "reference":          reference
    })

# 5. Create RAG evaluation dataset
evaluation_dataset = EvaluationDataset.from_list(dataset)

df_eval = evaluation_dataset.to_pandas()          # or: pd.DataFrame(evaluation_dataset)
output_path = '/home/bipin/Documents/projects/legal_assistant/data/evaluation_dataset.csv'
df_eval.to_csv(output_path, index=False)

print(f"Saved evaluation results to {output_path}")

# now you can pass `evaluation_dataset` to ragas.evaluate(...)
print(f"Built evaluation dataset with {len(dataset)} examples.")

ValidationError: 1 validation error for SingleTurnSample
reference_contexts
  Input should be a valid list [type=list_type, input_value="['BNS Section 233 BNSS C... evidence is triable.']", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type

In [5]:
evaluation_dataset 

NameError: name 'evaluation_dataset' is not defined

In [7]:
len(dataset)

30

In [8]:
dataset[0]

{'user_input': 'What is BNSS Classification in relation to false evidence?',
 'reference_contexts': "['BNS Section 233 BNSS Classification The same as for giving or fabricating false evidence. Non-cognizable According as offence of giving such evidence is bailable or nonbailable. Triable by Court by which offence of giving or fabricating false evidence is triable.']",
 'retrieved_contexts': ['BNS Section 233 BNSS Classification The same as for giving or fabricating false evidence. Non-cognizable According as offence of giving such evidence is bailable or nonbailable. Triable by Court by which offence of giving or fabricating false evidence is triable.',
  'BNS Section 235 BNSS Classification The same as for giving false evidence. Non-cognizable Bailable Triable by Court by which offence of giving false evidence is triable.',
  'BNS Section 234 BNSS Classification The same as for giving false evidence. Non-cognizable Bailable Triable by Court by which offence of giving false evidence is

In [9]:
dataset


[{'user_input': 'What is BNSS Classification in relation to false evidence?',
  'reference_contexts': "['BNS Section 233 BNSS Classification The same as for giving or fabricating false evidence. Non-cognizable According as offence of giving such evidence is bailable or nonbailable. Triable by Court by which offence of giving or fabricating false evidence is triable.']",
  'retrieved_contexts': ['BNS Section 233 BNSS Classification The same as for giving or fabricating false evidence. Non-cognizable According as offence of giving such evidence is bailable or nonbailable. Triable by Court by which offence of giving or fabricating false evidence is triable.',
   'BNS Section 235 BNSS Classification The same as for giving false evidence. Non-cognizable Bailable Triable by Court by which offence of giving false evidence is triable.',
   'BNS Section 234 BNSS Classification The same as for giving false evidence. Non-cognizable Bailable Triable by Court by which offence of giving false eviden

In [10]:
import ast
import json
import pandas as pd

# ─── 1. Fix any stringified reference_contexts ─────────────
for sample in dataset:
    if isinstance(sample["reference_contexts"], str):
        sample["reference_contexts"] = ast.literal_eval(sample["reference_contexts"])

# ─── 2. Build a DataFrame with exactly the columns you want ─
df2 = pd.DataFrame(dataset, columns=[
    "user_input",
    "reference_contexts",
    "retrieved_contexts",
    "response",
    "reference"
])

# ─── 3. Serialize both list‐columns as JSON strings ────────
df2["reference_contexts"]  = df2["reference_contexts"].apply(json.dumps)
df2["retrieved_contexts"] = df2["retrieved_contexts"].apply(json.dumps)

# ─── 4. Save to CSV ─────────────────────────────────────────
output_path = "/home/bipin/Documents/projects/legal_assistant/data/evaluation_dataset.csv"
df2.to_csv(output_path, index=False)

print(f"Saved {len(df)} rows with all fields to {output_path}")


Saved 30 rows with all fields to /home/bipin/Documents/projects/legal_assistant/data/evaluation_dataset.csv


In [11]:
df2.head()

Unnamed: 0,user_input,reference_contexts,retrieved_contexts,response,reference
0,What is BNSS Classification in relation to fal...,"[""BNS Section 233 BNSS Classification The same...","[""BNS Section 233 BNSS Classification The same...",The BNSS Classification in relation to false e...,BNSS Classification refers to the classificati...
1,What is the law regarding theft on property in...,"[""BNS Section 307 Illustrations: A commits the...","[""of depriving Z of the property as a security...",The law regarding theft on property in Z's pos...,BNS Section 307 Illustrations states that if A...
2,What BNS means in law?,"[""BNS Section 168 BNSS Classification Imprison...","[""BNS Section 198 Illustration: A, being an of...","Based on the provided context, ""BNS"" appears t...",BNS Section 168 BNSS Classification Imprisonme...
3,What is BNSS Classification in BNS Section 146?,"[""BNS Section 146 BNSS Classification Imprison...","[""BNS Section 146 BNSS Classification Imprison...",The context does not provide specific informat...,BNS Section 146 BNSS Classification Imprisonme...
4,What is the amount of fine specified in the BN...,"[""BNS Section 276 BNSS Classification Imprison...","[""BNS Section 276 BNSS Classification Imprison...","The amount of fine specified is 5,000 rupees.","The fine specified is 5,000 rupees."


In [12]:
df3 =pd.read_csv(output_path)
df3.head()

Unnamed: 0,user_input,reference_contexts,retrieved_contexts,response,reference
0,What is BNSS Classification in relation to fal...,"[""BNS Section 233 BNSS Classification The same...","[""BNS Section 233 BNSS Classification The same...",The BNSS Classification in relation to false e...,BNSS Classification refers to the classificati...
1,What is the law regarding theft on property in...,"[""BNS Section 307 Illustrations: A commits the...","[""of depriving Z of the property as a security...",The law regarding theft on property in Z's pos...,BNS Section 307 Illustrations states that if A...
2,What BNS means in law?,"[""BNS Section 168 BNSS Classification Imprison...","[""BNS Section 198 Illustration: A, being an of...","Based on the provided context, ""BNS"" appears t...",BNS Section 168 BNSS Classification Imprisonme...
3,What is BNSS Classification in BNS Section 146?,"[""BNS Section 146 BNSS Classification Imprison...","[""BNS Section 146 BNSS Classification Imprison...",The context does not provide specific informat...,BNS Section 146 BNSS Classification Imprisonme...
4,What is the amount of fine specified in the BN...,"[""BNS Section 276 BNSS Classification Imprison...","[""BNS Section 276 BNSS Classification Imprison...","The amount of fine specified is 5,000 rupees.","The fine specified is 5,000 rupees."


In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluation_dataset = dataset

            "user_input":query,
            "retrieved_contexts":relevant_docs,
            "response":response,
            "reference":reference
        }

evaluator_llm = LangchainLLMWrapper(llm)
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],llm=evaluator_llm)
result

In [16]:
dataset[0]

{'user_input': 'What is BNSS Classification in relation to false evidence?',
 'reference_contexts': ['BNS Section 233 BNSS Classification The same as for giving or fabricating false evidence. Non-cognizable According as offence of giving such evidence is bailable or nonbailable. Triable by Court by which offence of giving or fabricating false evidence is triable.'],
 'retrieved_contexts': ['BNS Section 233 BNSS Classification The same as for giving or fabricating false evidence. Non-cognizable According as offence of giving such evidence is bailable or nonbailable. Triable by Court by which offence of giving or fabricating false evidence is triable.',
  'BNS Section 235 BNSS Classification The same as for giving false evidence. Non-cognizable Bailable Triable by Court by which offence of giving false evidence is triable.',
  'BNS Section 234 BNSS Classification The same as for giving false evidence. Non-cognizable Bailable Triable by Court by which offence of giving false evidence is t

In [18]:
evaluated_dataset =[]

for i in range(0,30):
    evaluated_dataset.append(
        {
            "user_input":dataset[i]['user_input'],
            "retrieved_contexts":dataset[i]['retrieved_contexts'],
            "response":dataset[i]['response'],
            "reference":dataset[i]['reference']
        })

In [19]:
from ragas import EvaluationDataset
evaluation_dataset = EvaluationDataset.from_list(evaluated_dataset)

In [20]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper


evaluator_llm = LangchainLLMWrapper(llm)
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],llm=evaluator_llm)
result

Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]

{'context_recall': 0.9667, 'faithfulness': 0.8775, 'factual_correctness(mode=f1)': 0.6107}