<h1><b style="color: yellow;">Assignment 2</b></h1>

### <u>Imports 

In [20]:
from langchain_community.llms import Ollama
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
import json
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

### <u>Config

In [21]:
LLM = 'mistral'
file_location = 'corpus'
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
test_dataset_path = 'test_question.json'

### <u> Loading LLM , embedding , RAG (from assignment 1)

In [22]:
def load_llm(LLM):
    llm = Ollama(model=LLM)
    return llm
def load_embedding(embedding_model):
    embedding_function = SentenceTransformerEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )
    return embedding_function

def RAG(folder_path, embedding_function, llm, chunk_size, persist_directory):
    folder = Path(folder_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0
    )
    vector_store = Chroma(
        collection_name="Ambedkar",
        embedding_function=embedding_function,
        persist_directory=persist_directory
    )
    for file_path in folder.glob("*.txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        chunks = text_splitter.split_text(text)
        metadatas = [{"source": file_path.name}] * len(chunks)

        vector_store.add_texts(
            chunks,
            metadatas=metadatas
        )
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )

    return qa




# <u><b> Metrics 

In [23]:
def evaluate_rag_system(qa, test_dataset_path):

    with open(test_dataset_path, "r") as f:
        test_data = json.load(f)["test_questions"]

    embedding_function = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    results = []

    for item in test_data:
        qid = item["id"]
        question = item["question"]
        ground_truth = item["ground_truth"]
        correct_docs = item["source_documents"]

        rag_output = qa({"query": question})
        pred_answer = rag_output["result"]

        retrieved_docs = [d.metadata["source"] for d in rag_output["source_documents"]]
        retrieved_texts = [d.page_content for d in rag_output["source_documents"]]

        hit = 1 if any(doc in correct_docs for doc in retrieved_docs) else 0

        mrr = 0
        for rank, doc in enumerate(retrieved_docs, start=1):
            if doc in correct_docs:
                mrr = 1 / rank
                break

        K = len(retrieved_docs)
        correct_count = sum(doc in correct_docs for doc in retrieved_docs[:K])
        precision_k = correct_count / K if K > 0 else 0

        try:
            rougeL = rouge.score(pred_answer, ground_truth)['rougeL'].fmeasure
        except:
            rougeL = 0.0

        try:
            bleu = sentence_bleu([ground_truth.split()], pred_answer.split())
        except:
            bleu = 0.0

        try:
            emb_pred = embedding_function.encode([pred_answer])
            emb_gt = embedding_function.encode([ground_truth])
            cosine_sim = float(cosine_similarity(emb_pred, emb_gt)[0][0])
        except:
            cosine_sim = 0.0

        try:
            q_emb = embedding_function.encode([question])
            a_emb = embedding_function.encode([pred_answer])
            relevance_score = float(cosine_similarity(q_emb, a_emb)[0][0])
        except:
            relevance_score = 0.0

        try:
            scores = []
            for ctx in retrieved_texts:
                ctx_emb = embedding_function.encode([ctx])
                a_emb = embedding_function.encode([pred_answer])
                scores.append(float(cosine_similarity(ctx_emb, a_emb)[0][0]))
            faithfulness_score = max(scores) if scores else 0.0
        except:
            faithfulness_score = 0.0

        results.append({
            "id": qid,
            "question": question,
            "ground_truth": ground_truth,
            "prediction": pred_answer,
            "retrieved_docs": retrieved_docs,
            "hit_rate": hit,
            "mrr": mrr,
            "precision_k": precision_k,
            "rouge_l": rougeL,
            "bleu": bleu,
            "cosine_similarity": cosine_sim,
            "answer_relevance": relevance_score,
            "faithfulness": faithfulness_score
        })

    return results


In [24]:
if __name__ == "__main__":
    all_results = {}
    embedding_function = load_embedding(embedding_model)
    llm = load_llm(LLM)

    for chunk in [300, 600, 900]:
        print(f"\nEvaluating chunk size: {chunk}\n")

        persist_dir = f"./chroma_eval_chunk_{chunk}"

        qa = RAG(
            folder_path=file_location,
            embedding_function=embedding_function,
            llm=llm,
            chunk_size=chunk,
            persist_directory=persist_dir
        )

        results = evaluate_rag_system(qa, test_dataset_path)
        all_results[f"chunk_{chunk}"] = results

    with open("test_results.json", "w") as f:
        json.dump(all_results, f, indent=4)

    print("\nEvaluation complete. Results saved to test_results.json.")



Evaluating chunk size: 300



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe


Evaluating chunk size: 600



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe


Evaluating chunk size: 900



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe


Evaluation complete. Results saved to test_results.json.
