In [None]:
!{sys.executable} -m pip install langchain
!pip install -U langchain-text-splitters
!pip install langchain_chroma
!pip install langchain-huggingface
!pip install -U bitsandbytes

In [None]:
#imports
import torch
import os
import sys
import re
import json
from datasets import load_dataset
import pandas as pd
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import BitsAndBytesConfig
from huggingface_hub import login
from transformers import pipeline
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_core.messages import SystemMessage, HumanMessage

In [None]:
# Pull the Hugging Face token from Colab's secret store (never hardcoded)
# and authenticate this session against the Hub.
hf = userdata.get("HF_TOKEN")
login(token=hf)

In [None]:
# Fetch the SQuAD dataset and expose its "train" split as a pandas DataFrame.
ds = load_dataset(path="squad")
df = ds["train"].to_pandas()

In [None]:
import json

# Evaluation set: for each of the first 500 rows keep the question, the first
# listed answer text and the paragraph the question was asked about.
eval_data = [
    {
        "question": row["question"],
        "ground_truth": row["answers"]["text"][0],
        "reference_context": row["context"],
    }
    for _, row in df.head(500).iterrows()
]

# Persist as JSON Lines: one JSON object per line.
with open("squad_eval.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in eval_data)

In [None]:
# Build the retrieval corpus from the first 10,000 SQuAD rows.
#
# Slice *before* constructing Document objects so the remaining rows of the
# training split are never materialised just to be thrown away.
#
# SQuAD attaches several questions to the same paragraph, so `context` values
# repeat from row to row. Each distinct paragraph is indexed once (keeping the
# first question seen for it as metadata); otherwise the vector store holds
# identical chunks and a top-k retrieval can come back with k copies of the
# same text.
corpus_rows = df.head(10000).drop_duplicates(subset="context")
documents = [
    Document(page_content=row["context"], metadata={"question": row["question"]})
    for _, row in corpus_rows.iterrows()
]


In [None]:
# 4-bit NF4 quantisation settings (double quantisation, bfloat16 compute).
# Defined ahead of the first `from_pretrained` call that consumes it so the
# notebook runs top-to-bottom on a fresh kernel instead of raising NameError.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Generator model used by `rag()`; `device_map="auto"` lets the library decide
# where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    device_map="auto",
    quantization_config=quant_config
)

In [None]:
# Tokenizer matching the generator checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Embedding model that turns text chunks into vectors for the vector store.
# Optional: pass model_kwargs={"device": "cuda"} to request GPU execution.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Split the documents into chunks (500 characters, 100 overlapping) that can
# then be vectorised.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents)

In [None]:
# Second, smaller Llama checkpoint, loaded with the same quantisation settings.
# Prerequisite: `quant_config` must already be defined when this cell runs.
evaluation_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# Tokenizer matching the 3B evaluation checkpoint.
comparison_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# Reuse the end-of-sequence token for padding. The right-hand side previously
# read `comparisontokenizer` (missing underscore), which raised NameError.
comparison_tokenizer.pad_token = comparison_tokenizer.eos_token

In [None]:
# (Re)build the persisted Chroma store.
db_name = "RAG-database"

# If a store directory is already on disk, drop its collection first so the
# index below is created fresh.
if os.path.exists(db_name):
    Chroma(
        embedding_function=embedding_model,
        persist_directory=db_name,
    ).delete_collection()

# Embed every chunk and write the resulting vectors to `db_name`.
vectordb = Chroma.from_documents(
    persist_directory=db_name,
    embedding=embedding_model,
    documents=texts,
)

In [None]:
# Retriever view over the vector store, configured with k=5 results per query.
retriever = vectordb.as_retriever(search_kwargs=dict(k=5))

In [None]:
# System prompt template for the answer generator.
# `{related_documents}` is the single placeholder; `rag()` fills it via
# `str.format` with the retrieved context.
system_prompt = """You are a question answering system.
Answer the user's question directly.
Phrase the answers well.
Do NOT create multiple-choice questions.
Do NOT rephrase the question.
Do NOT generalize or paraphraze the answer.
Use only the given context.
Answer in one short sentence.
If not in context, say "I don't know".
Use the subject given in the question in the answer.
Answer with the specific noun phrase only.

Context:
{related_documents}
"""


In [None]:
# Quantisation settings passed as `quantization_config` when the models are
# loaded: 4-bit weights in the "nf4" format, double quantisation enabled,
# bfloat16 as the compute dtype.
# NOTE: the `from_pretrained` cells reference this name, so it has to exist
# before they are executed.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Running chat transcript (alternating user / assistant message dicts) that
# `rag()` replays on every call; starts out empty.
history = list()

In [None]:
def rag(question):
  """Answer `question` with retrieval-augmented generation.

  Retrieves context for the question, fills the system prompt with it, replays
  the running conversation in the global `history`, and greedily decodes up to
  200 new tokens from `model`.

  Args:
    question: The user's question as a string.

  Returns:
    The decoded answer text, with special tokens stripped.

  Side effects:
    Appends the user question and the assistant answer to the global `history`.
  """
  global history

  retrieved_docs = retriever.invoke(question)
  context = "\n".join(doc.page_content for doc in retrieved_docs)
  system_prompt_final = system_prompt.format(related_documents=context)

  messages = [{"role": "system", "content": system_prompt_final}]
  messages.extend(history)
  messages.append({"role": "user", "content": question})

  # Ask for a dict so both the token ids and the attention mask are available,
  # and move the inputs to wherever the model was placed rather than assuming
  # a CUDA device.
  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize=True,
      add_generation_prompt=True,
      return_dict=True,
      return_tensors="pt"
  ).to(model.device)

  # Pass the attention mask and pad token explicitly: the pad token is the EOS
  # token here, so generate() cannot reliably infer the mask on its own.
  output = model.generate(
      inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      pad_token_id=tokenizer.pad_token_id,
      do_sample=False,
      max_new_tokens=200
  )

  # generate() returns prompt + continuation; keep only the new tokens.
  prompt_len = inputs["input_ids"].shape[1]
  generated = output[0][prompt_len:]

  answer = tokenizer.decode(
      generated,
      skip_special_tokens=True
  )

  history.append({"role": "user", "content": question})
  history.append({"role": "assistant", "content": answer})

  return answer

In [None]:
# Load the evaluation set back from disk: every line of the JSONL file is one
# JSON object.
with open("squad_eval.jsonl", "r") as f:
    read_data = [json.loads(line) for line in f]

In [None]:
# System prompt for the judging step. It asks for a "Score:" line and a
# "Reasoning:" line, which is the format the report cell parses afterwards.
evaluation_systemPrompt = '''
You are an impartial judge. Evaluate the "Generated Answer" based on the "Context" and "Ground Truth".
1. Faithfulness: Is the answer derived ONLY from the context?
2. Accuracy: Does it match the meaning of the Ground Truth?

Return your evaluation in this format:
Score: [0 to 10]
Reasoning: [Short explanation]
'''

def evaluate_fixed(num_samples=5, judge_model=None, judge_tokenizer=None):
    """Answer evaluation questions with `rag()` and have a judge LLM grade them.

    For each sample the conversation history is reset, an answer is generated
    with `rag()`, and the judge is shown the question, ground truth, reference
    context and generated answer under `evaluation_systemPrompt`.

    Args:
        num_samples: Maximum number of entries of `read_data` to evaluate,
            taken from the start. Capped at `len(read_data)`.
        judge_model: Causal LM used as the judge. Defaults to
            `evaluation_model`, so answers are not graded by the same model
            that produced them.
        judge_tokenizer: Tokenizer paired with `judge_model`. Defaults to
            `comparison_tokenizer`.

    Returns:
        A list with the judge's raw text output for each evaluated sample.

    Side effects:
        Rebinds the global `history` to a fresh list before every sample and
        prints progress plus each evaluation.
    """
    global history

    if judge_model is None:
        judge_model = evaluation_model
    if judge_tokenizer is None:
        judge_tokenizer = comparison_tokenizer

    results = []
    # Never index past the end of the evaluation set.
    sample_count = min(num_samples, len(read_data))
    for i in range(sample_count):
        print(f"--- Evaluating Sample {i+1} ---")

        question = read_data[i]["question"]
        answer = read_data[i]["ground_truth"]
        context = read_data[i]["reference_context"]

        # Each sample is judged in isolation: without this reset, rag() would
        # replay the previous samples' turns as conversation history.
        history = []

        generated_answer = rag(question)

        user_prompt = (
            f"Question: {question}\n"
            f"Ground Truth: {answer}\n"
            f"Context: {context}\n"
            f"Generated Answer: {generated_answer}"
        )

        message = [
            {"role": "system", "content": evaluation_systemPrompt},
            {"role": "user", "content": user_prompt}
        ]

        # Tokenise with the judge's own chat template and move the inputs to
        # the judge's device rather than assuming CUDA.
        prompt_ids = judge_tokenizer.apply_chat_template(
            message,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(judge_model.device)

        output_ids = judge_model.generate(
            prompt_ids['input_ids'],
            do_sample=False,
            attention_mask=prompt_ids['attention_mask'],
            pad_token_id=judge_tokenizer.pad_token_id,
            max_new_tokens=200
        )

        # generate() returns prompt + continuation; keep only the new tokens.
        generated_text = output_ids[0][prompt_ids['input_ids'].shape[1]:]
        evaluation_text = judge_tokenizer.decode(generated_text, skip_special_tokens=True)

        print(f"Evaluation:\n{evaluation_text}\n")
        results.append(evaluation_text)

    return results

eval_logs = evaluate_fixed()

In [None]:
# Tabulate the judge outputs: one row per sample with the parsed score and a
# shortened version of the reasoning.
print(f"{'Sample':<10} | {'Score':<5} | {'Summary'}")
print("-" * 50)

for i, report in enumerate(eval_logs):

    # The judge prompt shows the format as "Score: [0 to 10]", so the model may
    # answer either "Score: 8" or "Score: [8]"; accept both.
    score_match = re.search(r"Score:\s*\[?\s*(\d+)", report)
    score = score_match.group(1) if score_match else "N/A"

    reasoning_match = re.search(r"Reasoning:\s*(.*)", report)
    if reasoning_match:
        reasoning = reasoning_match.group(1)
        # Only mark the summary as truncated when text was actually cut off.
        if len(reasoning) > 60:
            reasoning = reasoning[:60] + "..."
    else:
        reasoning = "No reasoning"

    print(f"Sample {i+1:<3} | {score:<5} | {reasoning}")