# LLM-as-a-Judge under the covers

## Setup environment

Setup the key file path and add the utilities folder to the package path.

In [1]:
from IPython.display import JSON
from dotenv import load_dotenv
import os
from langchain.prompts import PromptTemplate
import warnings
import sys

warnings.filterwarnings("ignore")

# Load the file that contains the API keys
load_dotenv('C:\\Users\\raj\\.jupyter\\.env')


 
# setting path
sys.path.append('../')

## 1. Setup the evaluation dataset

### Q&A dataset without the prediction i.e., answer
This dataset can be used for generating the LLM responses. 

In [2]:
from utils.create_llm import create_gpt_llm, create_cohere_llm, create_ollama_llm, create_hugging_face_llm, create_google_llm, create_ai21_llm

evaluation_dataset = [
  {
    "question": "What is the capital of France?",
    "context": "France is a country in Western Europe known for its rich history, culture, and landmarks such as the Eiffel Tower in Paris, the capital of France.",
    "reference": "Paris"
  },
  {
    "question": "Who wrote 'Romeo and Juliet'?",
    "context": "'Romeo and Juliet' is a famous tragedy written in the late 16th century by William Shakespeare, an English playwright known as the Bard of Avon.",
    "reference": "William Shakespeare"
  },
  {
    "question": "What is the chemical symbol for water?",
    "context": "Water is a compound made up of two hydrogen atoms and one oxygen atom, essential for all known forms of life. Its chemical symbol is H2O.",
    "reference": "H2O"
  },
  {
    "question": "What is the tallest mountain in the world?",
    "context": "The Himalayan mountain range in Asia includes some of the tallest peaks in the world, including the renowned Mount Everest, the tallest mountain in the world.",
    "reference": "Mount Everest"
  },
  {
    "question": "Who painted the Mona Lisa?",
    "context": "The 'Mona Lisa' is a world-famous portrait known for its enigmatic smile, created during the Italian Renaissance by Leonardo da Vinci.",
    "reference": "Leonardo da Vinci"
  },
  {
    "question": "What is the largest planet in the Solar System?",
    "context": "The Solar System includes eight planets, with the largest being Jupiter, a gas giant known for its Great Red Spot.",
    "reference": "Jupiter"
  },
  {
    "question": "What is the process by which plants make food?",
    "context": "Plants produce food by converting sunlight, carbon dioxide, and water into glucose through a process called photosynthesis, occurring in chloroplasts.",
    "reference": "Photosynthesis"
  },
  {
    "question": "Which element has the atomic number 1?",
    "context": "The periodic table organizes elements by atomic number, with the first element being hydrogen, a light, colorless gas essential for life.",
    "reference": "Hydrogen"
  },
  {
    "question": "Who is known as the Father of Computers?",
    "context": "A 19th-century mathematician and inventor named Charles Babbage laid the foundations for modern computing by designing the Analytical Engine.",
    "reference": "Charles Babbage"
  },
  {
    "question": "What is the boiling point of water at sea level?",
    "context": "At standard atmospheric pressure, water transitions from a liquid to a gas at 100°C, the boiling point of water at sea level.",
    "reference": "100°C"
  }
]


### Q&A dataset with the prediction i.e., answer
Assume that the answer was generated by an LLM. This dataset is ready to be used for evaluation.

In [3]:
# Assume the answers were generated
evaluation_dataset_with_answer = evaluation_dataset = [
  {
    "question": "What is the capital of France?",
    "context": "France is a country in Western Europe known for its rich history, culture, and landmarks such as the Eiffel Tower in Paris, the capital of France.",
    "reference": "Paris",
    "answer": "The capital is Paris"  # 60%: similar/close to reference
  },
  {
    "question": "Who wrote 'Romeo and Juliet'?",
    "context": "'Romeo and Juliet' is a famous tragedy written in the late 16th century by William Shakespeare, an English playwright known as the Bard of Avon.",
    "reference": "William Shakespeare",
    "answer": "William Shakespeare"  # 20%: exact match
  },
  {
    "question": "What is the chemical symbol for water?",
    "context": "Water is a compound made up of two hydrogen atoms and one oxygen atom, essential for all known forms of life. Its chemical symbol is H2O.",
    "reference": "H2O",
    "answer": "H₂O"  # 60%: similar/close to reference (using subscript)
  },
  {
    "question": "What is the tallest mountain in the world?",
    "context": "The Himalayan mountain range in Asia includes some of the tallest peaks in the world, including the renowned Mount Everest, the tallest mountain in the world.",
    "reference": "Mount Everest",
    "answer": "Everest"  # 60%: similar/close to reference (abbreviated)
  },
  {
    "question": "Who painted the Mona Lisa?",
    "context": "The 'Mona Lisa' is a world-famous portrait known for its enigmatic smile, created during the Italian Renaissance by Leonardo da Vinci.",
    "reference": "Leonardo da Vinci",
    "answer": "Da Vinci"  # 60%: similar/close to reference (abbreviated)
  },
  {
    "question": "What is the largest planet in the Solar System?",
    "context": "The Solar System includes eight planets, with the largest being Jupiter, a gas giant known for its Great Red Spot.",
    "reference": "Jupiter",
    "answer": "Saturn"  # 10%: relevant but incorrect
  },
  {
    "question": "What is the process by which plants make food?",
    "context": "Plants produce food by converting sunlight, carbon dioxide, and water into glucose through a process called photosynthesis, occurring in chloroplasts.",
    "reference": "Photosynthesis",
    "answer": "Photosynthesis"  # 20%: exact match
  },
  {
    "question": "Which element has the atomic number 1?",
    "context": "The periodic table organizes elements by atomic number, with the first element being hydrogen, a light, colorless gas essential for life.",
    "reference": "Hydrogen",
    "answer": "Helium"  # 10%: random value
  },
  {
    "question": "Who is known as the Father of Computers?",
    "context": "A 19th-century mathematician and inventor named Charles Babbage laid the foundations for modern computing by designing the Analytical Engine.",
    "reference": "Charles Babbage",
    "answer": "Babbage"  # 60%: similar/close to reference (abbreviated)
  },
  {
    "question": "What is the boiling point of water at sea level?",
    "context": "At standard atmospheric pressure, water transitions from a liquid to a gas at 100°C, the boiling point of water at sea level.",
    "reference": "100°C",
    "answer": "100 degrees Celsius"  # 60%: similar/close to reference (paraphrased)
  }
]

## 2. Setup Judge LLM

In [4]:
# You may try different LLM as judges
llm_judge  = create_gpt_llm(args={"temperature": 0.0})

### Judge prompt-1 : WITHOUT Reference
Judge LLLM uses its parameteric knowledge to evaluate the answer as there is no ground truth i.e., reference.

In [5]:
judge_llm_prompt_no_reference = """
You are an expert evaluator tasked with judging the quality of answers generated by an AI model using the provided context. 
Your role is to assess whether the answer is correct, incorrect based on the provided question and context. 
Be objective and provide clear reasoning for your judgment.

Your response should be a JSON with two attributes {{"result": correct | incorrect, "comment": your comments}}

## context
{context}

## question
{question}

## answer:
{answer}
"""

# Template
judge_llm_prompt_no_reference_template = PromptTemplate.from_template(judge_llm_prompt_no_reference)


In [6]:
# Unit test the judge with this prompt
unit_test_data = {
  "question": "Who developed the theory of relativity?",
  "context": "Albert Einstein, a renowned physicist, developed the theory of relativity, which revolutionized our understanding of space, time, and gravity.",
  "reference": "Albert Einstein"
}

# Without reference : good answer
answer = "Albert Einstein"
judge_prompt = judge_llm_prompt_no_reference_template.format(question=unit_test_data['question'], context=unit_test_data['context'], answer=answer)
result = llm_judge.invoke(judge_prompt)
print("Test case#1  (no reference, good answer): ", result)

# Without reference : bad answer
answer = "Einstein Albert"
judge_prompt = judge_llm_prompt_no_reference_template.format(question=unit_test_data['question'], context=unit_test_data['context'], answer=answer)
result = llm_judge.invoke(judge_prompt)
print("\nTest case#2 (no reference, bad answer): ", result)

Test case#1  (no reference, good answer):  
{"result": "correct", "comment": "The answer is correct. Albert Einstein is widely recognized as the developer of the theory of relativity, which is mentioned in the context."}

Test case#2 (no reference, bad answer):  
{"result": incorrect, "comment": "The answer is incorrect because the name order is reversed. The correct answer is Albert Einstein, as stated in the context."}


### Judge prompt-2 : WITH Reference
The judge LLM is provided with the ground truth, so that it can compare the generated answer with the reference

In [7]:

judge_llm_prompt_with_reference = """
You are an expert evaluator tasked with judging the quality of answers generated by an AI model using the provided context. 
Your role is to assess whether the answer is correct, incorrect based on the provided question, context and ground truth.

Be objective and provide clear reasoning for your judgment.

Your response should be a JSON with two attributes {{"result": correct | incorrect, "comment": your comments}}

## context
{context}

## question
{question}

## ground truth
{reference}

## answer:
{answer}
"""
judge_llm_prompt_with_reference_template = PromptTemplate.from_template(judge_llm_prompt_with_reference)



In [8]:
# With reference : good answer
answer = "Albert Einstein"
judge_prompt = judge_llm_prompt_with_reference_template.format(question=unit_test_data['question'], context=unit_test_data['context'], reference=unit_test_data['reference'], answer=answer)
result = llm_judge.invoke(judge_prompt)
print("\nTest case#3  (with reference, good answer): ", result)

# With reference : bad answer
answer = "Einstein Albert"
judge_prompt = judge_llm_prompt_with_reference_template.format(question=unit_test_data['question'], context=unit_test_data['context'], reference=unit_test_data['reference'], answer=answer)
result = llm_judge.invoke(judge_prompt)
print("\nTest case#4  (with reference, bad answer): ", result)


Test case#3  (with reference, good answer):  
{"result": "correct", "comment": "The answer is correct because it directly matches the ground truth and is supported by the provided context and question."}

Test case#4  (with reference, bad answer):  
{"result": incorrect, "comment": "The answer is incorrect because the name order is reversed. The correct answer is Albert Einstein."}


### Judge prompt-3 : WITH Reference + Score assignment
The judge LLM is provided with the ground truth. Judge LLM is asked to assign a score to the answer.

In [9]:
judge_llm_prompt_with_reference_score = """
You are an expert evaluator tasked with judging the quality of answers generated by an AI model using the provided context. 

Your role is to assess whether the answer is correct, relevant, or incorrect based on the provided question, context and ground truth.

In addition, you will assign a score to the answer on a scale of 1 to 5, where:

1: The answer is incorrect (does not match the ground truth and is irrelevant to the question).

2: The answer is partially relevant but incorrect (somewhat related to the question but does not match the ground truth).

3: The answer is relevant but incorrect (directly addresses the question but does not match the ground truth).

4: The answer is mostly correct (matches the ground truth partially or is very close but not exact).

5: The answer is fully correct (matches the ground truth exactly and is fully relevant to the question



Be objective and provide clear reasoning for your judgment.


Additional Scoring Consideration:
1. Ease to Follow in the Absence of Context: The answer should be clear and self-contained, meaning it can be understood without requiring the context. 
If the answer is ambiguous, overly brief, or relies heavily on the context to be understood, it should receive a lower score.
2. Minor mistakes in answer may be ignored

Your response should be a JSON with two attributes {{"result": correct | relevant | incorrect, "score": score you assigned, "comment": your comments}}

## context
{context}

## question
{question}

## ground truth
{reference}

## answer:
{answer}
"""
judge_llm_prompt_with_reference_score_template = PromptTemplate.from_template(judge_llm_prompt_with_reference_score)

In [10]:
# With reference : good answer
answer = "Albert Einstein"
judge_prompt = judge_llm_prompt_with_reference_score_template.format(question=unit_test_data['question'], context=unit_test_data['context'], reference=unit_test_data['reference'], answer=answer)
result = llm_judge.invoke(judge_prompt)
print("\nTest case#4  (with reference, good answer): ", result)

# With reference : bad answer
answer = "Albert Smith"
judge_prompt = judge_llm_prompt_with_reference_score_template.format(question=unit_test_data['question'], context=unit_test_data['context'], reference=unit_test_data['reference'], answer=answer)
result = llm_judge.invoke(judge_prompt)
print("\nTest case#6  (with reference, good answer): ", result)


Test case#4  (with reference, good answer):  
{"result": "correct", "score": 5, "comment": "The answer is fully correct and relevant to the question. It is also clear and self-contained, as it can be understood without requiring the context."}

Test case#6  (with reference, good answer):  {"result": "incorrect", "score": 1, "comment": "The answer does not match the ground truth and is irrelevant to the question."}


## 3. Run evaluations with different judge LLM prompts

In [11]:
# Utility function for running the evaluations with different prompts, datasets & judge LLM
import json

def    run_evaluation(prompt_template, eval_dataset, judge_llm):

    MAX_SCORE=5
    results = []
    total_score, correct_count, incorrect_count, relevant_count, accuracy = 0, 0, 0, 0, 0
    for test_case in eval_dataset:
        judge_prompt = judge_llm_prompt_with_reference_score_template.format(**test_case)
        result = llm_judge.invoke(judge_prompt)
        result = json.loads(result)
        results.append(result)
        print(result, "answer: ", test_case['answer'], "  reference: ", test_case['reference'] )

        # Update the counts
        if result['result'] == 'correct':
            correct_count=correct_count+1
        elif result['result'] == 'incorrect':
            incorrect_count=incorrect_count+1
        elif result['result'] == 'relevant':
            relevant_count=relevant_count+1 

    # Assign weights to correct | relevant | incorrect to calculate accuracy
    weigth_correct = 1
    weight_incorrect= 0
    weight_relevant= 0.5
    
    # Accuracy is calculated if the score attribute is there in the 
    if "score" in results[0]:

        for result in results:
            total_score = total_score + result['score']

        accuracy = total_score/len(eval_dataset)
        accuracy_pct = (accuracy/MAX_SCORE)*100

    else:
        accuracy = (correct_count*weigth_correct  +  incorrect_count*weight_incorrect + relevant_count*weight_relevant)/len(eval_dataset)
        accuracy_pct = (accuracy/MAX_SCORE)*100
        
            
    print("Dataset size : ", len(eval_dataset))
    print("Correct : ", correct_count)
    print("Incorrect : ", incorrect_count)
    print("Relevant : ", relevant_count)
    print("Accuracy : ", accuracy)
    print("Accuracy percentage : ", accuracy_pct, "%")


In [12]:
# Prompt: judge_llm_prompt_no_reference_template, judge_llm_prompt_with_reference_template, judge_llm_prompt_with_reference_score_template
# Eval dataset : Dataset with simulated answers
run_evaluation(judge_llm_prompt_no_reference, evaluation_dataset_with_answer, llm_judge)

{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It is clear and self-contained, and can be understood without requiring the context.'} answer:  The capital is Paris   reference:  Paris
{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It is also clear and self-contained, as it can be understood without requiring the context.'} answer:  William Shakespeare   reference:  William Shakespeare
{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It matches the ground truth exactly and is clear and self-contained, without requiring the context.'} answer:  H₂O   reference:  H2O
{'result': 'relevant', 'score': 4, 'comment': "The answer is mostly correct as it mentions the correct mountain, but it is not fully correct as it does not include the full name 'Mount Everest'. Additionally, the answer is clear and self-contained, as it can be

## 4. Evaluate a real LLM's performance

In [14]:
# Create cohere LLM
llm_target_cohere = create_cohere_llm()

# Create google LLM
llm_target_google_gemini = create_google_llm(model='gemini-1.5-flash')

# Change this to try different LLMs
target_llm = llm_target_cohere

print(target_llm)

target_llm_prompt = """
You are a teacher who answers students' questions strictly based on the provided context. 
You must only use information from the given context to answer the question and avoid making up any information. 
Your answers Must be concise. Do not provide any commentary or preamble.
If the context does not contain the information needed to answer the question, respond with: "The context does not provide enough information to answer this question."

Context:
{context}

Question:
{question}

Answer:
"""

target_prompt_template = PromptTemplate(
    template = target_llm_prompt,
    input_variables = ["context","question"]
)

def run_predictions(llm_target):

    dataset_with_predictions = []

    for i, qa_pair in enumerate(evaluation_dataset):
        context = qa_pair['context']
        question = qa_pair['question']
        prompt_qa = target_prompt_template.format(context=context, question=question)
    
        # invoke target LLM for answer
        answer = llm_target.invoke(prompt_qa)

        # print("ANSWER: ", answer)
        qa_pair['answer']=answer
    
        dataset_with_predictions.append(qa_pair)

    return dataset_with_predictions

# eval_dataset_cohere = run_predictions(llm_target_cohere)
eval_dataset = run_predictions(target_llm)

# Run evaluation
# Try different judge prompts: 
# judge_llm_prompt_no_reference_template, judge_llm_prompt_with_reference_template, judge_llm_prompt_with_reference_score_template
run_evaluation(judge_llm_prompt_no_reference, eval_dataset, llm_judge)

[1mCohere[0m
Params: {'model': None, 'max_tokens': 256, 'temperature': 0.75, 'k': 0, 'p': 1, 'frequency_penalty': 0.0, 'presence_penalty': 0.0, 'truncate': None}
{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It is also clear and self-contained, as it can be understood without requiring the context.'} answer:   Paris.   reference:  Paris
{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It is clear and self-contained, and can be understood without requiring the context.'} answer:   William Shakespeare.    reference:  William Shakespeare
{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It is also clear and self-contained, as it can be understood without requiring the context.'} answer:   H2O.    reference:  H2O
{'result': 'correct', 'score': 5, 'comment': 'The answer is fully correct and relevant to the question. It is cl