## Phase 1 — Simplest LLM-as-Judge (Pass/Fail)


- One LLM generates an answer; a second, stronger LLM judges it, scoring correctness, completeness, and clarity from 1 to 10 with a brief explanation

In [4]:
import json
import os
import re

from langchain_openai import ChatOpenAI
from openai import OpenAI


# Shared OpenAI client used by every cell below. The API key is read from the
# environment; a missing OPENAI_API_KEY raises KeyError here rather than later.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Model identifiers for the two roles: one model answers, a different one judges.
MODEL_ANSWER = "gpt-4o-mini"   # responder (cheap + fast)
MODEL_JUDGE  = "gpt-4.1"        # judge (stronger)


In [9]:
def generate_answer(question: str) -> str:
    """Ask the responder model for a concise answer to ``question``.

    Sends a single user message to ``MODEL_ANSWER`` through the module-level
    ``client``, with the completion capped at 1000 tokens.

    Args:
        question: The question to answer; interpolated into the prompt as-is.

    Returns:
        The text of the first returned choice. An empty string is returned
        when the response carries no text content (``content`` is ``None``),
        so the declared ``str`` return type always holds.
    """
    response = client.chat.completions.create(
        model=MODEL_ANSWER,
        max_tokens=1000,
        messages=[
            {
                "role": "user",
                "content": f"Answer this question concisely: {question}",
            }
        ],
    )
    # `content` is nullable in the chat-completions response schema; normalise
    # None to "" so callers never interpolate the literal "None" into a prompt.
    return response.choices[0].message.content or ""

In [10]:
def judge_answer(question: str, answer: str) -> dict:
    """Have the judge model grade ``answer`` as a response to ``question``.

    Sends one user message to ``MODEL_JUDGE`` through the module-level
    ``client`` (completion capped at 1000 tokens) asking for 1-10 scores on
    correctness, completeness and clarity, an overall score, and a brief
    explanation, in a fixed ``<criterion>: <n>/10`` line format.

    Args:
        question: The question that was asked.
        answer: The candidate answer to evaluate.

    Returns:
        A dict with:
          * ``"judgment"``: the judge's reply text (``""`` if the response
            carried no text content).
          * ``"raw_response"``: the same reply text, kept for compatibility.
          * ``"scores"``: criterion name (lower-case) -> numeric score, for
            each ``<criterion>: <n>/10`` line found in the reply. Criteria
            the judge did not report in that format are simply absent.
    """
    judge_prompt = f"""You are an expert judge evaluating the quality of an answer.

Question: {question}

Answer: {answer}

Please evaluate this answer on the following criteria:
1. Correctness: Is the answer factually correct?
2. Completeness: Does it fully answer the question?
3. Clarity: Is it clear and easy to understand?

Provide:
- A score from 1-10 for each criterion
- Overall score (1-10)
- Brief explanation

Format your response as:
Correctness: [score]/10
Completeness: [score]/10
Clarity: [score]/10
Overall: [score]/10
Explanation: [your explanation]
"""

    response = client.chat.completions.create(
        model=MODEL_JUDGE,
        max_tokens=1000,
        messages=[
            {
                "role": "user",
                "content": judge_prompt,
            }
        ],
    )

    # `content` is nullable in the chat-completions response schema; normalise
    # None to "" so both text fields are always strings.
    judgment_text = response.choices[0].message.content or ""

    return {
        "judgment": judgment_text,
        "raw_response": judgment_text,
        "scores": _parse_scores(judgment_text),
    }


def _parse_scores(judgment_text: str) -> dict:
    """Pull the ``<criterion>: <n>/10`` lines out of a judge reply."""
    # Anchored at line start so criterion names mentioned inside the
    # explanation are ignored; leading/trailing punctuation (e.g. markdown
    # bold markers or list dashes) around the name is tolerated.
    pattern = (
        r"^[^\w\n]*(correctness|completeness|clarity|overall)"
        r"[^\w\n]*:[^\w\n]*(\d+(?:\.\d+)?)[ \t]*/[ \t]*10(?!\d)"
    )
    scores = {}
    for match in re.finditer(pattern, judgment_text, flags=re.IGNORECASE | re.MULTILINE):
        # First occurrence wins if the judge repeats a criterion.
        scores.setdefault(match.group(1).lower(), float(match.group(2)))
    return scores

In [12]:
question = "What is the capital of France?"

# Banner, then the question under evaluation framed by blank lines.
print("=" * 60, "PHASE 1: BASIC LLM-AS-JUDGE", "=" * 60, sep="\n")
print()
print("Question:", question, end="\n\n")

# Step 1: the responder model produces a candidate answer.
print("Step 1: Generating answer...")
answer = generate_answer(question)
print("Answer:", answer, end="\n\n")

# Step 2: the judge model grades that answer; show its reply text.
print("Step 2: Judging the answer...")
judgment = judge_answer(question, answer)
print("\nJudgment:", judgment["judgment"], sep="\n")
print()
print("=" * 60)

PHASE 1: BASIC LLM-AS-JUDGE

Question: What is the capital of France?

Step 1: Generating answer...
Answer: The capital of France is Paris.

Step 2: Judging the answer...

Judgment:
Correctness: 10/10  
Completeness: 10/10  
Clarity: 10/10  
Overall: 10/10  
Explanation: The answer is factually correct; Paris is indeed the capital of France. It fully addresses the question without omitting relevant information, and it is stated clearly and concisely, making it easy to understand.

