## Phase 3 ‚Äî Pairwise Comparison LLM-as-Judge

Judge compares two answers and determines which is better


In [1]:
import os
import json
from openai import OpenAI
from typing import Dict, List


client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

MODEL_ANSWER = "gpt-4o-mini"   # responder
MODEL_JUDGE  = "gpt-4.1"        # judge


In [2]:
def generate_answer(question: str, num_answers: int = 2) -> str:
    
    """Generate multiple different answers using temperature variation"""
    answers = []
    
    for i in range(num_answers):
        message = client.chat.completions.create(
            model=MODEL_ANSWER,
            max_tokens=500,
            temperature=0.7 + (i * 0.2),  # Vary temperature for diversity
            messages=[
                {
                    "role": "user",
                    "content": f"Answer this question in a different way: {question}"
                }
            ]
        )
        answers.append(message.choices[0].message.content)
    
    return answers

In [8]:
def pairwise_judge(question: str, answer_a: str, answer_b: str) -> dict:
    
    """Compare two answers and determine which is better"""
    judge_prompt = f"""You are an expert judge comparing two answers to the same question.

Question: {question}

Answer A:
{answer_a}

Answer B:
{answer_b}

Compare these answers on:
1. Accuracy
2. Completeness
3. Clarity
4. Helpfulness

Determine which answer is better overall.

Respond ONLY with valid JSON:
{{
    "winner": "A" or "B" or "tie",
    "confidence": <number 1-10>,
    "comparison": {{
        "accuracy": "A" or "B" or "tie",
        "completeness": "A" or "B" or "tie",
        "clarity": "A" or "B" or "tie",
        "helpfulness": "A" or "B" or "tie"
    }},
    "reasoning": "explanation of decision",
    "score_a": <number 1-10>,
    "score_b": <number 1-10>
}}
"""
    
    message = client.chat.completions.create(
        model=MODEL_JUDGE,
        max_tokens=1000,
        messages=[
            {"role": "user", "content": judge_prompt}
        ]
    )

    response_text = message.choices[0].message.content

    try:
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0]
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0]
        
        return json.loads(response_text.strip())
    except json.JSONDecodeError:
        return {
            "error": "Failed to parse JSON",
            "raw_response": response_text
        }

In [4]:
def print_pairwise_comparison(comparison: Dict):
    """Pretty print pairwise comparison"""
    if "error" in comparison:
        print(f"Error: {comparison['error']}")
        return
    
    print("\n‚öñÔ∏è  PAIRWISE COMPARISON RESULTS")
    print("=" * 60)
    
    # Winner announcement
    winner_emoji = "üèÜ" if comparison['winner'] != "tie" else "ü§ù"
    print(f"\n{winner_emoji} Winner: {comparison['winner'].upper()}")
    print(f"   Confidence: {comparison['confidence']}/10")
    
    # Scores
    print(f"\nüìä Overall Scores:")
    print(f"   Answer A: {comparison['score_a']}/10")
    print(f"   Answer B: {comparison['score_b']}/10")
    
    # Criterion breakdown
    print(f"\nüìã Criterion-by-Criterion:")
    for criterion, winner in comparison['comparison'].items():
        symbol = "‚Üí" if winner == "tie" else "‚úì"
        print(f"   {criterion.capitalize():<15} {symbol} {winner.upper()}")
    
    # Reasoning
    print(f"\nüí≠ Reasoning:")
    print(f"   {comparison['reasoning']}")
    print("\n" + "=" * 60)

In [5]:
def run_tournament(question: str, num_candidates: int = 3):
    """Run a tournament-style comparison"""
    print(f"\nüèÜ RUNNING TOURNAMENT: {num_candidates} candidates")
    print("=" * 60)
    
    # Generate multiple answers
    print(f"\nüìù Generating {num_candidates} different answers...")
    answers = generate_answer(question, num_candidates)
    
    for i, ans in enumerate(answers, 1):
        print(f"\nCandidate {i}: {ans[:100]}...")
    
    # Compare all pairs
    print(f"\n\n‚öîÔ∏è  PAIRWISE BATTLES")
    print("=" * 60)
    
    wins = {i: 0 for i in range(num_candidates)}
    
    for i in range(num_candidates):
        for j in range(i + 1, num_candidates):
            print(f"\nü•ä Battle: Candidate {i+1} vs Candidate {j+1}")
            comparison = pairwise_judge(question, answers[i], answers[j])
            
            if "error" not in comparison:
                if comparison['winner'] == 'A':
                    wins[i] += 1
                    print(f"   Winner: Candidate {i+1}")
                elif comparison['winner'] == 'B':
                    wins[j] += 1
                    print(f"   Winner: Candidate {j+1}")
                else:
                    print(f"   Result: Tie")
    
    # Determine overall winner
    print(f"\n\nüèÜ TOURNAMENT RESULTS")
    print("=" * 60)
    sorted_candidates = sorted(wins.items(), key=lambda x: x[1], reverse=True)
    
    for rank, (candidate, win_count) in enumerate(sorted_candidates, 1):
        medal = ["ü•á", "ü•à", "ü•â"][rank-1] if rank <= 3 else "  "
        print(f"{medal} Rank {rank}: Candidate {candidate+1} - {win_count} wins")


In [10]:
print("=" * 70)
print("PHASE 3: PAIRWISE COMPARISON & TOURNAMENT")
print("=" * 70)

question = "What are the main differences between wormhole and black hole?"

print(f"\n‚ùì Question: {question}\n")
        
# Simple pairwise comparison
print("\n" + "=" * 70)
print("EXAMPLE 1: Simple Pairwise Comparison")
print("=" * 70)
        
answers = generate_answer(question, 2)
        
print(f"\nüí¨ Answer A:\n{answers[0]}\n")
print(f"üí¨ Answer B:\n{answers[1]}\n")
        
comparison = pairwise_judge(question, answers[0], answers[1])
print_pairwise_comparison(comparison)
        
# Tournament mode
print("\n\n" + "=" * 70)
print("EXAMPLE 2: Tournament Mode")
print("=" * 70)
run_tournament(question, num_candidates=3)
        
print("\n" + "=" * 70)

PHASE 3: PAIRWISE COMPARISON & TOURNAMENT

‚ùì Question: What are the main differences between wormhole and black hole?


EXAMPLE 1: Simple Pairwise Comparison

üí¨ Answer A:
What distinguishes a wormhole from a black hole?

üí¨ Answer B:
What distinguishes a wormhole from a black hole?


‚öñÔ∏è  PAIRWISE COMPARISON RESULTS

ü§ù Winner: TIE
   Confidence: 10/10

üìä Overall Scores:
   Answer A: 1/10
   Answer B: 1/10

üìã Criterion-by-Criterion:
   Accuracy        ‚Üí TIE
   Completeness    ‚Üí TIE
   Clarity         ‚Üí TIE
   Helpfulness     ‚Üí TIE

üí≠ Reasoning:
   Both Answer A and Answer B simply restate the question and provide no actual content or explanation distinguishing wormholes from black holes. They are therefore equal in all areas: accuracy, completeness, clarity, and helpfulness, all of which are minimal.



EXAMPLE 2: Tournament Mode

üèÜ RUNNING TOURNAMENT: 3 candidates

üìù Generating 3 different answers...

Candidate 1: What distinguishes a wormhole from a