## Phase 4 — Multi-Aspect Judge with Reference Answers


The judge evaluates a candidate answer against a reference answer and custom rubrics.


In [1]:
import os
import json
from openai import OpenAI
from typing import Dict, List, Optional
from dataclasses import dataclass


# Shared OpenAI client. The API key is read from the OPENAI_API_KEY environment
# variable; indexing os.environ raises KeyError here if it is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Model that produces candidate answers.
MODEL_ANSWER = "gpt-4o-mini"   # responder
# Model that grades candidate answers.
MODEL_JUDGE  = "gpt-4.1"        # judge


In [3]:
@dataclass
class EvaluationRubric:
    """One named criterion that an answer is scored against."""

    # Short identifier of the criterion, e.g. "accuracy".
    name: str
    # Human-readable explanation of what the criterion measures.
    description: str
    # Relative importance of the criterion; 1.0 unless overridden.
    weight: float = 1.0

In [10]:
from re import M


class MultiAspectJudge:
    """LLM-as-judge that scores answers against weighted rubrics.

    Candidate answers are generated with ``MODEL_ANSWER`` and graded with
    ``MODEL_JUDGE`` through the module-level ``client``. An optional reference
    ("gold") answer and background context can be supplied to the judge.
    """

    def __init__(self, rubrics: Optional[List["EvaluationRubric"]] = None):
        """Create a judge.

        Args:
            rubrics: Criteria to score against. ``None`` or an empty list
                selects the built-in default rubrics.
        """
        self.rubrics = rubrics or self._default_rubrics()

    def _default_rubrics(self) -> List["EvaluationRubric"]:
        """Return the built-in rubric set used when none is supplied."""
        return [
            EvaluationRubric("accuracy", "Factual correctness and precision", 2.0),
            EvaluationRubric("completeness", "Coverage of all relevant aspects", 1.5),
            EvaluationRubric("clarity", "Clear and understandable explanation", 1.0),
            EvaluationRubric("coherence", "Logical flow and organization", 1.0),
            EvaluationRubric("conciseness", "Appropriate length without redundancy", 0.8),
        ]

    def generate_answer(self, question: str, context: str = "") -> str:
        """Ask the responder model to answer ``question``.

        Args:
            question: The question to answer.
            context: Optional background text placed before the question.

        Returns:
            The model's answer text, or ``""`` if the API returned no content.
        """
        prompt = f"{context}\n\nQuestion: {question}" if context else question

        message = client.chat.completions.create(
            model=MODEL_ANSWER,
            max_tokens=1000,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # ``content`` may be None (e.g. a refusal); keep the declared str return.
        return message.choices[0].message.content or ""

    def _build_judge_prompt(
        self,
        question: str,
        candidate_answer: str,
        reference_answer: Optional[str] = None,
        context: str = ""
    ) -> str:
        """Assemble the grading prompt, including a JSON schema for ``self.rubrics``."""
        rubric_text = "\n".join(
            f"- {r.name.capitalize()} (weight: {r.weight}): {r.description}"
            for r in self.rubrics
        )

        # The requested JSON keys must mirror the configured rubrics; a fixed
        # list would make the judge ignore custom criteria.
        rubric_schema = ",\n".join(
            f'        {json.dumps(r.name)}: {{"score": <number>, "justification": "..."}}'
            for r in self.rubrics
        )

        reference_section = ""
        if reference_answer:
            reference_section = f"""
Reference Answer (gold standard):
{reference_answer}

Compare the candidate answer to this reference.
"""

        context_section = ""
        if context:
            context_section = f"""
Context/Background:
{context}
"""

        return f"""You are an expert evaluator assessing an answer's quality.

{context_section}

Question: {question}

Candidate Answer:
{candidate_answer}

{reference_section}

Evaluation Rubrics:
{rubric_text}

For each rubric criterion, provide:
1. Score (1-10)
2. Brief justification

Also provide:
- Overall weighted score
- Key strengths (2-3 points)
- Key weaknesses (2-3 points)
- Specific suggestions for improvement
- Alignment with reference (if provided): percentage 0-100

Respond ONLY with valid JSON:
{{
    "rubric_scores": {{
{rubric_schema}
    }},
    "weighted_score": <number>,
    "raw_average": <number>,
    "reference_alignment": <number or null>,
    "strengths": ["strength1", "strength2"],
    "weaknesses": ["weakness1", "weakness2"],
    "improvements": ["suggestion1", "suggestion2"],
    "overall_assessment": "detailed paragraph"
}}
"""

    @staticmethod
    def _parse_judge_response(response_text: Optional[str]) -> Dict:
        """Extract the JSON object from the judge's reply.

        Returns the parsed dict, or ``{"error": ..., "raw_response": ...}``
        when the reply is empty, is not valid JSON, or is not a JSON object.
        """
        if not response_text:
            return {
                "error": "Empty response from judge",
                "raw_response": response_text
            }

        # Models often wrap JSON in a Markdown code fence; unwrap it first.
        payload = response_text
        if "```json" in payload:
            payload = payload.split("```json")[1].split("```")[0]
        elif "```" in payload:
            payload = payload.split("```")[1].split("```")[0]

        try:
            parsed = json.loads(payload.strip())
        except json.JSONDecodeError:
            return {
                "error": "Failed to parse JSON",
                "raw_response": response_text
            }

        if not isinstance(parsed, dict):
            return {
                "error": "Judge response is not a JSON object",
                "raw_response": response_text
            }
        return parsed

    def evaluate_with_reference(
        self,
        question: str,
        candidate_answer: str,
        reference_answer: Optional[str] = None,
        context: str = ""
    ) -> Dict:
        """Grade ``candidate_answer`` against the rubrics and optional reference.

        Args:
            question: The question that was asked.
            candidate_answer: The answer to grade.
            reference_answer: Optional gold-standard answer to compare against.
            context: Optional background text shown to the judge.

        Returns:
            The judge's parsed JSON verdict, or a dict with ``"error"`` and
            ``"raw_response"`` keys when the reply could not be parsed.
        """
        judge_prompt = self._build_judge_prompt(
            question, candidate_answer, reference_answer, context
        )

        message = client.chat.completions.create(
            model=MODEL_JUDGE,
            max_tokens=2000,
            messages=[
                {"role": "user", "content": judge_prompt}
            ]
        )

        return self._parse_judge_response(message.choices[0].message.content)

    def batch_evaluate(
        self,
        questions: List[str],
        candidate_answers: List[str],
        reference_answers: Optional[List[str]] = None
    ) -> List[Dict]:
        """Grade several question/answer pairs in order.

        Args:
            questions: Questions to grade.
            candidate_answers: One candidate answer per question.
            reference_answers: Optional gold answers, one per question.
                ``None`` or an empty list means no references.

        Returns:
            A list of ``{"question": ..., "evaluation": ...}`` dicts.

        Raises:
            ValueError: If the input lists do not have matching lengths.
        """
        # Fail loudly instead of letting zip() silently drop unmatched items.
        if len(questions) != len(candidate_answers):
            raise ValueError(
                f"questions ({len(questions)}) and candidate_answers "
                f"({len(candidate_answers)}) must have the same length"
            )
        if reference_answers and len(reference_answers) != len(questions):
            raise ValueError(
                f"reference_answers ({len(reference_answers)}) must match "
                f"questions ({len(questions)}) in length"
            )

        references = reference_answers or [None] * len(questions)
        return [
            {
                "question": question,
                "evaluation": self.evaluate_with_reference(question, candidate, reference)
            }
            for question, candidate, reference in zip(questions, candidate_answers, references)
        ]

In [8]:
def _render_bar(value, per_cell=1, width=10):
    """Return a ``width``-cell progress bar with ``value // per_cell`` cells filled.

    Non-numeric values render as an empty bar; the fill is clamped to
    ``0..width`` so the bar always has exactly ``width`` cells.
    """
    try:
        filled = int(float(value) // per_cell)
    except (TypeError, ValueError):
        filled = 0
    filled = max(0, min(width, filled))
    return "█" * filled + "░" * (width - filled)


def print_detailed_evaluation(eval_result: Dict):
    """Pretty-print a judge verdict to stdout.

    Args:
        eval_result: Either an error dict (contains ``"error"``) or a verdict
            with ``rubric_scores``, ``raw_average``, ``weighted_score``,
            ``strengths``, ``weaknesses``, ``improvements`` and
            ``overall_assessment``; ``reference_alignment`` is optional.

    Raises:
        KeyError: If a required verdict key is missing.
    """
    if "error" in eval_result:
        print(f"❌ Error: {eval_result['error']}")
        return

    print("\n" + "=" * 70)
    print("📊 DETAILED EVALUATION REPORT")
    print("=" * 70)

    # Scores by rubric
    print("\n📋 RUBRIC SCORES:")
    for criterion, details in eval_result['rubric_scores'].items():
        score = details['score']
        # Scores arrive as JSON numbers and may be floats; the bar floors them.
        bar = _render_bar(score)
        print(f"\n  {criterion.upper():<15} [{bar}] {score}/10")
        print(f"  └─ {details['justification']}")

    # Overall scores
    print("\n🎯 OVERALL SCORES:")
    print(f"  Raw Average:     {eval_result['raw_average']:.2f}/10")
    print(f"  Weighted Score:  {eval_result['weighted_score']:.2f}/10")

    # Compare against None so a legitimate 0% alignment is still reported.
    alignment = eval_result.get('reference_alignment')
    if alignment is not None:
        bar = _render_bar(alignment, per_cell=10)
        print(f"  Reference Match: [{bar}] {alignment}%")

    # Strengths
    print("\n✅ STRENGTHS:")
    for strength in eval_result['strengths']:
        print(f"  • {strength}")

    # Weaknesses
    print("\n⚠️  WEAKNESSES:")
    for weakness in eval_result['weaknesses']:
        print(f"  • {weakness}")

    # Improvements
    print("\n💡 IMPROVEMENT SUGGESTIONS:")
    for improvement in eval_result['improvements']:
        print(f"  • {improvement}")

    # Overall assessment
    print("\n📝 OVERALL ASSESSMENT:")
    print(f"  {eval_result['overall_assessment']}")

    print("\n" + "=" * 70)


In [11]:
# Demo driver for the multi-aspect judge: grades one generated answer against
# a reference, then runs a small batch. Every generate/evaluate call below
# hits the OpenAI API through the module-level client.
print("=" * 70)
print("PHASE 4: MULTI-ASPECT JUDGE WITH REFERENCE ANSWERS")
print("=" * 70)

judge = MultiAspectJudge()

# Example 1: Evaluation with reference answer
print("\n\n" + "=" * 70)
print("EXAMPLE 1: Evaluation with Reference Answer")
print("=" * 70)

question = "What is gradient descent?"

reference = """Gradient descent is an optimization algorithm used to minimize 
    a cost function by iteratively moving in the direction of steepest descent 
    as defined by the negative of the gradient. It updates parameters by 
    subtracting the gradient multiplied by a learning rate."""

# Generate a candidate answer
candidate = judge.generate_answer(question)

print(f"\n❓ Question: {question}\n")
print(f"📚 Reference Answer:\n{reference}\n")
print(f"💬 Candidate Answer:\n{candidate}\n")

# Evaluate
evaluation = judge.evaluate_with_reference(question, candidate, reference)
print_detailed_evaluation(evaluation)

# Example 2: Batch evaluation
print("\n\n" + "=" * 70)
print("EXAMPLE 2: Batch Evaluation")
print("=" * 70)

questions = [
    "What is overfitting?",
    "Explain the bias-variance tradeoff",
]

print("\n📝 Generating candidate answers...")
candidates = [judge.generate_answer(q) for q in questions]

# One reference per question, in the same order as `questions`.
references = [
    "Overfitting occurs when a model learns training data too well, including noise, reducing generalization to new data.",
    "Bias-variance tradeoff: high bias models underfit (too simple), high variance models overfit (too complex). Goal is to balance both.",
]

results = judge.batch_evaluate(questions, candidates, references)

for i, result in enumerate(results, 1):
    print(f"\n{'='*70}")
    print(f"QUESTION {i}: {result['question']}")
    print(f"{'='*70}")
    print_detailed_evaluation(result['evaluation'])

print("\n" + "=" * 70)
print("✅ Batch evaluation complete!")
print("=" * 70)

PHASE 4: MULTI-ASPECT JUDGE WITH REFERENCE ANSWERS


EXAMPLE 1: Evaluation with Reference Answer

❓ Question: What is gradient descent?

📚 Reference Answer:
Gradient descent is an optimization algorithm used to minimize 
    a cost function by iteratively moving in the direction of steepest descent 
    as defined by the negative of the gradient. It updates parameters by 
    subtracting the gradient multiplied by a learning rate.

💬 Candidate Answer:
Gradient descent is an optimization algorithm used to minimize a function by iteratively moving towards the lowest point of that function. It is widely used in machine learning and statistics, particularly for training models, such as linear regression and neural networks.

### How It Works:

1. **Objective Function**: You start with a cost function (or loss function) that measures how well a particular model fits the data. The goal is to minimize this function.

2. **Initialization**: The algorithm begins with an initial guess fo