In [None]:
pip install anthropic

Collecting anthropic
  Downloading anthropic-0.43.0-py3-none-any.whl.metadata (23 kB)
Downloading anthropic-0.43.0-py3-none-any.whl (207 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.9/207.9 kB 4.9 MB/s eta 0:00:00
Installing collected packages: anthropic
Successfully installed anthropic-0.43.0


In [None]:
import json
import os
import re
import time
from datetime import datetime
from getpass import getpass

import numpy as np
import pandas as pd
from anthropic import Anthropic

class ClaudeEvaluator:
    """Run a multiple-choice question set against a Claude model and score it.

    Questions are loaded from a JSON file. The code reads the keys ``id``,
    ``question``, ``option_a`` .. ``option_d`` and ``correct_answer`` from each
    entry. One result record per question is accumulated in ``self.results``
    and written out by ``save_results``.
    """

    # A leading option letter followed by a word boundary, a period, or the
    # end of the text. Compiled once instead of on every loop iteration.
    _ANSWER_PATTERN = re.compile(r"^\s*([ABCD])(\b|\.|$)", re.IGNORECASE)

    def __init__(self, questions_file: str, max_consecutive_errors: int = 5):
        """
        Initialize evaluator with questions file and error threshold.

        questions_file: path to the JSON file holding the questions
        max_consecutive_errors: stops evaluation if this many errors occur in a row
        """
        self.max_consecutive_errors = max_consecutive_errors
        self.load_questions(questions_file)
        self.results = []

    def load_questions(self, file_path: str):
        """Read the question list from ``file_path`` into ``self.questions``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            self.questions = json.load(f)
        print(f"Loaded {len(self.questions)} questions")

    def format_prompt(self, question):
        """Build the user prompt (in Spanish) for a single question dict."""
        return f"""Por favor, responde con la letra (A, B, C, o D) que corresponde a la respuesta correcta.

Pregunta: {question['question']}

A. {question['option_a']}
B. {question['option_b']}
C. {question['option_c']}
D. {question['option_d']}"""

    @classmethod
    def _extract_answer(cls, raw_answer: str) -> str:
        """Return the upper-cased option letter that starts ``raw_answer``.

        Raises:
            ValueError: if the text does not start with a standalone A/B/C/D.
        """
        match = cls._ANSWER_PATTERN.search(raw_answer)
        if match is None:
            raise ValueError(f"Invalid answer format: {raw_answer}")
        return match.group(1).upper()

    def evaluate_model(self, api_key: str, model_name: str = "claude-3-5-sonnet-20241022", test_run: bool = False):
        """
        Evaluate questions using specified Claude model.

        api_key: Anthropic API key used to build the client
        model_name: model identifier passed to the Messages API
        test_run: if True, only evaluates first 5 questions

        Every question produces one record in ``self.results``. When a call
        succeeds but the reply cannot be parsed, the record still keeps the raw
        reply, latency and token usage so the failure can be inspected later.
        The loop aborts after ``max_consecutive_errors`` failures in a row.

        Returns the metrics dict produced by ``save_results``.
        """
        client = Anthropic(api_key=api_key)
        consecutive_errors = 0

        questions_to_evaluate = self.questions[:5] if test_run else self.questions

        for question in questions_to_evaluate:
            # Filled in as soon as they are known so the error path can report them.
            raw_answer = None
            elapsed = None
            tokens = None

            try:
                start_time = time.perf_counter()
                prompt = self.format_prompt(question)

                response = client.messages.create(
                    model=model_name,
                    messages=[{
                        "role": "user",
                        "content": prompt
                    }],
                    system="You are an AI that answers multiple choice questions. You MUST ONLY respond with exactly one letter: A, B, C, or D. No explanations or additional text.",
                    max_tokens=1024,
                    temperature=0
                )
                elapsed = time.perf_counter() - start_time

                raw_answer = response.content[0].text.strip()
                tokens = response.usage.input_tokens + response.usage.output_tokens

                # Extract just the letter; raises ValueError on anything else.
                model_answer = self._extract_answer(raw_answer)

                result = {
                    'question_id': question['id'],
                    'model': model_name,
                    'question_input': question['question'],
                    'model_raw_output': raw_answer,
                    'answer': model_answer,
                    'correct_answer': question['correct_answer'],
                    'correct': model_answer == question['correct_answer'],
                    'time': elapsed,
                    'tokens': tokens,
                    'error': None
                }

                self.results.append(result)
                consecutive_errors = 0

                print(f"Answer: {model_answer}, Correct: {question['correct_answer']}")
                print(f"Time: {result['time']:.2f}s, Tokens: {result['tokens']}")

                # Add delay to respect rate limits
                time.sleep(1)

            except Exception as e:
                # Deliberately broad: one failing question must not lose the
                # whole run. The failure is recorded and counted as incorrect.
                error_msg = str(e)
                print(f"Error: {error_msg}")

                self.results.append({
                    'question_id': question['id'],
                    'model': model_name,
                    'question_input': question['question'],
                    'model_raw_output': raw_answer,
                    'answer': None,
                    'correct_answer': question['correct_answer'],
                    'correct': False,
                    'time': elapsed,
                    'tokens': tokens,
                    'error': error_msg
                })

                consecutive_errors += 1
                if consecutive_errors >= self.max_consecutive_errors:
                    print(f"\nAborting: {consecutive_errors} consecutive errors")
                    break

        return self.save_results()

    def save_results(self, output_dir: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/evaluations'):
        """Save results and return summary metrics.

        Writes ``eval_<model>_<timestamp>_results.csv`` and
        ``eval_<model>_<timestamp>_summary.json`` into ``output_dir``.

        If no results have been collected, nothing is written and a zeroed
        metrics dict is returned (``model``, ``accuracy`` and ``avg_time`` are
        ``None``). ``avg_time`` is also ``None`` when no call produced a
        timing, so the JSON summary never contains a bare ``NaN``.
        """
        if not self.results:
            print("\nNo results to save.")
            return {
                'model': None,
                'total_questions': 0,
                'completed': 0,
                'errors': 0,
                'correct': 0,
                'accuracy': None,
                'avg_time': None,
                'total_tokens': 0
            }

        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Convert to DataFrame
        df = pd.DataFrame(self.results)

        # Error records may hold None for time/tokens; coerce to numeric so the
        # aggregates behave the same whatever dtype pandas inferred.
        times = pd.to_numeric(df['time'], errors='coerce')
        tokens = pd.to_numeric(df['tokens'], errors='coerce')
        avg_time = times.mean()

        # Calculate metrics
        metrics = {
            'model': df['model'].iloc[0],
            'total_questions': len(df),
            'completed': df['error'].isna().sum(),
            'errors': df['error'].notna().sum(),
            'correct': df['correct'].sum(),
            'accuracy': df['correct'].mean(),
            'avg_time': None if pd.isna(avg_time) else avg_time,
            'total_tokens': int(tokens.sum())
        }

        # Convert every NumPy scalar to a standard Python type for JSON serialization
        metrics = {k: v.item() if isinstance(v, np.generic) else v for k, v in metrics.items()}

        # Save results
        base_path = os.path.join(output_dir, f"eval_{metrics['model']}_{timestamp}")
        df.to_csv(f"{base_path}_results.csv", index=False)

        with open(f"{base_path}_summary.json", 'w', encoding='utf-8') as f:
            json.dump(metrics, f, indent=2)

        # Print summary
        print("\nEvaluation Summary:")
        print(f"Model: {metrics['model']}")
        print(f"Questions: {metrics['total_questions']}")
        print(f"Completed: {metrics['completed']}")
        print(f"Errors: {metrics['errors']}")
        print(f"Correct: {metrics['correct']}")
        print(f"Accuracy: {metrics['accuracy']:.2%}")
        if metrics['avg_time'] is not None:
            print(f"Avg Time: {metrics['avg_time']:.2f}s")
        if metrics['total_tokens']:
            print(f"Total Tokens: {metrics['total_tokens']}")

        return metrics


def run_test_evaluation(api_key: str, questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json'):
    """Run a test evaluation with just 5 questions.

    api_key: Anthropic API key forwarded to the evaluator
    questions_file: path to the questions JSON; defaults to the Drive dataset
        location this notebook was written against

    Returns the metrics dict produced by ``ClaudeEvaluator.evaluate_model``.
    """
    evaluator = ClaudeEvaluator(questions_file)
    return evaluator.evaluate_model(api_key, test_run=True)

def run_full_evaluation(api_key: str, questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json'):
    """Run the full evaluation over every question in the file.

    api_key: Anthropic API key forwarded to the evaluator
    questions_file: path to the questions JSON; defaults to the Drive dataset
        location this notebook was written against

    Returns the metrics dict produced by ``ClaudeEvaluator.evaluate_model``.
    """
    evaluator = ClaudeEvaluator(questions_file)
    return evaluator.evaluate_model(api_key)

# Mount Google Drive if needed
from google.colab import drive
drive.mount('/content/drive')

# Get API key. getpass keeps the secret out of the notebook's saved output;
# input() would echo it into the cell output, where it gets stored and shared
# with the notebook.
api_key = getpass("Enter your Anthropic API key: ").strip()

# First run a test with 5 questions
print("\nRunning test evaluation with 5 questions...")
test_metrics = run_test_evaluation(api_key)

# Ask user if they want to continue (tolerate surrounding whitespace)
if input("\nContinue with full evaluation? (y/n): ").strip().lower() == 'y':
    print("\nRunning full evaluation...")
    full_metrics = run_full_evaluation(api_key)


Mounted at /content/drive
Enter your Anthropic API key: [REDACTED: key was exposed in saved cell output; revoke and rotate it]

Running test evaluation with 5 questions...
Loaded 174 questions
Answer: B, Correct: B
Time: 1.01s, Tokens: 299
Answer: C, Correct: C
Time: 0.39s, Tokens: 240
Answer: A, Correct: A
Time: 0.51s, Tokens: 249
Answer: B, Correct: B
Time: 0.61s, Tokens: 242
Answer: C, Correct: C
Time: 0.43s, Tokens: 254

Evaluation Summary:
Model: claude-3-5-sonnet-20241022
Questions: 5
Completed: 5
Errors: 0
Correct: 5
Accuracy: 100.00%
Avg Time: 0.59s
Total Tokens: 1284

Continue with full evaluation? (y/n): y

Running full evaluation...
Loaded 174 questions
Answer: B, Correct: B
Time: 1.13s, Tokens: 299
Answer: C, Correct: C
Time: 0.46s, Tokens: 240
Answer: A, Correct: A
Time: 0.42s, Tokens: 249
Answer: B, Correct: B
Time: 0.50s, Tokens: 242
Answer: C, Correct: C
Time: 0.43s, Tokens: 254
Answer: A, Correct: D
Time: 2.19s, Tokens: 268
Answer: