In [None]:
pip install openai pandas tenacity



In [None]:
import json
import pandas as pd
import os
import time
from datetime import datetime
import numpy as np
import re
from google.colab import drive
from openai import OpenAI

class O1Evaluator:
    """Run a multiple-choice benchmark against the OpenAI o1-preview model.

    Construct with the path to a JSON file of questions, call
    ``evaluate_model`` with an API key, and read the returned metrics.
    Each question is a dict providing the keys ``id``, ``question``,
    ``option_a`` .. ``option_d`` and ``correct_answer``.
    """

    # Single source of truth for the model identifier used in requests,
    # per-question records and the summary.
    MODEL_NAME = "o1-preview-2024-09-12"

    # Column order of the per-question CSV; also lets an empty run produce
    # a well-formed (header-only) file.
    RESULT_COLUMNS = (
        'question_id', 'model', 'question_input', 'model_raw_output', 'answer',
        'correct_answer', 'correct', 'time', 'tokens', 'error',
    )

    # The whole response is one letter (any case), ignoring punctuation: "b", "(C)".
    _SOLE_LETTER = re.compile(r"[\W_]*([A-Da-d])[\W_]*")
    # The response starts with an upper-case letter, optionally wrapped in
    # markdown/brackets: "B", "**B**", "(B)", "B- ...", "B. ...".
    _LEADING_LETTER = re.compile(r"[\s*_`(\[]*([ABCD])(?![A-Za-z0-9])")
    # An explicit statement such as "Answer: B", "the answer is **D**",
    # "La respuesta correcta es la B" or "opción C". Only the marker words
    # are case-insensitive; the letter itself must be upper-case so ordinary
    # lowercase words ("a", "de", ...) are never mistaken for an answer.
    _MARKED_LETTER = re.compile(
        r"(?i:\b(?:answer|respuesta|option|opci[oó]n)\b"
        r"(?:\s+(?:is|es|correcta|correct|la))*)"
        r"\s*[:\-]?\s*[*_`(\[]*([ABCD])(?![A-Za-z0-9])"
    )

    def __init__(self, questions_file: str, max_consecutive_errors: int = 5):
        """
        Initialize evaluator with questions file and error threshold.

        questions_file: path to the JSON file holding the questions
        max_consecutive_errors: a *test run* stops early once this many
            questions in a row fail (API error or unparseable answer);
            full runs never abort early
        """
        self.max_consecutive_errors = max_consecutive_errors
        self.load_questions(questions_file)
        self.results = []

    def load_questions(self, file_path: str):
        """Load the questions from a UTF-8 JSON file into ``self.questions``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            self.questions = json.load(f)
        print(f"Loaded {len(self.questions)} questions")

    def format_prompt(self, question):
        """Build the single-message chat prompt for one question dict."""
        return [
            {"role": "user", "content": f"""Answer this multiple choice question. Start your response with the letter of the correct answer (A, B, C, or D), then explain your choice.

Question: {question['question']}

A- {question['option_a']}
B- {question['option_b']}
C- {question['option_c']}
D- {question['option_d']}"""}
        ]

    def extract_answer(self, raw_answer):
        """Return the answer letter ('A'-'D') stated in a response, or None.

        Tries, in order: a response that is just one letter, a response that
        starts with an upper-case letter (the format the prompt asks for), and
        an explicit "answer is X" / "respuesta ... X" statement. Anything else
        (including None, empty text, or free text without a clear choice)
        yields None so the caller can flag it as an invalid format instead of
        silently guessing from stray letters in the explanation.
        """
        if not isinstance(raw_answer, str):
            return None
        text = raw_answer.strip()
        if not text:
            return None

        sole = self._SOLE_LETTER.fullmatch(text)
        if sole:
            return sole.group(1).upper()

        leading = self._LEADING_LETTER.match(text)
        if leading:
            return leading.group(1)

        marked = self._MARKED_LETTER.search(text)
        if marked:
            return marked.group(1)
        return None

    def evaluate_model(self, api_key: str, test_run: bool = False):
        """
        Evaluate questions using O1 model.

        api_key: OpenAI API key used to create the client
        test_run: if True, only evaluates first 5 questions and aborts once
            ``max_consecutive_errors`` questions in a row have failed

        Appends one record per question to ``self.results`` and returns the
        metrics dict produced by ``save_results``.
        """
        client = OpenAI(api_key=api_key)

        questions_to_evaluate = self.questions[:5] if test_run else self.questions
        total = len(questions_to_evaluate)
        consecutive_errors = 0

        for number, question in enumerate(questions_to_evaluate, start=1):
            try:
                result = self._query_model(client, question)
                succeeded = True
            except Exception as exc:
                # Per-question boundary: one failed request (network, rate
                # limit, malformed question) is recorded and must not abort
                # the remaining questions.
                print(f"Error on question {number}: {exc}")
                result = self._failed_result(question, str(exc))
                succeeded = False

            self.results.append(result)

            # An unparseable answer counts as an error just like an API failure.
            if result['answer'] is None:
                consecutive_errors += 1
            else:
                consecutive_errors = 0

            if succeeded:
                print(f"\nEvaluation:")
                print(f"Question {number}/{total}")
                print(f"Raw answer: {result['model_raw_output']}")
                print(f"Extracted answer: {result['answer'] or 'invalid'}")
                print(f"Correct answer: {result['correct_answer']}")
                print(f"Time: {result['time']:.2f}s")
                print(f"Tokens: {result['tokens']}")
                print("-" * 40)

                # Add delay to respect rate limits
                time.sleep(1)

            if test_run and consecutive_errors >= self.max_consecutive_errors:
                print(f"\nAborting test run: {consecutive_errors} consecutive errors")
                break

        return self.save_results()

    def _query_model(self, client, question):
        """Send one question to the model and return its result record."""
        prompt = self.format_prompt(question)

        print("\nSending prompt:")
        for msg in prompt:
            print(f"{msg['role']}: {msg['content']}")

        # Time only the request itself, not the console output above.
        start_time = time.time()
        response = client.chat.completions.create(
            model=self.MODEL_NAME,
            messages=prompt,
            seed=42  # Add deterministic behavior
        )
        elapsed = time.time() - start_time

        # The message content may be None; treat that as an empty
        # (invalid-format) answer rather than an opaque AttributeError.
        raw_answer = (response.choices[0].message.content or "").strip()
        print(f"\nComplete model response object:")
        for key, value in response.model_dump().items():
            print(f"{key}: {value}")

        model_answer = self.extract_answer(raw_answer)
        if model_answer is None:
            print(f"Warning: Could not extract answer from response: {raw_answer}")

        usage = getattr(response, 'usage', None)
        return {
            'question_id': question['id'],
            'model': self.MODEL_NAME,
            'question_input': question['question'],
            'model_raw_output': raw_answer,
            'answer': model_answer,
            'correct_answer': question['correct_answer'],
            'correct': model_answer is not None and model_answer == question['correct_answer'],
            'time': elapsed,
            'tokens': getattr(usage, 'total_tokens', None),
            'error': None if model_answer is not None else f"Invalid format: {raw_answer}"
        }

    def _failed_result(self, question, error_msg):
        """Build the result record for a question whose request raised."""
        # .get() so that a question with missing keys (possibly the very cause
        # of the failure) cannot raise again while the error is being recorded.
        return {
            'question_id': question.get('id'),
            'model': self.MODEL_NAME,
            'question_input': question.get('question'),
            'model_raw_output': None,
            'answer': None,
            'correct_answer': question.get('correct_answer'),
            'correct': False,
            'time': None,
            'tokens': None,
            'error': error_msg
        }

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of a list, or None when it is empty."""
        return sum(values) / len(values) if values else None

    def save_results(self, output_dir: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/evaluations'):
        """Save results and return summary metrics.

        Writes ``eval_o1_<timestamp>_results.csv`` (one row per question) and
        ``eval_o1_<timestamp>_summary.json`` into ``output_dir`` (created if
        missing), prints the summary, and returns the metrics dict.

        ``accuracy`` is measured over all attempted questions;
        ``accuracy_valid`` only over questions with a parseable answer (None
        when there are none). Averages are None when no value was recorded.
        """
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Explicit columns keep the CSV layout stable even with zero results.
        df = pd.DataFrame(self.results, columns=list(self.RESULT_COLUMNS))

        # Metrics are computed in plain Python so they are JSON-serializable
        # as-is and never become NaN.
        total = len(self.results)
        valid = sum(1 for r in self.results if r.get('answer') is not None)
        errors = sum(1 for r in self.results if r.get('error') is not None)
        correct = sum(1 for r in self.results if r.get('correct'))
        times = [r['time'] for r in self.results if r.get('time') is not None]
        token_counts = [r['tokens'] for r in self.results if r.get('tokens') is not None]

        metrics = {
            'model': self.MODEL_NAME,
            'total_questions': total,
            'completed': total - errors,
            'errors': errors,
            'correct': correct,
            'accuracy': correct / total if total else 0.0,
            'accuracy_valid': correct / valid if valid else None,
            'avg_time': self._mean(times),
            'avg_tokens': self._mean(token_counts),
            'invalid_formats': total - valid,
            'valid_responses': valid
        }

        # Save results
        base_path = os.path.join(output_dir, f"eval_o1_{timestamp}")
        df.to_csv(f"{base_path}_results.csv", index=False)

        with open(f"{base_path}_summary.json", 'w', encoding='utf-8') as f:
            json.dump(metrics, f, indent=2)

        # Print summary
        print("\nEvaluation Summary:")
        print(f"Model: O1")
        print(f"Total Questions: {metrics['total_questions']}")
        print(f"Valid Responses: {metrics['valid_responses']}")
        print(f"Invalid Formats: {metrics['invalid_formats']}")
        print(f"Correct Answers: {metrics['correct']}")
        print(f"Accuracy (of all questions): {metrics['accuracy']:.2%}")
        if metrics['accuracy_valid'] is not None:
            print(f"Accuracy (of valid): {metrics['accuracy_valid']:.2%}")
        if metrics['avg_time'] is not None:
            print(f"Avg Time: {metrics['avg_time']:.2f}s")
        if metrics['avg_tokens'] is not None:
            print(f"Avg Tokens: {metrics['avg_tokens']:.1f}")

        return metrics


def run_test_evaluation(
    api_key: str,
    questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json',
):
    """Run a test evaluation with just 5 questions.

    api_key: OpenAI API key forwarded to the evaluator
    questions_file: path to the questions JSON; defaults to the project's
        Google Drive location so existing callers keep working

    Returns the summary metrics produced by the evaluator.
    """
    evaluator = O1Evaluator(questions_file)
    return evaluator.evaluate_model(api_key, test_run=True)

def run_full_evaluation(
    api_key: str,
    questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json',
):
    """Run the full evaluation over every question in the file.

    api_key: OpenAI API key forwarded to the evaluator
    questions_file: path to the questions JSON; defaults to the project's
        Google Drive location so existing callers keep working

    Returns the summary metrics produced by the evaluator.
    """
    evaluator = O1Evaluator(questions_file)
    return evaluator.evaluate_model(api_key)

# First install required packages
!pip install -q openai

# Mount Google Drive
drive.mount('/content/drive')

# Get API key
api_key = input("Enter your OpenAI API key: ")

# First run a test with 5 questions
print("\nRunning test evaluation with 5 questions...")
test_metrics = run_test_evaluation(api_key)

# Ask user if they want to continue
if input("\nContinue with full evaluation? (y/n): ").lower() == 'y':
    print("\nRunning full evaluation...")
    full_metrics = run_full_evaluation(api_key)

Streaming output truncated to the last 5000 lines.
**Explicación:**

En el tratamiento farmacológico de la manía aguda en el trastorno bipolar, los antipsicóticos atípicos (como risperidona, olanzapina, quetiapina y aripiprazol) han demostrado ser efectivos tanto cuando se utilizan solos (monoterapia) como cuando se combinan con estabilizadores del ánimo (eutimizantes) como el litio o el valproato. Estos medicamentos pueden ayudar a controlar rápidamente los síntomas maníacos, y su uso en combinación puede potenciar el efecto terapéutico y permitir el uso de dosis más bajas de cada fármaco, reduciendo así el riesgo de efectos secundarios. Por lo tanto, la opción B es la respuesta correcta.

**Análisis de las otras opciones:**

- **Opción A:** Aunque el litio y el valproato son ambos efectivos en el tratamiento de la manía aguda, la evidencia no es concluyente en cuanto a que el litio sea igual o ligeramente más eficaz que el valproato. La eficacia puede variar según el pa