In [None]:
import json
import pandas as pd
import os
import time
from datetime import datetime
import numpy as np
import re
from google.colab import drive
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class FLANT5QAEvaluator:
    """Evaluate a seq2seq (FLAN-T5 style) model on multiple-choice questions.

    The evaluator loads a JSON list of question records, prompts the model
    with a few-shot template, extracts an answer letter (A-D) from the
    generated text and writes per-question results plus summary metrics to
    disk.

    Each question record is read through the keys ``question``, ``options``
    (a mapping with keys ``A``..``D``), ``correct_answer`` and, optionally,
    ``id``.
    """

    # Letters accepted as a well-formed answer.
    _VALID_LETTERS = ('A', 'B', 'C', 'D')

    # Columns of the per-question results table. Passing them explicitly to
    # pandas keeps the table well-formed even when no question was evaluated.
    _RESULT_COLUMNS = (
        'question_id', 'model', 'question', 'full_response',
        'extracted_letter', 'correct_answer', 'correct', 'time', 'error',
    )

    # Answer letter at the very start of the response ("b", "C)", "(d) ...").
    _LEADING_LETTER_RE = re.compile(r'^\W*([A-D])\b', re.IGNORECASE)
    # Stand-alone capital letter anywhere in the response ("The answer is B").
    _STANDALONE_LETTER_RE = re.compile(r'\b([A-D])\b')

    def __init__(self, questions_file: str, model_name: str = "google/flan-t5-large", max_consecutive_errors: int = 5):
        """Load the questions, the tokenizer and the model.

        Args:
            questions_file: Path to a UTF-8 JSON file containing the list of
                question records.
            model_name: Hugging Face model id or local directory passed to
                ``from_pretrained``.
            max_consecutive_errors: Number of consecutive failing questions
                after which ``evaluate_model`` stops early. A value of 0 (or
                any falsy value) disables the early stop.
        """
        self.max_consecutive_errors = max_consecutive_errors
        self.model_name = model_name
        self.load_questions(questions_file)
        self.results = []

        print(f"Loading {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        print(f"Model loaded and moved to {self.device}")

    def load_questions(self, file_path: str):
        """Read the question list from ``file_path`` into ``self.questions``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            self.questions = json.load(f)
        print(f"Loaded {len(self.questions)} questions")

    def format_prompt(self, question):
        """Build the few-shot prompt for one question record.

        Missing ``question`` text or option letters are rendered as empty
        strings rather than raising.
        """
        question_text = question.get('question', '')
        options = question.get('options', {})

        # Format with examples and clear instructions
        prompt = f"""Answer these multiple choice questions by choosing the correct letter (A, B, C, or D).
Each example shows the question, options, and correct answer.

Example 1:
Question: What is the capital of France?
Options:
A) London
B) Madrid
C) Paris
D) Berlin
Answer: C

Example 2:
Question: Which organ pumps blood through the body?
Options:
A) Liver
B) Heart
C) Kidney
D) Lung
Answer: B

Now answer this medical question:
Question: {question_text}
Options:
A) {options.get('A', '')}
B) {options.get('B', '')}
C) {options.get('C', '')}
D) {options.get('D', '')}
Answer:"""

        return prompt

    @classmethod
    def _extract_letter(cls, text):
        """Return the answer letter found in ``text``, or ``None``.

        A letter only counts when it stands on its own, so characters that
        merely occur inside a word ("Answer", "Berlin") are never mistaken
        for an answer. A letter opening the response is accepted in either
        case; elsewhere in the text only a capital letter is accepted, to
        avoid picking up ordinary lowercase words such as "a".
        """
        if not text:
            return None

        leading = cls._LEADING_LETTER_RE.match(text)
        if leading:
            return leading.group(1).upper()

        standalone = cls._STANDALONE_LETTER_RE.search(text)
        return standalone.group(1) if standalone else None

    def get_model_response(self, prompt):
        """Generate the model's answer for ``prompt``.

        Returns:
            A dict with ``full_response`` (the decoded generation) and
            ``extracted_letter`` (``'A'``..``'D'`` or ``None`` when no
            stand-alone answer letter could be found).
        """
        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            # Plain beam search without sampling: an evaluation must give the
            # same answer for the same question on every run, otherwise the
            # reported accuracy is not reproducible.
            outputs = self.model.generate(
                **inputs,
                max_length=50,  # Long enough to see the full response
                num_beams=4,
                do_sample=False,
                early_stopping=False
            )

        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return {
            'full_response': full_response,
            'extracted_letter': self._extract_letter(full_response)
        }

    def evaluate_model(self, test_run: bool = False):
        """Evaluate the loaded questions and save the results.

        Args:
            test_run: When true, only the first five questions are evaluated.

        Returns:
            The summary metrics dict produced by ``save_results``.

        A question that raises is recorded with its error message instead of
        aborting the run; after ``max_consecutive_errors`` failures in a row
        the loop stops early and the results gathered so far are saved.
        """
        questions_to_evaluate = self.questions[:5] if test_run else self.questions
        consecutive_errors = 0

        for i, question in enumerate(questions_to_evaluate):
            try:
                start_time = time.time()
                prompt = self.format_prompt(question)

                print(f"\nProcessing Question {i+1}/{len(questions_to_evaluate)}:")
                print(f"Question: {question['question']}")

                response = self.get_model_response(prompt)
                end_time = time.time()

                model_answer = response['extracted_letter']
                is_correct = model_answer is not None and model_answer == question['correct_answer']

                result = {
                    'question_id': question.get('id', f'Q{i+1}'),
                    'model': self.model_name,
                    'question': question['question'],
                    'full_response': response['full_response'],
                    'extracted_letter': model_answer,
                    'correct_answer': question['correct_answer'],
                    'correct': is_correct,
                    'time': end_time - start_time,
                    'error': None if model_answer in self._VALID_LETTERS else 'Invalid answer format'
                }

                self.results.append(result)
                consecutive_errors = 0

                print("\nResults:")
                print(f"Full model response: {response['full_response']}")
                print(f"Extracted answer: {model_answer}")
                print(f"Correct answer: {question['correct_answer']}")
                print(f"Correct: {is_correct}")
                print(f"Time: {result['time']:.2f}s")
                print("-" * 40)

            except Exception as e:
                error_msg = str(e)
                print(f"Error on question {i+1}: {error_msg}")

                self.results.append({
                    'question_id': question.get('id', f'Q{i+1}'),
                    'model': self.model_name,
                    'question': question.get('question', ''),
                    'full_response': None,
                    'extracted_letter': None,
                    'correct_answer': question.get('correct_answer', ''),
                    'correct': False,
                    'time': None,
                    'error': error_msg
                })

                # A long streak of failures usually means a systematic problem
                # (bad model, bad data), so stop instead of grinding through
                # the remaining questions.
                consecutive_errors += 1
                if self.max_consecutive_errors and consecutive_errors >= self.max_consecutive_errors:
                    print(f"Stopping early after {consecutive_errors} consecutive errors")
                    break

        return self.save_results()

    def save_results(self, output_dir: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/evaluations'):
        """Write results to disk and return the summary metrics.

        Two files are created in ``output_dir`` (which is created if needed):
        ``eval_fulltext_<model>_<timestamp>_results.csv`` with one row per
        question and ``..._summary.json`` with the aggregated metrics.

        Returns:
            Dict with the keys ``model``, ``total_questions``, ``completed``,
            ``errors``, ``correct``, ``accuracy``, ``valid_accuracy``,
            ``avg_time``, ``valid_responses`` and ``invalid_responses``.
        """
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Explicit columns keep the lookups below valid for an empty run.
        df = pd.DataFrame(self.results, columns=list(self._RESULT_COLUMNS))

        # Calculate metrics
        valid_answers = df[df['extracted_letter'].isin(self._VALID_LETTERS)]
        total = len(df)
        n_valid = len(valid_answers)
        n_correct = int(df['correct'].sum())

        metrics = {
            'model': self.model_name,
            'total_questions': int(total),
            'completed': int(n_valid),
            'errors': int(total - n_valid),
            'correct': n_correct,
            'accuracy': float(n_correct / total) if total > 0 else 0.0,
            'valid_accuracy': float(valid_answers['correct'].sum() / n_valid) if n_valid > 0 else 0.0,
            'avg_time': float(df['time'].mean()) if not df['time'].isna().all() else 0.0,
            'valid_responses': int(n_valid),
            'invalid_responses': int(total - n_valid)
        }

        # Last path component of the model id/path; normpath drops a trailing
        # slash so a local directory such as ".../my_model/" still yields
        # "my_model" rather than an empty name.
        model_short_name = os.path.basename(os.path.normpath(self.model_name))
        base_path = f"{output_dir}/eval_fulltext_{model_short_name}_{timestamp}"

        # Save full results including raw responses
        df.to_csv(f"{base_path}_results.csv", index=False)

        with open(f"{base_path}_summary.json", 'w', encoding='utf-8') as f:
            json.dump(metrics, f, indent=2)

        print("\nEvaluation Summary:")
        print(f"Model: {self.model_name}")
        print(f"Total Questions: {metrics['total_questions']}")
        print(f"Valid Responses: {metrics['valid_responses']}")
        print(f"Invalid Responses: {metrics['invalid_responses']}")
        print(f"Correct Answers: {metrics['correct']}")
        print(f"Overall Accuracy: {metrics['accuracy']:.2%}")
        print(f"Valid Answers Accuracy: {metrics['valid_accuracy']:.2%}")
        if metrics['avg_time'] > 0:
            print(f"Avg Time per Question: {metrics['avg_time']:.2f}s")

        return metrics

def run_test_evaluation(
    model_name: str,
    questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json',
):
    """Run a quick smoke-test evaluation on the first 5 questions.

    Args:
        model_name: Model id or local model directory handed to
            ``FLANT5QAEvaluator``.
        questions_file: Path to the questions JSON file. Defaults to the
            dataset location on the mounted Google Drive, so existing
            single-argument calls behave as before.

    Returns:
        The summary metrics returned by ``evaluate_model(test_run=True)``.
    """
    evaluator = FLANT5QAEvaluator(questions_file, model_name)
    return evaluator.evaluate_model(test_run=True)

def run_full_evaluation(
    model_name: str,
    questions_file: str = '/content/drive/MyDrive/TFM2/TFM-DATASETS/structured_questions.json',
):
    """Run the evaluation over every question in the questions file.

    Args:
        model_name: Model id or local model directory handed to
            ``FLANT5QAEvaluator``.
        questions_file: Path to the questions JSON file. Defaults to the
            dataset location on the mounted Google Drive, so existing
            single-argument calls behave as before.

    Returns:
        The summary metrics returned by ``evaluate_model()``.
    """
    evaluator = FLANT5QAEvaluator(questions_file, model_name)
    return evaluator.evaluate_model()

# Notebook driver: install dependencies, mount Drive, run a 5-question smoke
# test and, on confirmation, the full evaluation.
#
# Install required packages (IPython shell escape; only valid in a notebook).
# NOTE(review): the imports of transformers/torch at the top of this cell run
# before this line, so on a fresh runtime lacking them the cell fails before
# the install happens - consider moving the install to an earlier cell.
!pip install -q transformers torch

# Mount Google Drive so the /content/drive/... paths used below resolve
drive.mount('/content/drive')

# Example usage with the fulltext model stored on Drive
model_path = "/content/drive/MyDrive/TFM2/models/mir_flan_t5_fulltext"  # Path to fulltext model
print("\nRunning test evaluation with 5 questions...")
test_metrics = run_test_evaluation(model_path)

# Ask before the full run; only an answer of exactly "y"/"Y" continues.
# Each run builds a new evaluator, so the model is loaded again here.
if input("\nContinue with full evaluation? (y/n): ").lower() == 'y':
    print("\nRunning full evaluation...")
    full_metrics = run_full_evaluation(model_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Running test evaluation with 5 questions...
Loaded 174 questions
Loading /content/drive/MyDrive/TFM2/models/mir_flan_t5_fulltext...
Model loaded and moved to cpu

Processing Question 1/5:
Question: En relación con el metabolismo del hierro y su control mediado por hepcidina, es cierto que:

Results:
Full model response: A
Extracted answer: A
Correct answer: B
Correct: False
Time: 1.99s
----------------------------------------

Processing Question 2/5:
Question: ¿Cuál de las siguientes alteraciones bioquímicas se observa en el reordenamiento metabólico de la diabetes mellitus?:

Results:
Full model response: A
Extracted answer: A
Correct answer: C
Correct: False
Time: 1.16s
----------------------------------------

Processing Question 3/5:
Question: Una persona sana sufre una deshidratación tras realizar ejercicio físico intenso a altas temperaturas sin inger