<a href="https://colab.research.google.com/github/armelida/MELIDA/blob/main/notebooks/MELIDA_Evaluator_V2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Initial Cell:
# Check Runtime & GPU Availability
import torch
import os
import subprocess

def check_runtime():
    """Check whether a GPU or TPU is available."""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        print(f"✅ GPU is enabled! Using: {gpu_name}")
    elif "COLAB_TPU_ADDR" in os.environ:
        print("✅ TPU is enabled!")
    else:
        print("⚠️ WARNING: No GPU or TPU detected. Running on CPU.")
        print("👉 Go to Runtime > Change runtime type > Select GPU/TPU")

def check_gpu():
    """Check GPU details using nvidia-smi if available."""
    try:
        result = subprocess.run(
            ["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        if result.returncode == 0:
            print(result.stdout)
        else:
            print("⚠️ `nvidia-smi` not found. No GPU detected.")
    except FileNotFoundError:
        print("⚠️ No GPU found.")

# Run the checks
check_runtime()
check_gpu()

#  Clone repository and change working directory
!rm -rf MELIDA  # Remove any existing copy (optional)
!git clone https://github.com/armelida/MELIDA.git
%cd MELIDA

👉 Go to Runtime > Change runtime type > Select GPU/TPU
⚠️ No GPU found.
Cloning into 'MELIDA'...
remote: Enumerating objects: 646, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 646 (delta 105), reused 18 (delta 5), pack-reused 480 (from 1)[K
Receiving objects: 100% (646/646), 1.50 MiB | 4.30 MiB/s, done.
Resolving deltas: 100% (376/376), done.
/content/MELIDA


In [2]:
# Cell 0A: Load API Keys & Save API Configuration

# Install necessary libraries if not already present
!pip install -q python-dotenv google-generativeai # Added google-generativeai
# Keep openai pinned if needed, but google-generativeai is separate
!pip install openai==0.28

import os
import json
from dotenv import load_dotenv

# Initialize API keys dictionary - Added 'google'
# In your initial cell (where you load API keys)
api_keys = {"openai": None, "anthropic": None, "together": None, "google": None, "grok": None}

try:
    from google.colab import userdata
    api_keys["openai"] = userdata.get('OPENAI_API_KEY')
    api_keys["anthropic"] = userdata.get('ANTHROPIC_API_KEY')
    api_keys["together"] = userdata.get('TOGETHER_API_KEY')
    api_keys["google"] = userdata.get('GOOGLE_API_KEY')
    api_keys["grok"] = userdata.get('GROK_API_KEY')   # <-- Add this line
    # Update log messages
    if all(v is not None for k, v in api_keys.items() if k not in ['google', 'grok']):
         print("✓ API keys (OpenAI, Anthropic, Together) loaded from Colab secrets")
    if api_keys["google"]:
         print("✓ Google API key loaded from Colab secrets")
    if api_keys["grok"]:
         print("✓ Grok API key loaded from Colab secrets")
except Exception as e:
    print(f"Note: Couldn't load some keys from Colab secrets - {e}")

# Fallback: load from environment variables if not loaded yet
if not all(api_keys.values()):
    api_keys["openai"] = api_keys["openai"] or os.environ.get("OPENAI_API_KEY")
    api_keys["anthropic"] = api_keys["anthropic"] or os.environ.get("ANTHROPIC_API_KEY")
    api_keys["together"] = api_keys["together"] or os.environ.get("TOGETHER_API_KEY")
    api_keys["google"] = api_keys["google"] or os.environ.get("GOOGLE_API_KEY")
    api_keys["grok"] = api_keys["grok"] or os.environ.get("GROK_API_KEY")  # <-- Add this line

# Propagate keys to os.environ so subsequent cells can access them
if api_keys["openai"]:
    os.environ["OPENAI_API_KEY"] = api_keys["openai"]
if api_keys["anthropic"]:
    os.environ["ANTHROPIC_API_KEY"] = api_keys["anthropic"]
if api_keys["together"]:
    os.environ["TOGETHER_API_KEY"] = api_keys["together"]
if api_keys["google"]:
    os.environ["GOOGLE_API_KEY"] = api_keys["google"]
if api_keys["grok"]:
    os.environ["GROK_API_KEY"] = api_keys["grok"]


# Save API configuration to a JSON file for future reference
os.makedirs('config', exist_ok=True)
api_config = {
    "openai": {"api_key": api_keys["openai"] or "YOUR_OPENAI_API_KEY_HERE"},
    "anthropic": {"api_key": api_keys["anthropic"] or "YOUR_ANTHROPIC_API_KEY_HERE"},
    "together": {"api_key": api_keys["together"] or "YOUR_TOGETHER_API_KEY_HERE"},
    "google": {"api_key": api_keys["google"] or "YOUR_GOOGLE_API_KEY_HERE"}
}
with open('config/api_config.json', 'w') as f:
    json.dump(api_config, f, indent=2)

# Report missing keys, if any
missing = []
if not api_keys["openai"]: missing.append("OpenAI")
if not api_keys["anthropic"]: missing.append("Anthropic")
if not api_keys["together"]: missing.append("Together")
if not api_keys["google"]: missing.append("Google")
if missing:
    print(f"⚠️ Missing API keys: {', '.join(missing)}")
    print("👉 Please set the necessary API keys using Colab secrets (recommended), environment variables, or a .env file.")
else:
    print("✓ All required API configurations saved/loaded.")



✓ API keys (OpenAI, Anthropic, Together) loaded from Colab secrets
✓ Google API key loaded from Colab secrets
✓ Grok API key loaded from Colab secrets
✓ All required API configurations saved/loaded.


In [3]:
# Cell 1: Setup environment and install required packages

# Install required packages - Added google-generativeai
!pip install -q pandas PyYAML openai anthropic together transformers google-generativeai

# Import libraries
import os
import time
import json
import pandas as pd
import yaml

# Import model API clients
import openai
# Ensure your OpenAI API key is available (loaded in Cell 0A)

import anthropic
# Anthropic client might be initialized later in call_model or globally here if preferred

import together
# Together client might be initialized later in call_model or globally here if preferred

# --- Add Google Generative AI Import ---
import google.generativeai as genai
# --------------------------------------

# (Optional) If using Hugging Face transformers for local models:
try:
    from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
except ImportError:
    !pip install -q transformers # Ensure transformers is installed if needed
    from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

print("Setup complete. Libraries loaded.")

Setup complete. Libraries loaded.


In [4]:
# Cell 2: Load model registry from external JSON/YAML file

# Set the path to your model registry file in the notebooks folder
model_config_path = "notebooks/models.yaml"

# Check if the file exists
if not os.path.exists(model_config_path):
    raise FileNotFoundError(
        f"Model config file not found at {model_config_path}. "
        "Please create it as per the example and update the path."
    )

# Parse the config file (supports YAML and JSON)
if model_config_path.endswith((".yaml", ".yml")):
    with open(model_config_path, 'r') as f:
        config_data = yaml.safe_load(f)
elif model_config_path.endswith(".json"):
    with open(model_config_path, 'r') as f:
        config_data = json.load(f)
else:
    raise ValueError("Unsupported config file format. Use .json or .yaml")

# The config should either be a dict with a top-level 'models' key or a list itself.
if isinstance(config_data, dict) and "models" in config_data:
    models_config = config_data["models"]
elif isinstance(config_data, list):
    models_config = config_data
else:
    raise ValueError("Config file format error: expected a list of models or a 'models' key.")

print(f"Loaded {len(models_config)} models from registry:")
for m in models_config:
    print(f" - {m.get('name', 'Unnamed')} ({m.get('provider', 'Unknown')}, id={m.get('model_id', 'N/A')})")


Loaded 11 models from registry:
 - o3-mini-2025-01-31 (OpenAI, id=o3-mini-2025-01-31)
 - Claude (Anthropic, id=claude-3-7-sonnet-20250219)
 - deepseek-ai/DeepSeek-V3 (Together, id=deepseek-ai/DeepSeek-V3)
 - deepseek-ai/DeepSeek-R1 (Together, id=deepseek-ai/DeepSeek-R1)
 - meta-llama/Llama-3.3-70B-Instruct-Turbo (Together, id=meta-llama/Llama-3.3-70B-Instruct-Turbo)
 - meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo (Together, id=meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo)
 - mistralai/Mistral-7B-Instruct-v0.2 (Together, id=mistralai/Mistral-7B-Instruct-v0.2)
 - mistralai/Mixtral-8x22B-Instruct-v0.1 (Together, id=mistralai/Mixtral-8x22B-Instruct-v0.1)
 - Qwen/Qwen2-VL-72B-Instruct (Together, id=Qwen/Qwen2-VL-72B-Instruct)
 - grok-2-latest (XAI, id=grok-2-latest)
 - gemini-1.5-pro-latest (google, id=gemini-1.5-pro-latest)


3. Load Standardized Test Questions (Cell 3)
Next, we prepare the standardized test questions for evaluation. These can be hard-coded, loaded from a file, or generated. In this notebook, we’ll define a list of questions in a structured format (each with an ID, question text, multiple-choice options, and the correct answer). You can replace these with any set of questions relevant to your use case. For demonstration, we’ll use a few simple sample questions. In a real scenario, you might load dozens of questions from a JSON/CSV file or an existing dataset. Ensure each question has a known correct answer to compute accuracy.

In [5]:
# Cell 3: Load extracted exam questions for evaluation

import json
import os

# Define the path to the exported questions file.
# Adjust this path if your exported file name or location is different.
questions_file = "data/questions/MIR-2024-v01-t01.json"

if not os.path.exists(questions_file):
    raise FileNotFoundError(
        f"Questions file not found at {questions_file}. "
        "Please run the extraction process to generate the questions file."
    )

with open(questions_file, 'r', encoding='utf-8') as f:
    questions = json.load(f)

print(f"Loaded {len(questions)} questions for evaluation.")

# Optionally, preview the first three questions
for q in questions[:3]:
    print("---------------------------------------------------")
    print(f"ID: {q['id']}")
    print(f"Question: {q['question_text']}")
    print("Options:")
    for key, value in q['options'].items():
        print(f"  {key}: {value}")
    print("---------------------------------------------------")


Loaded 185 questions for evaluation.
---------------------------------------------------
ID: MIR-2024-v01-t01-Q026
Question: Entre los cambios metabólicos que se observan en un paciente con resistencia a insulina existe:
Options:
  A: Incremento de la expresión hepática de genes gluconeogénicos mediado por FOXO1 (forkhead box other) fosforilado.
  B: Descenso en los niveles intracelulares de hexoquinasa 2 dependiente de insulina.
  C: Aumento de la glucogenólisis muscular, contribuyendo al incremento de la glucemia.
  D: Aumento en los niveles séricos de aminoácidos como leucina e isoleucina.
---------------------------------------------------
---------------------------------------------------
ID: MIR-2024-v01-t01-Q027
Question: La deficiencia de acil-CoA-deshidrogenasa provoca una de las siguientes alteraciones bioquímicas:
Options:
  A: Disminución de ácidos dicarboxílicos.
  B: Aumento de la gluconeogénesis.
  C: Disminución de la ureagénesis.
  D: Aumento de carnitina libre.
-----

Detailed comments: We defined a list of dictionaries, where each dictionary represents a question. Each question has:
id: a unique identifier,
question: the text of the question,
choices: a list of answer choices (as strings, each prefixed with a letter),
answer: the correct choice (here represented by the letter of the correct option).
Feel free to extend or replace this list. For example, you could load questions from a file or generate them. Just ensure each question has a known correct answer for scoring.

4. Define Prompt Strategy (Cell 4)
The prompt strategy determines how we present each question to the model. For standardized test questions, a common strategy is to provide the question and multiple-choice options and ask the model to pick the best answer. We might instruct the model to output just the option letter (to make it easier to check correctness). In future, you could experiment with different strategies (e.g., asking for an explanation, chain-of-thought prompting, etc.). For now, we’ll use a straightforward prompt: the question, the options, and a final instruction like “Answer with the letter of the correct option.” We implement this as a function format_prompt(question) that takes a question entry and returns the full prompt text (or structured prompt) to send to the model. This function can be easily modified if you want to change how prompts are constructed.

In [6]:
# Cell 4: Define the prompt formatting strategy

def format_prompt(q):
    """
    Format a question dictionary into a prompt string for the MIR exam in Spanish.

    The prompt instructs the model to answer with a single letter (A, B, C, D) if it knows the answer,
    or with N if unsure.
    """
    # Get the main question text
    question_text = q.get("question_text", "")

    # Get each answer option from the options dictionary (default to empty string if missing)
    options = q.get("options", {})
    option_a = options.get("A", "")
    option_b = options.get("B", "")
    option_c = options.get("C", "")
    option_d = options.get("D", "")

    # Build the prompt using the provided format
    prompt = (
        "Eres un Médico que está realizando el examen MIR, un test estandarizado en español que determinará si obtienes tu residencia. "
        "Responde a la siguiente pregunta exactamente como se indica. Si conoces la respuesta, responde ÚNICAMENTE con una de las letras A, B, C o D. "
        "Si no estás seguro, responde con N. Cualquier texto adicional invalidará tu respuesta y restará puntos.\n\n"
        f"{question_text}\n\n"
        f"A) {option_a}\n"
        f"B) {option_b}\n"
        f"C) {option_c}\n"
        f"D) {option_d}\n\n"
        "Tu respuesta (ÚNICAMENTE una letra: A, B, C, D o N si no estás seguro):"
    )
    return prompt

# Test the prompt formatting on the first loaded question
example_prompt = format_prompt(questions[0])
print("Example formatted prompt:\n", example_prompt)


Example formatted prompt:
 Eres un Médico que está realizando el examen MIR, un test estandarizado en español que determinará si obtienes tu residencia. Responde a la siguiente pregunta exactamente como se indica. Si conoces la respuesta, responde ÚNICAMENTE con una de las letras A, B, C o D. Si no estás seguro, responde con N. Cualquier texto adicional invalidará tu respuesta y restará puntos.

Entre los cambios metabólicos que se observan en un paciente con resistencia a insulina existe:

A) Incremento de la expresión hepática de genes gluconeogénicos mediado por FOXO1 (forkhead box other) fosforilado.
B) Descenso en los niveles intracelulares de hexoquinasa 2 dependiente de insulina.
C) Aumento de la glucogenólisis muscular, contribuyendo al incremento de la glucemia.
D) Aumento en los niveles séricos de aminoácidos como leucina e isoleucina.

Tu respuesta (ÚNICAMENTE una letra: A, B, C, D o N si no estás seguro):


Detailed comments: The format_prompt function takes a question from our list and builds a prompt. We put the question text, list all the choices (joined in one line for simplicity), and then give an explicit instruction. By asking for the letter only, we aim to have consistent outputs that are easy to check (the model hopefully will just respond with “B”, etc.). After defining the function, we preview an example prompt for the first question to verify the format. You can adjust this format as needed (for instance, if a model tends to do better with a different phrasing or if you want the model to explain its answer, etc.).

In [7]:
import os
import time
import openai
import anthropic
import together
import google.generativeai as genai
from transformers import pipeline, AutoTokenizer
import requests  # Required for Together API HTTP calls

def call_model(model_cfg, prompt):
    """
    Call a model with the given prompt and return its response and metadata.
    Returns: output_text, tokens_used, latency
    """
    provider = model_cfg.get("provider", "").lower()
    model_id = model_cfg.get("model_id")
    api_key_env = model_cfg.get("api_key_env")  # Environment variable for API key
    max_tokens = model_cfg.get("max_tokens", 2000)  # Default: 2000 tokens
   # temperature = model_cfg.get("temperature", 0.7)   # Default temperature
    tokens_used = None
    output_text = ""
    start_time = time.time()

    # HuggingFace branch
    if provider == "huggingface":
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            # Decide pipeline type based on model name
            if "google/flan" in model_id.lower() or "t5" in model_id.lower():
                pipe = pipeline("text2text-generation", model=model_id, tokenizer=tokenizer)
                result = pipe(prompt, max_length=max_tokens, temperature=temperature if temperature > 0 else None, do_sample=temperature > 0)
                output_text = result[0]['generated_text']
            else:
                pipe = pipeline("text-generation", model=model_id, tokenizer=tokenizer)
                result = pipe(prompt, max_new_tokens=max_tokens, temperature=temperature if temperature > 0 else None, do_sample=temperature > 0)
                output_text = result[0]['generated_text']
                if output_text.startswith(prompt):
                    output_text = output_text[len(prompt):]
            # Optional token counting
            try:
                input_tokens = tokenizer(prompt, return_tensors="pt")["input_ids"]
                output_tokens = tokenizer(output_text, return_tensors="pt")["input_ids"]
                tokens_used = int(len(input_tokens[0]) + len(output_tokens[0]))
            except Exception:
                tokens_used = None
        except Exception as e:
            print(f"ERROR calling HuggingFace model {model_id}: {e}")
            output_text = f"ERROR: {e}"

    # OpenAI branch
    elif provider == "openai":
        api_key = os.getenv(api_key_env or "OPENAI_API_KEY")
        if not api_key:
            output_text = f"ERROR: OpenAI API key ({api_key_env or 'OPENAI_API_KEY'}) not set."
        else:
            openai.api_key = api_key
            try:
                response = openai.ChatCompletion.create(
                    model=model_id,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature
                )
                output_text = response['choices'][0]['message']['content'].strip()
                if 'usage' in response:
                    tokens_used = response['usage'].get('total_tokens')
            except Exception as e:
                print(f"ERROR calling OpenAI model {model_id}: {e}")
                output_text = f"ERROR: {e}"

    # Anthropic branch
    elif provider == "anthropic":
        api_key = os.getenv(api_key_env or "ANTHROPIC_API_KEY")
        if not api_key:
            output_text = f"ERROR: Anthropic API key ({api_key_env or 'ANTHROPIC_API_KEY'}) not set."
        else:
            try:
                anthropic_client = anthropic.Client(api_key=api_key)
                response = anthropic_client.messages.create(
                    model=model_id,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature
                )
                if response.content and isinstance(response.content, list):
                    output_text = response.content[0].text.strip()
                else:
                    output_text = str(response.completion).strip()
                if hasattr(response, 'usage'):
                    tokens_used = response.usage.input_tokens + response.usage.output_tokens
            except Exception as e:
                print(f"ERROR calling Anthropic model {model_id}: {e}")
                output_text = f"ERROR: {e}"

    # Together branch (updated to use HTTP requests)
    elif provider == "together":
        api_key = os.getenv(api_key_env or "TOGETHER_API_KEY")
        if not api_key:
            output_text = f"ERROR: Together API key ({api_key_env or 'TOGETHER_API_KEY'}) not set."
        else:
            try:
                endpoint = "https://api.together.xyz/v1/chat/completions"
                headers = {
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json"
                }
                # Construct messages with a system prompt to enforce a one-letter answer
                messages = [
                    {"role": "system", "content": "You are a precise question-answering system. Respond with only one letter: A, B, C, or D. No extra text is allowed."},
                    {"role": "user", "content": prompt}
                ]
                payload = {
                    "model": model_id,  # e.g., "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
                    "messages": messages,
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                    "top_p": 0.7,
                    "top_k": 50,
                    "repetition_penalty": 1,
                    "stop": ["<|eot_id|>", "<|eom_id|>"],
                    "stream": False
                }
                response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
                response.raise_for_status()
                data = response.json()
                if "error" in data:
                    raise ValueError(f"API Error: {data['error']}")
                if "choices" in data and len(data["choices"]) > 0:
                    output_text = data["choices"][0]["message"]["content"].strip()
                else:
                    output_text = "ERROR: Unexpected response structure from Together API"
            except Exception as e:
                print(f"ERROR calling Together model {model_id}: {e}")
                output_text = f"ERROR: {e}"

    # Google branch
    elif provider == "google":
        api_key = os.getenv(api_key_env or "GOOGLE_API_KEY")
        if not api_key:
            output_text = f"ERROR: Google API key ({api_key_env or 'GOOGLE_API_KEY'}) not set."
        else:
            try:
                genai.configure(api_key=api_key)
                generation_config = genai.types.GenerationConfig(
                    max_output_tokens=max_tokens,
                    temperature=temperature
                )
                safety_settings = [
                    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                ]
                model = genai.GenerativeModel(
                    model_name=model_id,
                    generation_config=generation_config,
                    safety_settings=safety_settings
                )
                response = model.generate_content(prompt)
                if response.parts:
                    output_text = response.text
                elif response.prompt_feedback.block_reason:
                    output_text = f"ERROR: Blocked by safety filter - Reason: {response.prompt_feedback.block_reason}"
                    print(f"WARNING: Call to {model_id} blocked. Reason: {response.prompt_feedback.block_reason}")
                else:
                    output_text = "ERROR: No content generated (unknown reason)"
                if hasattr(response, 'usage_metadata'):
                    tokens_used = response.usage_metadata.prompt_token_count + response.usage_metadata.candidates_token_count
                else:
                    tokens_used = None
            except Exception as e:
                print(f"ERROR calling Google model {model_id}: {e}")
                output_text = f"ERROR: {e}"
    else:
        output_text = f"ERROR: Unknown provider '{provider}' for model {model_cfg.get('name', model_id)}"

    latency = time.time() - start_time
    if not isinstance(output_text, str):
        output_text = str(output_text)
    return output_text.strip(), tokens_used, latency

# --------------------------
# Example usage of the call_model function:
if __name__ == "__main__":
    # Example: testing the Together branch
    model_cfg = {
        "provider": "together",
        "model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",  # Replace with your model ID if needed
        "api_key_env": "TOGETHER_API_KEY",
        "max_tokens": 2000,
        "temperature": 0.7
    }
    prompt = ("What is the capital of France? "
              "A) Paris "
              "B) London "
              "C) Berlin "
              "D) Madrid")
    output, tokens_used, latency = call_model(model_cfg, prompt)
    print("Output:", output)
    print("Tokens used:", tokens_used)
    print("Latency:", latency)


ERROR calling Together model meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo: name 'temperature' is not defined
Output: ERROR: name 'temperature' is not defined
Tokens used: None
Latency: 0.006968021392822266


5. Model Interface and Evaluation Functions (Cells 5–6)
In this section, we set up functions to handle model inference and evaluation:
call_model(model_config, prompt_text) – Invokes a single model (based on its provider and config) with the given prompt, and returns the model’s answer, along with metadata like token usage and latency.
evaluate_model(model_config, questions) – Uses call_model to get answers for each question from one model, checks correctness, and collects detailed results.
We will also prepare a loop or another function to evaluate all models and aggregate the results for comparison.
Structuring this logic into functions makes the notebook modular and easier to update. For example, if in the future we want to add a step for chain-of-thought (CoT) prompting or filter the model output for hallucinations, we could modify or wrap call_model accordingly. 5.1 call_model Implementation: This function will branch based on the provider:
HuggingFace: use transformers pipeline or model generate. We’ll initialize a pipeline for text generation or use the model’s generate method. We also tokenize the input to count input tokens. The output tokens can be counted by the tokenizer as well.
OpenAI: use openai.Completion or openai.ChatCompletion depending on model type. For chat models (e.g., GPT-4), we pass the prompt as a user message. We retrieve the output text and usage info (token counts).
Anthropic: (Claude models) use anthropic’s client. Typically you provide a prompt with a special format (like "\n\nHuman: <question>\n\nAssistant:"). We skip detailed implementation here but it can be added.
Together: use Together API client. For example, together_client.complete or the chat completion as needed, based on their documentation. (Ensure TOGETHER_API_KEY is set.)
Additional providers (e.g., Cohere, AI21) can be integrated similarly by adding new branches.
We also measure the time taken for each call (latency). If token counts are not readily available from the API, we will set them to None (or you could estimate via a tokenizer). Let’s implement call_model below:

Detailed comments: In call_model:
We take the model’s config and a prompt string.
Based on provider, we handle the call differently.
HuggingFace: We load the model and tokenizer (from local or HuggingFace Hub). We use pipeline for simplicity (it will handle the model loading and generation). We choose the pipeline task based on model type (a quick check for “t5” in the model name to decide between text2text-generation and text-generation). After generation, we count tokens by encoding the prompt and output with the tokenizer.
OpenAI: We use the OpenAI API. If the model is chat-based (we guess by name containing “gpt-3.5” or “gpt-4”), we use the ChatCompletion endpoint with a single user message. Otherwise, we use the older Completion endpoint. We fetch the text from the response and also get token usage if provided. (Make sure your OpenAI API key is set in the environment.)
Anthropic: We format the prompt in the required way for Claude and call the client’s completion method. (This assumes the anthropic package is installed and imported.) Token count isn’t directly captured here.
Together: We initialize the Together client and call the chat.completions.create method with the prompt as a user message. (This assumes the model supports chat format; for pure text-generation models on Together, you might use a different method like client.completion.create.) We extract the content from the response. (Token usage may be available via Together’s response, but for simplicity, we set it to None in this example.)
We measure the time just before and after the call to compute latency.
Finally, we return output_text (the model’s answer), tokens_used, and latency.
This function abstracts away the differences in model access, giving us a unified interface for the evaluation loop.

5.2 evaluate_model Implementation: This function will loop through all questions for a single model, use call_model to get the answer, check correctness, and record results. It will return a list of result records (one per question for that model) and also compute summary metrics (like number correct). We’ll implement evaluate_model next:

In [8]:
#  6
import json
import os

# Define file paths
original_questions_file = 'data/questions/MIR-2024-v01-t01.json'
answer_key_file = 'data/answers/MIR-2024-v01-t01-answers.json'
filtered_questions_file = 'data/questions/MIR-2024-v01-t01_filtered.json'

# Check if files exist before proceeding
if not os.path.exists(original_questions_file):
     raise FileNotFoundError(f"Original questions file not found: {original_questions_file}")
if not os.path.exists(answer_key_file):
     raise FileNotFoundError(f"Answer key file not found: {answer_key_file}")

# Load answer key (assumed to be a dictionary with question IDs as keys)
with open(answer_key_file, 'r', encoding='utf-8') as f:
     answer_key = json.load(f)

# Load original questions (assumed to be a list)
with open(original_questions_file, 'r', encoding='utf-8') as f:
     questions = json.load(f)

# Filter out questions that are not in the answer key or whose answer is empty
filtered_questions = [
     q for q in questions
     if q.get('id') in answer_key and answer_key[q.get('id')].strip() != ""
]

# Ensure the directory exists
os.makedirs(os.path.dirname(filtered_questions_file), exist_ok=True)

# Save filtered questions to a new file
with open(filtered_questions_file, 'w', encoding='utf-8') as f:
     json.dump(filtered_questions, f, indent=2)

print(f"Filtered questions saved to {filtered_questions_file} with {len(filtered_questions)} questions out of {len(questions)} original questions.")

Filtered questions saved to data/questions/MIR-2024-v01-t01_filtered.json with 180 questions out of 185 original questions.


In [9]:
# Cell 6.1: Patch OpenAI to support legacy calls in evaluator.py and reload evaluator module

!pip install openai==0.28

import openai

# Define a wrapper class so that openai.chat.completions.create(...) works.
class ChatCompletionsWrapper:
    @staticmethod
    def create(*args, **kwargs):
        return openai.ChatCompletion.create(*args, **kwargs)

class OpenAIChatWrapper:
    completions = ChatCompletionsWrapper

# Assign our wrapper to openai.chat
openai.chat = OpenAIChatWrapper

# Verify the patch:
print("openai.chat.completions.create:", openai.chat.completions.create)

# Reload evaluator so that it picks up our patched openai
import importlib
import src.evaluator as evaluator_module
importlib.reload(evaluator_module)
from src.evaluator import ModelEvaluator

print("OpenAI version:", openai.__version__)


openai.chat.completions.create: <function ChatCompletionsWrapper.create at 0x77fd55d58ae0>
OpenAI version: 0.28.0


In [10]:
#6.2
import re
import json
import os
from src.evaluator import ModelEvaluator

def extract_answer_letter(output):
    """
    Extracts the first valid answer letter (A, B, C, D, or N) from the model output.
    """
    match = re.search(r'\b([ABCDN])\b', output.upper())
    return match.group(1) if match else None

# Parameters for test evaluation
models_to_test = [
    'o3-mini-2025-01-31',
    'deepseek-ai/DeepSeek-V3',
    'claude-3-7-sonnet-20250219',
    'deepseek-ai/DeepSeek-R1',
    'Google-Gemini-gemini-1.5-pro',
    'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    'meta-llama/Llama-3.3-70B-Instruct-Turbo',
    'mistralai/Mistral-7B-Instruct-v0.2',
    'mistralai/Mixtral-8x22B-Instruct-v0.1',
    'Qwen/Qwen2-VL-72B-Instruct',
    'grok-2-latest'
]
prompt_strategy = "Prompt-001"
sample_size = 10  # Evaluate 5 questions per model

filtered_questions_file = 'data/questions/MIR-2024-v01-t01_filtered.json'
answer_key_file = 'data/answers/MIR-2024-v01-t01-answers.json'

# Define a separate folder for test evaluation results
test_results_folder = "data/test_results"
os.makedirs(test_results_folder, exist_ok=True)

# Initialize the evaluator
evaluator = ModelEvaluator()

results_summary = {}

print("\n--- RUNNING SAMPLE TEST EVALUATION (Results saved in a separate folder) ---")
for model in models_to_test:
    print(f"\nEvaluating Model: {model} using Prompt Strategy: {prompt_strategy}")
    try:
        # Run evaluation; if evaluator.run_evaluation supports an output_folder parameter, use it.
        # Otherwise, move the result file to the test folder after creation.
        result_file = evaluator.run_evaluation(
            questions_file=filtered_questions_file,
            answer_key_file=answer_key_file,
            prompt_strategy=prompt_strategy,
            model=model,
            sample_size=sample_size
            # , output_folder=test_results_folder  # Uncomment if supported by your evaluator
        )
        # If output_folder parameter is not supported, move the file manually:
        new_result_file = os.path.join(test_results_folder, os.path.basename(result_file))
        os.rename(result_file, new_result_file)
        result_file = new_result_file

        print(f"✓ Sample evaluation complete for {model}. Results saved to: {result_file}")

        with open(result_file, 'r') as f:
            results = json.load(f)

        details = results.get("details", [])
        if details:
            print("\n--- DETAILS FOR EACH QUESTION ---")
            for entry in details:
                question_id = entry.get("question_id", "N/A")
                question_prompt = entry.get("prompt", "No prompt available")
                raw_output = entry.get("model_output", "No output")
                extracted = extract_answer_letter(raw_output)
                entry["extracted_answer"] = extracted
                print(f"Question ID: {question_id}")
                print("Prompt:")
                print(question_prompt)
                print("Raw Output:")
                print(raw_output)
                print("Extracted Answer:", extracted)
                print("-" * 40)
        else:
            print("\nNo per-question details found in the evaluation results.")

        summary = results.get("summary", {})
        results_summary[model] = summary

        print("\n--- SAMPLE EVALUATION SUMMARY ---")
        print(f"Model: {summary.get('model', 'N/A')}")
        print(f"Prompt Strategy: {summary.get('prompt_strategy', 'N/A')}")
        print(f"Total Questions: {summary.get('total_questions', 'N/A')}")
        print(f"Correct Answers: {summary.get('correct_count', 'N/A')} ({summary.get('accuracy', 0)*100:.2f}%)")
        print(f"Incorrect Answers: {summary.get('incorrect_count', 'N/A')}")
        print(f"Skipped Questions: {summary.get('skipped_count', 'N/A')}")
        print(f"Invalid Count: {summary.get('invalid_count', 'N/A')}")
        print(f"Total Score: {summary.get('total_score', 'N/A')}")
    except Exception as e:
        print(f"✗ Error during sample evaluation for {model}: {e}")

print("\nIf the sample evaluation looks good, proceed to full evaluation in the next cell.")



--- RUNNING SAMPLE TEST EVALUATION (Results saved in a separate folder) ---

Evaluating Model: o3-mini-2025-01-31 using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [01:21<00:00,  8.20s/it]


✓ Sample evaluation complete for o3-mini-2025-01-31. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-o3-mini-2025-01-31-20250403-131641.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: o3-mini-2025-01-31
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 9 (90.00%)
Incorrect Answers: 0
Skipped Questions: 0
Invalid Count: 1
Total Score: 27

Evaluating Model: deepseek-ai/DeepSeek-V3 using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:10<00:00,  1.06s/it]


✓ Sample evaluation complete for deepseek-ai/DeepSeek-V3. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-deepseek-ai-DeepSeek-V3-20250403-131652.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: deepseek-ai/DeepSeek-V3
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 7 (70.00%)
Incorrect Answers: 3
Skipped Questions: 0
Invalid Count: 0
Total Score: 18

Evaluating Model: claude-3-7-sonnet-20250219 using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:14<00:00,  1.50s/it]


✓ Sample evaluation complete for claude-3-7-sonnet-20250219. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-claude-3-7-sonnet-20250219-20250403-131707.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: claude-3-7-sonnet-20250219
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 8 (80.00%)
Incorrect Answers: 2
Skipped Questions: 0
Invalid Count: 0
Total Score: 22

Evaluating Model: deepseek-ai/DeepSeek-R1 using Prompt Strategy: Prompt-001


Evaluating questions:  10%|█         | 1/10 [00:18<02:50, 18.98s/it]ERROR:melida-evaluator:Error calling Together API for model deepseek-ai/DeepSeek-R1: RetryError[<Future at 0x77fed09cb0d0 state=finished raised HTTPError>]
Evaluating questions:  30%|███       | 3/10 [00:39<01:28, 12.58s/it]ERROR:melida-evaluator:Error calling Together API for model deepseek-ai/DeepSeek-R1: RetryError[<Future at 0x77fd56aed0d0 state=finished raised HTTPError>]
Evaluating questions:  40%|████      | 4/10 [00:48<01:06, 11.12s/it]ERROR:melida-evaluator:Error calling Together API for model deepseek-ai/DeepSeek-R1: RetryError[<Future at 0x77fdd4ff6110 state=finished raised HTTPError>]
Evaluating questions:  60%|██████    | 6/10 [01:10<00:44, 11.10s/it]ERROR:melida-evaluator:Error calling Together API for model deepseek-ai/DeepSeek-R1: RetryError[<Future at 0x77fd5767a950 state=finished raised HTTPError>]
Evaluating questions:  70%|███████   | 7/10 [01:19<00:31, 10.35s/it]ERROR:melida-evaluator:Error calling

✓ Sample evaluation complete for deepseek-ai/DeepSeek-R1. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-deepseek-ai-DeepSeek-R1-20250403-131859.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: deepseek-ai/DeepSeek-R1
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 3 (30.00%)
Incorrect Answers: 1
Skipped Questions: 0
Invalid Count: 6
Total Score: 8

Evaluating Model: Google-Gemini-gemini-1.5-pro using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:14<00:00,  1.42s/it]


✓ Sample evaluation complete for Google-Gemini-gemini-1.5-pro. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-Google-Gemini-gemini-1.5-pro-20250403-131913.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: Google-Gemini-gemini-1.5-pro
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 6 (60.00%)
Incorrect Answers: 4
Skipped Questions: 0
Invalid Count: 0
Total Score: 14

Evaluating Model: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:09<00:00,  1.06it/s]


✓ Sample evaluation complete for meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-meta-llama-Meta-Llama-3.1-405B-Instruct-Turbo-20250403-131922.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 8 (80.00%)
Incorrect Answers: 2
Skipped Questions: 0
Invalid Count: 0
Total Score: 22

Evaluating Model: meta-llama/Llama-3.3-70B-Instruct-Turbo using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:12<00:00,  1.23s/it]


✓ Sample evaluation complete for meta-llama/Llama-3.3-70B-Instruct-Turbo. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-meta-llama-Llama-3.3-70B-Instruct-Turbo-20250403-131935.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: meta-llama/Llama-3.3-70B-Instruct-Turbo
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 7 (70.00%)
Incorrect Answers: 3
Skipped Questions: 0
Invalid Count: 0
Total Score: 18

Evaluating Model: mistralai/Mistral-7B-Instruct-v0.2 using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:09<00:00,  1.09it/s]


✓ Sample evaluation complete for mistralai/Mistral-7B-Instruct-v0.2. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-mistralai-Mistral-7B-Instruct-v0.2-20250403-131944.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: mistralai/Mistral-7B-Instruct-v0.2
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 2 (20.00%)
Incorrect Answers: 8
Skipped Questions: 0
Invalid Count: 0
Total Score: -2

Evaluating Model: mistralai/Mixtral-8x22B-Instruct-v0.1 using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:10<00:00,  1.02s/it]


✓ Sample evaluation complete for mistralai/Mixtral-8x22B-Instruct-v0.1. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-mistralai-Mixtral-8x22B-Instruct-v0.1-20250403-131954.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: mistralai/Mixtral-8x22B-Instruct-v0.1
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 8 (80.00%)
Incorrect Answers: 2
Skipped Questions: 0
Invalid Count: 0
Total Score: 22

Evaluating Model: Qwen/Qwen2-VL-72B-Instruct using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


✓ Sample evaluation complete for Qwen/Qwen2-VL-72B-Instruct. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-Qwen-Qwen2-VL-72B-Instruct-20250403-132002.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: Qwen/Qwen2-VL-72B-Instruct
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 9 (90.00%)
Incorrect Answers: 1
Skipped Questions: 0
Invalid Count: 0
Total Score: 26

Evaluating Model: grok-2-latest using Prompt Strategy: Prompt-001


Evaluating questions: 100%|██████████| 10/10 [00:08<00:00,  1.23it/s]

✓ Sample evaluation complete for grok-2-latest. Results saved to: data/test_results/EVAL-MIR-2024-v01-t01_filtered-Prompt-001-grok-2-latest-20250403-132010.json

No per-question details found in the evaluation results.

--- SAMPLE EVALUATION SUMMARY ---
Model: grok-2-latest
Prompt Strategy: Prompt-001
Total Questions: 10
Correct Answers: 8 (80.00%)
Incorrect Answers: 2
Skipped Questions: 0
Invalid Count: 0
Total Score: 22

If the sample evaluation looks good, proceed to full evaluation in the next cell.





Detailed comments: In evaluate_model:
We iterate over each question, format the prompt, and call the model via call_model.
We wrap the model call in a try-except to catch any errors (for instance, if an API call fails or a model isn’t available). If there’s an error, we log it and move on, leaving output empty for that question.
We then parse the model’s output to extract the answer. We assume the model should reply with a letter. The code checks the first character of the output: if it’s one of “A, B, C, D”, we treat that as the chosen option. (If the output is something else, you could include additional parsing logic – for example, sometimes the model might output the full option text or a sentence. Here, we simplify by taking the first letter when possible. If the output is empty or doesn’t start with a letter, we mark the answer as incorrect by default.)
We compare the model’s answer letter (uppercased) to the true answer letter from the question. If they match, it’s correct and we increment correct_count.
We append a dictionary to results containing all relevant info: question ID, model name, the exact prompt used, the model’s raw output, a boolean for correctness, latency (in seconds), and tokens used.
We also print a one-line progress update for each question, indicating what the model answered and whether it was correct. This helps to monitor the evaluation as it happens, especially if many questions are being tested.
Finally, the function returns the list of results and the count of correct answers.
With these functions in place, we can now evaluate all models and compile the metrics.
6. Run Evaluation for All Models (Cell 7)
Now we’ll loop through each model in our models_config, evaluate it on all questions using evaluate_model, and collect the outcomes. We will calculate summary metrics for each model:
Accuracy (% correct)
Total score (number of correct answers out of total questions)
Total tokens used (if available; this could be sum of tokens across all questions for that model)
Average response time per question (latency)
We’ll store summary results in a list of dictionaries (which we can later convert to a DataFrame for display or CSV export). We’ll also accumulate all per-question results into a single list for detailed logging.

In [11]:
#7 FULL EVALUATION

import re
import json
from src.evaluator import ModelEvaluator

def extract_answer_letter(output):
    match = re.search(r'\b([ABCDN])\b', output.upper())
    return match.group(1) if match else None

# Parameters for full evaluation (you might want to increase sample_size or use all questions)
models_to_test = [
    'o3-mini-2025-01-31',
    'deepseek-ai/DeepSeek-V3',
    'claude-3-7-sonnet-20250219',
    'deepseek-ai/DeepSeek-R1',
    'Google-Gemini-gemini-1.5-pro',
    'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    'meta-llama/Llama-3.3-70B-Instruct-Turbo',
    'mistralai/Mistral-7B-Instruct-v0.2',
    'mistralai/Mixtral-8x22B-Instruct-v0.1',
    'Qwen/Qwen2-VL-72B-Instruct',
    'grok-2-latest'
]
prompt_strategy = "Prompt-001"
# For full evaluation, you might set sample_size to None or the full count.
sample_size = None

filtered_questions_file = 'data/questions/MIR-2024-v01-t01_filtered.json'
answer_key_file = 'data/answers/MIR-2024-v01-t01-answers.json'

# Initialize the evaluator
evaluator = ModelEvaluator()

print("\n--- RUNNING FULL EVALUATION ---")
for model in models_to_evaluate:
    try:
        result_file = evaluator.run_evaluation(
            questions_file=filtered_questions_file,
            answer_key_file=answer_key_file,
            prompt_strategy=prompt_strategy,
            model=model,
            sample_size=sample_size  # Full evaluation: use all questions
        )
        print(f"✓ Full evaluation complete for {model}. Results saved to: {result_file}")
        # The full evaluation results will later be merged and exported to CSV.
    except Exception as e:
        print(f"✗ Error during full evaluation for {model}: {e}")

print("\nProceed to the cell that merges and exports full evaluation results.")



--- RUNNING FULL EVALUATION ---


NameError: name 'models_to_evaluate' is not defined

Detailed comments: In Cell 7:
We initialize all_details to gather every question’s result and summary_records for each model.
We loop over each model configuration:
* Call evaluate_model for that model, which returns the detailed results and count of correct answers.
* We extend the all_details list with the results (so in the end, this list contains an entry for each model-question pair).
* Compute accuracy as (num_correct / total_questions) * 100. We round it to two decimal places later for neatness.
* Compute total tokens used by summing the tokens_used for each question result, if available. If none of the results have token info (i.e., the list is empty because maybe the API didn’t provide it), we leave total_tokens as None.
* Compute average latency by summing all latencies and dividing by number of questions (we exclude any None latencies just in case).
* Append a dictionary to summary_records with the model’s name and metrics. We include total questions for reference, and round the accuracy and average latency for readability.
* Print a summary line for each model (e.g., “Finished ModelX: 8/10 correct, Accuracy 80.0%.”).
After this loop, we have:
* summary_records: a list of summary info for each model.
* all_details: a list of per-question info, which we can turn into a detailed log.
Next, we’ll convert these to pandas DataFrames for easy viewing and export.

In [None]:
# =========================
# FINAL EXPORT CELL: Full Evaluation Results to CSV, GitHub, & Download
# =========================

import glob
import json
import pandas as pd
import os
from datetime import datetime
import shutil
import subprocess
from google.colab import files
from google.colab import userdata

# --- Step 1: Gather Full Evaluation Result Files ---
# Adjust the glob pattern if necessary to include only full evaluation files.
# (This example assumes your full evaluation result files contain "filtered" in the filename.)
result_files = glob.glob("data/results/EVAL-MIR-2024-v01-t01_filtered-*.json")
print(f"Found {len(result_files)} full evaluation result file(s).")

# --- Step 2: Merge Detailed Evaluation Results ---
# We assume each JSON file contains detailed results under either the "details" or "results" key.
all_details = []
for file in result_files:
    try:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        details = data.get('details') or data.get('results')
        if details:
            all_details.extend(details)
    except Exception as e:
        print(f"Error processing {file}: {e}")

if not all_details:
    print("No detailed evaluation results were found in the JSON files.")
else:
    # Convert the merged results to a DataFrame.
    df_full = pd.DataFrame(all_details)
    print("Merged full evaluation results shape:", df_full.shape)
    print("Columns in full evaluation results:", df_full.columns.tolist())

    # --- Step 3: Export Merged Results to a Single CSV File ---
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    export_filename = f"full_evaluation_results_{timestamp}.csv"
    export_dir = os.path.join("data", "exports")
    os.makedirs(export_dir, exist_ok=True)
    export_path = os.path.join(export_dir, export_filename)
    df_full.to_csv(export_path, index=False)
    print(f"✓ Full evaluation results exported to CSV: {export_path}")

    # --- Step 4: Download the CSV File Locally (Colab) ---
    files.download(export_path)

    # --- Step 5: Push the CSV File to GitHub ---
    try:
        # Retrieve GitHub token from Colab secrets
        github_token = userdata.get('GITHUB_TOKEN')
        if not github_token:
            raise ValueError("GITHUB_TOKEN is not set in Colab secrets.")

        # Configure the remote URL with your GitHub token
        repo_name = "armelida/MELIDA"
        token_url = f"https://{github_token}@github.com/{repo_name}.git"

        subprocess.run(["git", "config", "--global", "user.email", "armelida@gmail.com"], check=True)
        subprocess.run(["git", "config", "--global", "user.name", "Armelida"], check=True)
        subprocess.run(["git", "remote", "set-url", "origin", token_url], check=True)

        # Stage the new CSV file for commit.
        subprocess.run(["git", "add", export_path], check=True)
        commit_message = f"Export full evaluation results {timestamp}"
        subprocess.run(["git", "commit", "-m", commit_message], check=True)

        # Pull the latest changes and then push your commit.
        subprocess.run(["git", "pull", "origin", "main", "--rebase"], check=True)
        subprocess.run(["git", "push", "origin", "main"], check=True)
        print("✓ CSV file successfully pushed to GitHub.")
    except Exception as e:
        print(f"Error during GitHub push: {e}")


In [None]:
# =========================
# CELL: Export Most Failed Questions CSV
# =========================

import glob
import pandas as pd
import os
from datetime import datetime

# (Optional) Load the most recent full evaluation CSV from your exports folder:
export_files = sorted(glob.glob("data/exports/full_evaluation_results_*.csv"))
if export_files:
    latest_export = export_files[-1]
    df_full = pd.read_csv(latest_export)
    print(f"Loaded merged full evaluation results from: {latest_export}")
else:
    raise FileNotFoundError("No full evaluation CSV file found in data/exports/.")

# Standardize answer columns and compute correctness.
df_full['model_answer'] = df_full['model_answer'].astype(str).str.strip().str.upper()
df_full['correct_answer'] = df_full['correct_answer'].astype(str).str.strip().str.upper()
df_full['raw_response'] = df_full['raw_response'].astype(str).str.strip()
df_full['correct'] = df_full['model_answer'] == df_full['correct_answer']

# Determine which column to use for question text.
if 'prompt' in df_full.columns:
    question_text_col = 'prompt'
elif 'question_text' in df_full.columns:
    question_text_col = 'question_text'
else:
    # If no column exists, create one with a default value.
    df_full['question_text'] = "Not available"
    question_text_col = 'question_text'

# Filter out only the failed evaluations.
df_failures = df_full[~df_full['correct']]

# Group by question_id to aggregate failure information.
df_failures_summary = df_failures.groupby("question_id").agg(
    failure_count=("model", "count"),
    models_failed=("model", lambda x: ", ".join(sorted(x.unique()))),
    correct_answer=("correct_answer", "first"),
    raw_responses=("raw_response", lambda x: " || ".join(x.astype(str).unique())),
    question_text=(question_text_col, "first")
).reset_index()

# Sort by failure_count descending (most failed questions at the top).
df_failures_summary = df_failures_summary.sort_values("failure_count", ascending=False)

# Save the summary to a CSV file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
failed_csv = f"most_failed_questions_{timestamp}.csv"
export_dir = os.path.join("data", "exports")
os.makedirs(export_dir, exist_ok=True)
export_path = os.path.join(export_dir, failed_csv)
df_failures_summary.to_csv(export_path, index=False)
print(f"✓ Most failed questions exported to CSV: {export_path}")

# Optionally, display the DataFrame.
print("=== Most Failed Questions ===")
print(df_failures_summary.to_string(index=False))
