<a href="https://colab.research.google.com/github/armelida/MELIDA/blob/main/notebooks/prompting_strategy_evaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# MELIDA: Model Evaluation for Life-sciences Intelligence and Decision Assistance
# Simplified Notebook for Running Evaluations

# Cell 1: Check Runtime & GPU Availability
import torch
import os
import subprocess

def check_runtime():
    """Check whether a GPU or TPU is available."""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        print(f"✅ GPU is enabled! Using: {gpu_name}")
    elif "COLAB_TPU_ADDR" in os.environ:
        print("✅ TPU is enabled!")
    else:
        print("⚠️ WARNING: No GPU or TPU detected. Running on CPU.")
        print("👉 Go to Runtime > Change runtime type > Select GPU/TPU")

def check_gpu():
    """Check GPU details using nvidia-smi if available."""
    try:
        result = subprocess.run(
            ["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        if result.returncode == 0:
            print(result.stdout)
        else:
            print("⚠️ `nvidia-smi` not found. No GPU detected.")
    except FileNotFoundError:
        print("⚠️ No GPU found.")

# Run the checks
check_runtime()
check_gpu()


✅ GPU is enabled! Using: NVIDIA A100-SXM4-40GB
Sun Mar 16 11:52:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             43W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
 

In [2]:
# Cell 2: Clone Repository & Install Requirements
# Remove any existing directory and clone a fresh copy of MELIDA.
!rm -rf MELIDA
!git clone https://github.com/armelida/MELIDA.git
%cd MELIDA
!pip install -r requirements.txt


Cloning into 'MELIDA'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (131/131), done.[K
remote: Total 148 (delta 65), reused 49 (delta 11), pack-reused 0 (from 0)[K
Receiving objects: 100% (148/148), 148.84 KiB | 2.01 MiB/s, done.
Resolving deltas: 100% (65/65), done.
/content/MELIDA


In [3]:
# Cell 3: Setup Configuration & Fetch Prompt Strategies
import os
import json
import requests

# Create config directory if it does not exist
os.makedirs('config', exist_ok=True)

def fetch_prompt_strategies(url):
    """Fetch prompt strategies JSON from GitHub and add extra metadata."""
    print(f"Fetching prompt strategies from GitHub: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        strategies = response.json()
        print(f"✓ Loaded {len(strategies)} prompt strategies.")

        # Enrich each strategy with language and tags if missing
        for strategy in strategies.values():
            if "language" not in strategy:
                desc = strategy.get("description", "")
                if "Spanish" in desc:
                    strategy["language"] = "Spanish"
                elif "English" in desc:
                    strategy["language"] = "English"
                else:
                    strategy["language"] = "Unknown"
            if "tags" not in strategy:
                strategy["tags"] = []
                if "doctor" in strategy.get("description", "").lower():
                    strategy["tags"].append("doctor_role")
                if "reasoning" in strategy.get("description", "").lower():
                    strategy["tags"].append("reasoning")
                if "confidence" in strategy.get("description", "").lower():
                    strategy["tags"].append("confidence")
        return strategies
    except Exception as e:
        print(f"Error: {e}\nUsing default prompt strategies instead.")
        # Default prompt strategies as fallback
        return {
            "Prompt-001": {
                "description": "Basic prompt for test-taking",
                "template": ("Answer the following question:\n\n{question_text}\n\n"
                             "A) {option_a}\nB) {option_b}\nC) {option_c}\nD) {option_d}\n\n"
                             "Your answer (only A, B, C, D):")
            }
        }

github_url = "https://raw.githubusercontent.com/armelida/MELIDA/main/config/prompt_strategies.json"
prompt_strategies = fetch_prompt_strategies(github_url)

# Validate required placeholders in each strategy
required_placeholders = ["{question_text}", "{option_a}", "{option_b}", "{option_c}", "{option_d}"]
for sid, strategy in prompt_strategies.items():
    template = strategy.get("template", "")
    missing = [ph for ph in required_placeholders if ph not in template]
    if missing:
        print(f"Warning: Strategy {sid} is missing placeholders: {', '.join(missing)}")

# Save the strategies locally
with open('config/prompt_strategies.json', 'w') as f:
    json.dump(prompt_strategies, f, indent=2)
print("✓ Prompt strategies saved to config/prompt_strategies.json")


Fetching prompt strategies from GitHub: https://raw.githubusercontent.com/armelida/MELIDA/main/config/prompt_strategies.json
✓ Loaded 10 prompt strategies.
✓ Prompt strategies saved to config/prompt_strategies.json


In [9]:
# Cell 4: Load API Keys & Save API Configuration
!pip install python-dotenv
import os
from dotenv import load_dotenv

# Initialize API keys dictionary
api_keys = {"openai": None, "anthropic": None}

# Try to load from Colab secrets (if available)
try:
    from google.colab import userdata
    api_keys["openai"] = userdata.get('OPENAI_API_KEY')
    api_keys["anthropic"] = userdata.get('ANTHROPIC_API_KEY')
    if api_keys["openai"] and api_keys["anthropic"]:
        print("✓ API keys loaded from Colab secrets")
except Exception as e:
    print(f"Note: Couldn't load from Colab secrets - {e}")

# Fallback: load from environment variables
if not all(api_keys.values()):
    api_keys["openai"] = api_keys["openai"] or os.environ.get("OPENAI_API_KEY")
    api_keys["anthropic"] = api_keys["anthropic"] or os.environ.get("ANTHROPIC_API_KEY")
    if api_keys["openai"] or api_keys["anthropic"]:
        print("✓ API keys loaded from environment variables")

# Fallback: load from a .env file
if not all(api_keys.values()):
    try:
        load_dotenv()
        api_keys["openai"] = api_keys["openai"] or os.environ.get("OPENAI_API_KEY")
        api_keys["anthropic"] = api_keys["anthropic"] or os.environ.get("ANTHROPIC_API_KEY")
        if api_keys["openai"] or api_keys["anthropic"]:
            print("✓ API keys loaded from .env file")
    except Exception as e:
        print(f"Note: Couldn't load from .env file - {e}")

# Create and save API configuration
api_config = {
    "openai": {"api_key": api_keys["openai"] or "YOUR_OPENAI_API_KEY_HERE"},
    "anthropic": {"api_key": api_keys["anthropic"] or "YOUR_ANTHROPIC_API_KEY_HERE"}
}
with open('config/api_config.json', 'w') as f:
    json.dump(api_config, f, indent=2)

if api_keys["openai"] and api_keys["anthropic"]:
    print("✓ Complete API configuration saved")
else:
    missing = []
    if not api_keys["openai"]:
        missing.append("OpenAI")
    if not api_keys["anthropic"]:
        missing.append("Anthropic")
    print(f"⚠ Missing API keys: {', '.join(missing)}")
    print("Please set the API keys using one of the available methods.")


Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
✓ API keys loaded from Colab secrets
✓ Complete API configuration saved


In [14]:
# Cell 5: Data Loading & Standardization
import json
import os

print("\n--- Data Loading ---")
questions_file = 'data/questions/MIR-2024-v01-t01.json'
answers_file = 'data/answers/MIR-2024-v01-t01-answers.json'

# Load questions
with open(questions_file, 'r', encoding='utf-8') as f:
    questions = json.load(f)
print(f"✓ Loaded {len(questions)} questions from {questions_file}")

# Convert the list of questions to a dictionary keyed by question ID if needed
if isinstance(questions, list):
    questions = {q['id']: q for q in questions}
    print("✓ Converted questions list to a dictionary keyed by question ID")

# Load answers (assumed as a dictionary)
with open(answers_file, 'r', encoding='utf-8') as f:
    answers_dict = json.load(f)
print(f"✓ Loaded answers from {answers_file}")

# Convert answers dictionary to list format for the evaluator
answers = [{"id": qid, "correct_option": ans} for qid, ans in answers_dict.items()]
print(f"✓ Converted answers to list format with {len(answers)} items")

# Save standardized answers for the evaluator
std_answers_file = 'data/answers/MIR-2024-v01-t01-answers-standardized.json'
with open(std_answers_file, 'w', encoding='utf-8') as f:
    json.dump(answers, f, indent=2)
print(f"✓ Standardized answers saved to {std_answers_file}")

# Check for matching question IDs and answer IDs
question_ids = set(questions.keys())
answer_ids = {a['id'] for a in answers}
matched = question_ids.intersection(answer_ids)
print(f"✓ Found {len(matched)} matching questions out of {len(question_ids)} total questions")



--- Data Loading ---
✓ Loaded 174 questions from data/questions/MIR-2024-v01-t01.json
✓ Converted questions list to a dictionary keyed by question ID
✓ Loaded answers from data/answers/MIR-2024-v01-t01-answers.json
✓ Converted answers to list format with 174 items
✓ Standardized answers saved to data/answers/MIR-2024-v01-t01-answers-standardized.json
✓ Found 174 matching questions out of 174 total questions


In [19]:
# Cell 6: Run Sample Evaluation (Updated)
import json
from src.evaluator import ModelEvaluator

# Define parameters
# Using only two models: update "o3-mini" to the supported name "gpt-4o-mini"
models_to_test = ['o3-mini-2025-01-31', 'claude-3.7-Sonnet']

# Using 10 prompting strategies (Prompt-001 to Prompt-010)
prompt_strategies_to_test = [f'Prompt-{i:03d}' for i in range(1, 11)]

sample_size = 5  # Use a small sample for testing

# File paths (using the standardized answers file from Cell 5)
questions_file = 'data/questions/MIR-2024-v01-t01.json'
std_answers_file = 'data/answers/MIR-2024-v01-t01-answers-standardized.json'

# Initialize the evaluator
evaluator = ModelEvaluator()

print("\n--- RUNNING SAMPLE EVALUATION ---")
# Choose first model and first prompt strategy for a sample run
sample_model = models_to_test[0]
sample_strategy = prompt_strategies_to_test[0]

try:
    # Run a sample evaluation using the selected model and prompt strategy
    result_file = evaluator.run_evaluation(
        questions_file=questions_file,
        answer_key_file=std_answers_file,
        prompt_strategy=sample_strategy,
        model=sample_model,
        sample_size=sample_size
    )
    print(f"✓ Sample evaluation complete. Results saved to: {result_file}")

    # Load and display the summary of the evaluation results
    with open(result_file, 'r') as f:
        results = json.load(f)
    summary = results.get('summary', {})

    print("\n--- SAMPLE EVALUATION RESULTS ---")
    print(f"Model: {summary.get('model', 'N/A')}")
    print(f"Prompt Strategy: {summary.get('prompt_strategy', 'N/A')}")
    print(f"Total Questions: {summary.get('total_questions', 'N/A')}")
    print(f"Correct Answers: {summary.get('correct_count', 'N/A')} ({summary.get('accuracy', 0)*100:.2f}%)")
    print(f"Incorrect Answers: {summary.get('incorrect_count', 'N/A')}")
    print(f"Skipped Questions: {summary.get('skipped_count', 'N/A')}")
    print(f"Unknown/No clear answer: {summary.get('unknown_count', 'N/A')}")
    print(f"Total Score: {summary.get('total_score', 'N/A')}")

except Exception as e:
    print(f"✗ Error during sample evaluation: {e}")

print("\nIf the sample evaluation looks good, proceed to full evaluation in the next cell.")



--- RUNNING SAMPLE EVALUATION ---


Evaluating questions:   0%|          | 0/5 [00:00<?, ?it/s]

✗ Error during sample evaluation: Unsupported model: o3-mini-2025-01-31

If the sample evaluation looks good, proceed to full evaluation in the next cell.





In [None]:
# Cell 7: Analyze Results and Create Visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

def analyze_results(result_files=None):
    """Analyze evaluation results and create visualizations"""
    print("\n----- ANALYZING RESULTS -----")

    # Get all result files if not provided
    if not result_files or len(result_files) == 0:
        result_dir = "data/results"
        result_files = [os.path.join(result_dir, f) for f in os.listdir(result_dir)
                      if f.endswith("_results.json")]
        print(f"Found {len(result_files)} result files in {result_dir}")

    if len(result_files) == 0:
        print("No result files found for analysis")
        return

    # Merge results from all files
    all_summaries = []
    all_details = []

    for file_path in result_files:
        try:
            with open(file_path, 'r') as f:
                results = json.load(f)

            all_summaries.append(results['summary'])
            all_details.extend(results['results'])
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    # Create dataframes
    summary_df = pd.DataFrame(all_summaries)
    details_df = pd.DataFrame(all_details)

    # Export to CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    export_dir = "data/exports"
    os.makedirs(export_dir, exist_ok=True)

    summary_csv = os.path.join(export_dir, f"evaluation_summary_{timestamp}.csv")
    details_csv = os.path.join(export_dir, f"evaluation_details_{timestamp}.csv")

    summary_df.to_csv(summary_csv, index=False)
    details_df.to_csv(details_csv, index=False)

    print(f"✓ Exported evaluation summary to: {summary_csv}")
    print(f"✓ Exported evaluation details to: {details_csv}")

    # Display best performing combinations
    print("\n----- BEST PERFORMING COMBINATIONS -----")
    summary_df['formatted_accuracy'] = summary_df['accuracy'].apply(lambda x: f"{x*100:.2f}%")
    best_combos = summary_df.sort_values('accuracy', ascending=False)[
        ['model', 'prompt_strategy', 'formatted_accuracy', 'total_score']
    ].head(5)
    print(best_combos)

    # Create visualizations if we have more than one result
    if len(all_summaries) > 1:
        # Accuracy comparison
        plt.figure(figsize=(12, 8))
        sns.barplot(x='model', y='accuracy', hue='prompt_strategy', data=summary_df)
        plt.title('Model Accuracy by Prompt Strategy', fontsize=16)
        plt.xticks(rotation=45)
        plt.ylim(0, 1)
        plt.legend(title='Prompt Strategy', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()

        # Save the plot
        plot_path = os.path.join(export_dir, f"accuracy_comparison_{timestamp}.png")
        plt.savefig(plot_path, dpi=300)
        plt.show()

        return summary_df, details_df
    else:
        print("Not generating visualizations with only one result file")
        return summary_df, details_df

# Uncomment to run the analysis on result files
"""
summary_df, details_df = analyze_results(result_files)
"""


In [None]:
# Cell 8: Run Everything in Sequence
def run_complete_evaluation(sample_size=5, full_eval=False):
    """Run the complete evaluation process from start to finish"""
    # Setup
    prompt_strategies = setup_prompt_strategies()
    api_keys = setup_api_keys()

    # Check if we have the required API keys
    if not api_keys["openai"] or not api_keys["anthropic"]:
        print("⚠️ Missing API keys. Please add them before continuing.")
        return

    # Prepare data
    questions_file, answer_key_file = prepare_evaluation_data()

    # Run sample evaluation
    sample_result = run_sample_evaluation(questions_file, answer_key_file)

    # Run full evaluation if requested
    if full_eval:
        models_to_test = [
            'gpt-3.5-turbo',  # Faster model for testing
            'claude-3-7-sonnet-20250219'  # Claude model
        ]

        strategies_to_test = list(prompt_strategies.keys())[:2]  # Use first two strategies

        result_files = run_full_evaluation(
            questions_file,
            answer_key_file,
            models=models_to_test,
            strategies=strategies_to_test,
            sample_size=sample_size
        )

        # Analyze results
        if result_files:
            summary_df, details_df = analyze_results(result_files)
            return summary_df, details_df

    print("\nEvaluation complete!")

# Uncomment to run the complete process
"""
# For a quick test run:
run_complete_evaluation(sample_size=5, full_eval=False)

# For a more comprehensive evaluation:
# run_complete_evaluation(sample_size=20, full_eval=True)
"""