<a href="https://colab.research.google.com/github/armelida/MELIDA/blob/dev/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# MELIDA: Model Evaluation for Life-sciences Intelligence and Decision Assistance
# Evaluation Runner

# 1. Clone the repository
!git clone https://github.com/armelida/MELIDA.git
%cd MELIDA

# 2. Install required packages
!pip install -r requirements.txt

# 3. Set up API keys (this will prompt you to input your keys securely)
import os
import json
from getpass import getpass

try:
    from google.colab import userdata
except ImportError:
    # Not running inside Colab: skip the secrets lookup and prompt for each key.
    userdata = None


def _read_colab_secret(secret_name):
    """Return the Colab secret called `secret_name`, or None if it cannot be read.

    Each secret is looked up on its own so that a failure for one key does not
    discard another key that was read successfully.
    """
    if userdata is None:
        return None
    try:
        return userdata.get(secret_name)
    except Exception:
        # Deliberate best-effort: any lookup failure (for example the secret is
        # not defined) means we fall back to prompting the user for this key.
        return None


# Create config directory if it doesn't exist
os.makedirs('config', exist_ok=True)

# Check if API keys are stored in Colab secrets
openai_key = _read_colab_secret('OPENAI_API_KEY')
anthropic_key = _read_colab_secret('ANTHROPIC_API_KEY')
# True only when BOTH keys came back non-empty from the secrets store
keys_from_secrets = bool(openai_key) and bool(anthropic_key)

# Create API config file (placeholders stand in for keys that are still missing)
api_config = {
    "openai": {
        "api_key": openai_key or "YOUR_OPENAI_API_KEY"
    },
    "anthropic": {
        "api_key": anthropic_key or "YOUR_ANTHROPIC_API_KEY"
    }
}

# If keys weren't loaded from secrets, prompt the user for the missing ones only
if not keys_from_secrets:
    print("API keys not found in Colab secrets.")
    print("Please enter your API keys (they won't be saved in the notebook history):")
    if not openai_key:
        # strip(): pasted keys often carry stray surrounding whitespace
        api_config["openai"]["api_key"] = getpass("Enter your OpenAI API key: ").strip()

    if not anthropic_key:
        api_config["anthropic"]["api_key"] = getpass("Enter your Anthropic API key: ").strip()

# Save API config
# NOTE: the keys are written to disk in plain text; do not commit or share this file.
with open('config/api_config.json', 'w') as f:
    json.dump(api_config, f, indent=2)

print("API configuration saved.")

# 4. Import required modules
import sys
sys.path.append('.')  # Add current directory to path
from src.evaluator import ModelEvaluator
from tqdm.notebook import tqdm  # Use notebook version of tqdm for Colab
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 5. Configure the evaluation parameters
# Input files: the question set and its matching answer key
questions_file = 'data/questions/MIR-2024-v01-t01.json'
answer_key_file = 'data/answers/MIR-2024-v01-t01-answers.json'
# Choose from strategies in prompt_strategies.json
prompt_strategy = 'Prompt-001'
# or 'claude-3-opus-20240229' for Anthropic
model = 'gpt-4'

# 6. Initialize the evaluator and run the evaluation
# The arguments are gathered in one mapping so the full run configuration
# can be inspected in a single place before the call is made.
run_kwargs = {
    'questions_file': questions_file,
    'answer_key_file': answer_key_file,
    'prompt_strategy': prompt_strategy,
    'model': model,
    # Set to a number for a smaller test, or None for all questions
    'sample_size': None,
}
evaluator = ModelEvaluator()
results_path = evaluator.run_evaluation(**run_kwargs)

print(f"Evaluation complete. Results saved to {results_path}")

# 7. Load and display the results
# `results_path` is the file path returned by the evaluation run.
with open(results_path, 'r') as results_file:
    results = json.load(results_file)

# Show summary statistics, one "Label: value" line per field
summary = results['summary']

for label, key in (
    ("Test ID", 'test_id'),
    ("Prompt Strategy", 'prompt_strategy'),
    ("Model", 'model'),
    ("Total Questions", 'total_questions'),
    ("Total Score", 'total_score'),
):
    print(f"{label}: {summary[key]}")

# Accuracy is multiplied by 100 here to be shown as a percentage with two decimals
print(f"Correct Answers: {summary['correct_count']} ({summary['accuracy']*100:.2f}%)")

for label, key in (
    ("Incorrect Answers", 'incorrect_count'),
    ("Skipped Questions", 'skipped_count'),
):
    print(f"{label}: {summary[key]}")

# 8. Visualize the results
# Build a DataFrame from the per-question entries under results['results']
results_df = pd.DataFrame(results['results'])

# Score distribution: one bar per distinct value in the 'score' column
score_fig, score_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='score', data=results_df, ax=score_ax)
score_ax.set_title('Score Distribution')
score_ax.set_xlabel('Score')
score_ax.set_ylabel('Count')
plt.show()

# Response time distribution: 20-bin histogram of the 'response_time' column
time_fig, time_ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df['response_time'], bins=20, ax=time_ax)
time_ax.set_title('Response Time Distribution')
time_ax.set_xlabel('Response Time (seconds)')
time_ax.set_ylabel('Count')
plt.show()

Cloning into 'MELIDA'...
remote: Enumerating objects: 42, done.
remote: Counting objects: 100% (42/42), done.
remote: Compressing objects: 100% (37/37), done.
remote: Total 42 (delta 3), reused 26 (delta 1), pack-reused 0 (from 0)
Receiving objects: 100% (42/42), 53.48 KiB | 855.00 KiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/MELIDA
Collecting anthropic>=0.7.0 (from -r requirements.txt (line 2))
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Collecting jupyter>=1.0.0 (from -r requirements.txt (line 7))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting jupyterlab (from jupyter>=1.0.0->-r requirements.txt (line 7))
  Downloading jupyterlab-4.3.5-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 7))
  Downloading async_lru-2.0.4-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (l