<a href="https://colab.research.google.com/github/armelida/MELIDA/blob/main/notebooks/prompting_strategy_evaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
import os
import subprocess

# Function to check if Colab is using a GPU or CPU
def check_runtime():
    """Report which accelerator, if any, this runtime exposes.

    Prints a single status line when CUDA is available (naming device 0) or
    when the ``COLAB_TPU_ADDR`` environment variable is set. Otherwise prints
    a CPU warning followed by a hint on how to change the runtime type.

    Returns:
        None. The result is only printed.
    """
    # CUDA is checked first, so a GPU wins over the TPU environment variable.
    if torch.cuda.is_available():
        print(f"✅ GPU is enabled! Using: {torch.cuda.get_device_name(0)}")
        return

    if "COLAB_TPU_ADDR" in os.environ:
        print("✅ TPU is enabled!")
        return

    # Neither accelerator was detected: warn, then tell the user what to do.
    for message in (
        "⚠️ WARNING: No GPU or TPU detected. Running on CPU.",
        "👉 Go to Runtime > Change runtime type > Select GPU/TPU",
    ):
        print(message)

# Function to check GPU details (if available)
def check_gpu():
    """Print the ``nvidia-smi`` report for the attached GPU, if one is usable.

    Runs ``nvidia-smi`` as a subprocess and distinguishes three outcomes:

    * exit status 0: the tool's standard output is printed;
    * non-zero exit status: the tool exists but failed, so the exit code and
      whatever it wrote to stderr (or stdout) are printed;
    * executable not found: a "No GPU found" warning is printed.

    Returns:
        None. The result is only printed.
    """
    try:
        gpu_info = subprocess.run(
            ["nvidia-smi"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        # Raised when the `nvidia-smi` executable itself cannot be located.
        print("⚠️ No GPU found.")
        return

    if gpu_info.returncode == 0:
        print(gpu_info.stdout)
        return

    # The tool ran but reported a failure; this is not a "not found" case,
    # so surface its exit code and its own message to aid diagnosis.
    print(f"⚠️ `nvidia-smi` failed (exit code {gpu_info.returncode}). No GPU detected.")
    details = (gpu_info.stderr or gpu_info.stdout or "").strip()
    if details:
        print(details)

# Run both checks: first the framework-level view (torch CUDA availability /
# Colab TPU environment variable), then the raw `nvidia-smi` report.
check_runtime()
check_gpu()


👉 Go to Runtime > Change runtime type > Select GPU/TPU
⚠️ No GPU found.


In [3]:
# Remove existing folder (if any) and clone a fresh copy

# MELIDA: Model Evaluation for Life-sciences Intelligence and Decision Assistance
# Production Evaluation Runner

!rm -rf MELIDA

# 1. Clone the repository and set up the environment
!git clone https://github.com/armelida/MELIDA.git
%cd MELIDA

# Install required packages
!pip install -r requirements.txt


Cloning into 'MELIDA'...
remote: Enumerating objects: 128, done.[K
remote: Counting objects: 100% (128/128), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 128 (delta 52), reused 50 (delta 11), pack-reused 0 (from 0)[K
Receiving objects: 100% (128/128), 125.78 KiB | 3.14 MiB/s, done.
Resolving deltas: 100% (52/52), done.
/content/MELIDA
Collecting anthropic>=0.7.0 (from -r requirements.txt (line 2))
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Collecting jupyter>=1.0.0 (from -r requirements.txt (line 7))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting jupyterlab (from jupyter>=1.0.0->-r requirements.txt (line 7))
  Downloading jupyterlab-4.3.6-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 7))
  Downloading async_lru-2.0.4-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab->jupyter>=1.0.0->-r requirem

In [6]:
# 2. Set up configuration with support for multiple environments
import json
import re
import requests

# Create config directory
os.makedirs('config', exist_ok=True)

# Fetch prompt strategies from GitHub repository
github_prompt_strategies_url = "https://raw.githubusercontent.com/armelida/MELIDA/main/config/prompt_strategies.json"
print(f"Fetching prompt strategies from GitHub: {github_prompt_strategies_url}")

try:
    # Fetch the file from GitHub. The timeout bounds the wait so that an
    # unreachable host falls through to the default strategies below instead
    # of hanging the cell indefinitely.
    response = requests.get(github_prompt_strategies_url, timeout=30)
    response.raise_for_status()  # Raise exception for HTTP errors

    # Parse the JSON content. Everything below iterates it as
    # {strategy_id: strategy_dict}, so reject any other top-level shape here
    # with a clear message (handled by the fallback branch).
    prompt_strategies = response.json()
    if not isinstance(prompt_strategies, dict):
        raise ValueError(
            f"expected a JSON object keyed by strategy id, got {type(prompt_strategies).__name__}"
        )
    print(f"✓ Successfully loaded {len(prompt_strategies)} prompt strategies from GitHub")

    # Add any additional metadata if needed
    for strategy_id, strategy in prompt_strategies.items():
        description = strategy.get("description", "")

        if "language" not in strategy:
            # Detect language from the (case-sensitive) description text
            if "Spanish" in description:
                strategy["language"] = "Spanish"
            elif "English" in description:
                strategy["language"] = "English"
            else:
                strategy["language"] = "Unknown"

        # Add tags if they don't exist, derived from keywords in the
        # lower-cased description
        if "tags" not in strategy:
            description_lower = description.lower()
            strategy["tags"] = [
                tag
                for keyword, tag in (
                    ("doctor", "doctor_role"),
                    ("reasoning", "reasoning"),
                    ("confidence", "confidence"),
                )
                if keyword in description_lower
            ]

except Exception as e:
    # Deliberately broad: any network, HTTP, parsing or shape problem should
    # degrade to the built-in default rather than stop the notebook.
    print(f"Error fetching prompt strategies from GitHub: {e}")
    print("Using default prompt strategies instead")

    # Default prompt strategies as fallback
    prompt_strategies = {
        "Prompt-001": {
            "description": "Basic prompt for test-taking",
            "template": "Answer the following question:\n\n{question_text}\n\nA) {option_a}\nB) {option_b}\nC) {option_c}\nD) {option_d}\n\nYour answer (only A, B, C, D):"
        }
    }

# Validate prompt templates: warn (do not fail) when a template lacks one of
# the placeholders listed below
required_placeholders = ["{question_text}", "{option_a}", "{option_b}", "{option_c}", "{option_d}"]
for strategy_id, strategy in prompt_strategies.items():
    template = strategy.get("template", "")
    missing = [ph for ph in required_placeholders if ph not in template]
    if missing:
        print(f"Warning: Strategy {strategy_id} is missing placeholders: {', '.join(missing)}")

# Save prompt strategies locally
with open('config/prompt_strategies.json', 'w', encoding='utf-8') as f:
    json.dump(prompt_strategies, f, indent=2)
print("✓ Prompt strategies saved to config/prompt_strategies.json")

# Try to get API keys from different sources
api_keys = {"openai": None, "anthropic": None}

# Name under which each provider's key is looked up, both as a Colab secret
# and as an environment variable.
api_key_names = {"openai": "OPENAI_API_KEY", "anthropic": "ANTHROPIC_API_KEY"}


def _fill_missing_keys_from_env(keys):
    """Fill still-empty entries of `keys` from the process environment.

    Args:
        keys: dict mapping provider name to its API key, or a falsy value
            when the key is not known yet. Updated in place.

    Returns:
        List of provider names whose key was found in ``os.environ`` by this
        call; empty when nothing was missing or nothing was found.
    """
    newly_found = []
    for provider, env_name in api_key_names.items():
        if keys.get(provider):
            continue
        value = os.environ.get(env_name)
        if value:
            keys[provider] = value
            newly_found.append(provider)
    return newly_found


# Method 1: Try Colab secrets
try:
    from google.colab import userdata
except ImportError as e:
    print(f"Note: Couldn't load from Colab secrets - {str(e)}")
else:
    colab_loaded = []
    for provider, secret_name in api_key_names.items():
        # Each secret is read separately so that a failure on one of them
        # does not prevent the other from being loaded.
        try:
            secret_value = userdata.get(secret_name)
        except Exception as e:
            print(f"Note: Couldn't load {secret_name} from Colab secrets - {str(e)}")
            continue
        if secret_value:
            api_keys[provider] = secret_value
            colab_loaded.append(provider)
    if colab_loaded:
        print("✓ API keys loaded from Colab secrets")

# Method 2: Try environment variables if any keys are still missing.
# The success line is printed only when this step actually supplied a key.
if _fill_missing_keys_from_env(api_keys):
    print("✓ API keys loaded from environment variables")

# Method 3: Try loading from a local .env file
if not all(api_keys.values()):
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except Exception as e:
        # Covers a missing python-dotenv package as well as a failure while
        # reading the file.
        print(f"Note: Couldn't load from .env file - {str(e)}")
    else:
        # load_dotenv() publishes the file's entries through os.environ, so
        # the same lookup is repeated for whatever is still missing.
        if _fill_missing_keys_from_env(api_keys):
            print("✓ API keys loaded from .env file")

# Create API configuration (placeholders stand in for keys that were not found)
api_config = {
    "openai": {
        "api_key": api_keys["openai"] or "YOUR_OPENAI_API_KEY_HERE"
    },
    "anthropic": {
        "api_key": api_keys["anthropic"] or "YOUR_ANTHROPIC_API_KEY_HERE"
    }
}

# Save API configuration
# NOTE(review): this writes any real keys in plain text to
# config/api_config.json; make sure that file is never committed or shared.
with open('config/api_config.json', 'w') as f:
    json.dump(api_config, f, indent=2)

# Check if real keys were found
missing_keys = [
    label
    for provider, label in (("openai", "OpenAI"), ("anthropic", "Anthropic"))
    if not api_keys[provider]
]
if not missing_keys:
    print("✓ Complete API configuration saved")
else:
    print(f"⚠ Missing API keys: {', '.join(missing_keys)}")
    print("Please provide API keys using one of these methods:")
    print("  - Colab: Click on the 🔑 icon and add OPENAI_API_KEY and ANTHROPIC_API_KEY")
    print("  - Environment variables: Set OPENAI_API_KEY and ANTHROPIC_API_KEY")
    print("  - .env file: Create a .env file with OPENAI_API_KEY and ANTHROPIC_API_KEY")
    print("Placeholder values have been saved to config/api_config.json")

Fetching prompt strategies from GitHub: https://raw.githubusercontent.com/armelida/MELIDA/main/config/prompt_strategies.json
✓ Successfully loaded 10 prompt strategies from GitHub
✓ Prompt strategies saved to config/prompt_strategies.json
✓ API keys loaded from Colab secrets
✓ Complete API configuration saved


In [14]:
# 3. Fix data loading and ensure questions and answers are properly loaded
import json
import os

print("\nChecking and fixing data files...")

# Define file paths (relative to the current working directory)
# Questions for the MIR-2024-v01-t01 set
questions_file = 'data/questions/MIR-2024-v01-t01.json'
# Raw answer key for the same set; this name is reassigned further down to
# the standardized copy once that file has been written
answer_key_file = 'data/answers/MIR-2024-v01-t01-answers.json'

# Function to safely load JSON files
def load_json_file(file_path):
    """Read `file_path` and return its parsed JSON content, or None on failure.

    Failures are reported with a printed message instead of an exception so
    the calling cell can carry on and decide what to do with ``None``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file is missing, is not
        valid JSON, or cannot be read for any other reason.
    """
    content = ""
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged and additionally strips a
        # leading byte-order mark, which json.loads would otherwise reject.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()
        # Check if content is valid JSON
        data = json.loads(content)
        return data
    except FileNotFoundError:
        print(f"✗ File not found: {file_path}")
        return None
    except json.JSONDecodeError as e:
        print(f"✗ JSON decode error in {file_path}: {e}")
        # Show the start of the file to make the malformed content obvious
        print(f"  First few characters of the file: {content[:100]}...")
        return None
    except Exception as e:
        print(f"✗ Error loading file {file_path}: {e}")
        return None

# Load questions
questions = load_json_file(questions_file)
if questions:
    print(f"✓ Successfully loaded {len(questions)} questions from {questions_file}")
    # Sample question
    print(f"  First question ID: {questions[0]['id']}")
    print(f"  Sample question: {questions[0]['question_text'][:100]}...")

# Defined up front so the matching step below can always test `answers`,
# even when the answer key could not be loaded (otherwise the name would be
# unbound, or left over from an earlier run of this cell).
answers = []

# Load answers - expecting a dictionary format where keys are question IDs and values are correct options
answers_dict = load_json_file(answer_key_file)
if answers_dict is not None and not isinstance(answers_dict, dict):
    # The conversion below relies on .items(); report any other shape clearly
    print(f"✗ Expected {answer_key_file} to contain a JSON object mapping question IDs to options, "
          f"got {type(answers_dict).__name__}")
    answers_dict = None

if answers_dict:
    print(f"✓ Successfully loaded answers from {answer_key_file}")
    print(f"  Answers contains {len(answers_dict)} question-answer pairs")

    # Verify it's in the expected format (sample a few entries)
    sample_count = min(5, len(answers_dict))
    sample_items = list(answers_dict.items())[:sample_count]
    print(f"  Sample format (first {sample_count} items):")
    for q_id, answer in sample_items:
        print(f"    {q_id}: {answer}")

    # Convert the dictionary to a list format for compatibility with the evaluator
    answers = [
        {"id": q_id, "correct_option": correct_option}
        for q_id, correct_option in answers_dict.items()
    ]

    print(f"✓ Converted answers to list format with {len(answers)} items")

    # Save the standardized answers
    standardized_answers_path = 'data/answers/MIR-2024-v01-t01-answers-standardized.json'
    with open(standardized_answers_path, 'w', encoding='utf-8') as f:
        json.dump(answers, f, indent=2)

    print(f"✓ Saved standardized answers to {standardized_answers_path}")
    # From here on the answer key path refers to the standardized copy
    answer_key_file = standardized_answers_path

# Check how many questions have matching answers
if questions and answers:
    question_ids = {q['id'] for q in questions}
    answer_ids = {a['id'] for a in answers}

    matched = question_ids.intersection(answer_ids)
    unmatched_questions = question_ids - answer_ids
    unmatched_answers = answer_ids - question_ids

    print(f"\n✓ Found {len(matched)} questions with matching answers out of {len(questions)} total questions")

    # IDs are listed only when there are few of them; they are sorted so the
    # output is stable between runs (set iteration order is not).
    if unmatched_questions:
        print(f"⚠ {len(unmatched_questions)} questions don't have matching answers")
        if len(unmatched_questions) <= 5:
            print(f"  Unmatched question IDs: {', '.join(sorted(map(str, unmatched_questions)))}")

    if unmatched_answers:
        print(f"⚠ {len(unmatched_answers)} answers don't have matching questions")
        if len(unmatched_answers) <= 5:
            print(f"  Unmatched answer IDs: {', '.join(sorted(map(str, unmatched_answers)))}")

# Store the file paths for later use
print("\nFiles ready for evaluation:")
print(f"Questions file: {questions_file}")
print(f"Answer key file: {answer_key_file}")


Checking and fixing data files...
✓ Successfully loaded 174 questions from data/questions/MIR-2024-v01-t01.json
  First question ID: MIR-2024-v01-t01-Q026
  Sample question: En relación con el metabolismo del hierro y su control mediado por hepcidina, es cierto que:...
✓ Successfully loaded answers from data/answers/MIR-2024-v01-t01-answers.json
  Answers contains 174 question-answer pairs
  Sample format (first 5 items):
    MIR-2024-v01-t01-Q026: B
    MIR-2024-v01-t01-Q027: C
    MIR-2024-v01-t01-Q028: A
    MIR-2024-v01-t01-Q029: B
    MIR-2024-v01-t01-Q030: C
✓ Converted answers to list format with 174 items
✓ Saved standardized answers to data/answers/MIR-2024-v01-t01-answers-standardized.json

✓ Found 174 questions with matching answers out of 174 total questions

Files ready for evaluation:
Questions file: data/questions/MIR-2024-v01-t01.json
Answer key file: data/answers/MIR-2024-v01-t01-answers-standardized.json


In [16]:
# 5. Set evaluation parameters and run sample evaluation
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set evaluation parameters
# Models to evaluate; the first entry is the one used for the sample run below.
# Commented-out entries are alternatives that are currently disabled.
models_to_test = [
    'gpt-4',  # You can change these based on what models you want to test
    'gpt-3.5-turbo',
    # 'claude-3-opus-20240229',
    # 'claude-3-sonnet-20240229'
]

# Prompt strategies to evaluate; the first entry is the one used for the sample run below.
prompt_strategies_to_test = [
    'Prompt-001',  # These should match the keys in your prompt_strategies.json
    'Prompt-002',
    # 'Prompt-003',
    # 'Prompt-004',
    # 'Prompt-005'
]

# Number of questions to evaluate (None for all questions, or a smaller number for testing)
sample_size = 5  # Start with a small sample to verify everything works

# Questions and answer files. These are set explicitly here, overriding
# whatever values an earlier cell left in these two names.
questions_file = 'data/questions/MIR-2024-v01-t01.json'
answer_key_file = 'data/answers/MIR-2024-v01-t01-answers-standardized.json'  # Use the standardized answers file

# Initialize the evaluator
# NOTE(review): `ModelEvaluator` is not defined or imported in any cell visible
# in this notebook — presumably it comes from a removed cell or from the cloned
# repository; confirm and add the definition/import so a fresh kernel can run this.
evaluator = ModelEvaluator()

# Run a single sample evaluation (first model, first prompt strategy)
print("\n----- RUNNING SAMPLE EVALUATION -----")
# Collects the result-file paths returned by the evaluator
result_files = []

# Run one sample evaluation to verify everything works
sample_model = models_to_test[0]
sample_strategy = prompt_strategies_to_test[0]

print(f"\nRunning sample evaluation with {sample_model} and {sample_strategy}...")
try:
    # The return value is used below as the path of a JSON results file
    result_file = evaluator.run_evaluation(
        questions_file=questions_file,
        answer_key_file=answer_key_file,
        prompt_strategy=sample_strategy,
        model=sample_model,
        sample_size=sample_size
    )
    result_files.append(result_file)
    print(f"✓ Sample evaluation complete. Results saved to: {result_file}")

    # Display basic results
    with open(result_file, 'r') as f:
        results = json.load(f)

    # The results file is expected to carry a 'summary' mapping with the keys read below
    summary = results['summary']
    print("\n----- SAMPLE EVALUATION RESULTS -----")
    print(f"Model: {summary['model']}")
    print(f"Prompt Strategy: {summary['prompt_strategy']}")
    print(f"Total Questions: {summary['total_questions']}")
    # 'accuracy' is multiplied by 100 for display, i.e. treated as a 0-1 fraction
    print(f"Correct Answers: {summary['correct_count']} ({summary['accuracy']*100:.2f}%)")
    print(f"Incorrect Answers: {summary['incorrect_count']}")
    print(f"Skipped Questions: {summary['skipped_count']}")
    print(f"Unknown/No clear answer: {summary['unknown_count']}")
    print(f"Total Score: {summary['total_score']}")

except Exception as e:
    # Any failure (evaluation, file read, missing summary key) is reported and
    # the notebook continues
    print(f"✗ Error during sample evaluation: {e}")

print("\nIf the sample evaluation looks good, you can run the full evaluation by uncommenting the code in the next cell.")

✓ OpenAI client initialized
✓ Anthropic client initialized
ModelEvaluator initialized

----- RUNNING SAMPLE EVALUATION -----

Running sample evaluation with gpt-4 and Prompt-001...

Evaluating gpt-4 with Prompt-001
Loaded 174 questions and 174 answers
Using prompt strategy: Spanish prompt for AI model taking standardized test
Sampled 5 questions for evaluation


Evaluating gpt-4:   0%|          | 0/5 [00:00<?, ?it/s]

Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 

In [17]:
# 6. Run full evaluation with all models and prompt strategies
# Every model is paired with every prompt strategy and evaluated on the
# complete question set; run this only once the sample evaluation looks right.

print("\n----- RUNNING FULL EVALUATION -----")
result_files = []

for model in models_to_test:
    for prompt_strategy in prompt_strategies_to_test:
        print(f"\nEvaluating {model} with {prompt_strategy}...")
        run_kwargs = dict(
            questions_file=questions_file,
            answer_key_file=answer_key_file,
            prompt_strategy=prompt_strategy,
            model=model,
            sample_size=None,  # Use all questions for the full evaluation
        )
        try:
            result_file = evaluator.run_evaluation(**run_kwargs)
            result_files.append(result_file)
            print(f"✓ Evaluation complete. Results saved to: {result_file}")
        except Exception as e:
            # A failing combination is reported and the sweep moves on
            print(f"✗ Error during evaluation: {e}")

print("\n----- FULL EVALUATION COMPLETE -----")
print(f"Generated {len(result_files)} result files")


----- RUNNING FULL EVALUATION -----

Evaluating gpt-4 with Prompt-001...

Evaluating gpt-4 with Prompt-001
Loaded 174 questions and 174 answers
Using prompt strategy: Spanish prompt for AI model taking standardized test
Using all 174 questions for evaluation


Evaluating gpt-4:   0%|          | 0/174 [00:00<?, ?it/s]

Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 

Evaluating gpt-4:   0%|          | 0/174 [00:00<?, ?it/s]

Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 

Evaluating gpt-3.5-turbo:   0%|          | 0/174 [00:00<?, ?it/s]

Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 

Evaluating gpt-3.5-turbo:   0%|          | 0/174 [00:00<?, ?it/s]

Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error getting model response: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 

In [None]:
# 7. Analyze results and export data for Tableau
# Run this cell after completing the evaluations

# Helper function to merge all result files
def merge_results(result_files):
    """Combine the contents of several evaluation result files.

    Each file is read as JSON; its ``'summary'`` entry is collected as one
    item and the items of its ``'results'`` entry are appended to a single
    flat list. A file that cannot be read or lacks an expected key is
    reported with a printed message and the remaining files are still
    processed.

    Args:
        result_files: Iterable of paths to JSON result files.

    Returns:
        Tuple ``(summaries, details)``: the list of summary entries and the
        flattened list of per-question result entries, in input order.
    """
    summaries, details = [], []

    for path in result_files:
        try:
            with open(path, 'r') as handle:
                payload = json.load(handle)

            summaries.append(payload['summary'])
            details += payload['results']
        except Exception as err:
            print(f"Error processing {path}: {err}")

    return summaries, details

# Get all result files if not already provided
if not result_files:
    result_dir = "data/results"
    if os.path.isdir(result_dir):
        # Sorted so the merge order (and therefore the row order of the
        # exports) does not depend on the directory listing order.
        result_files = sorted(
            os.path.join(result_dir, f)
            for f in os.listdir(result_dir)
            if f.endswith("_results.json")
        )
    else:
        # No results directory yet: nothing to collect, but do not crash here
        result_files = []
    print(f"Found {len(result_files)} result files in {result_dir}")

# Merge all results
summaries, details = merge_results(result_files)

# Everything below (plots, rankings, Tableau rows) indexes summary columns
# such as 'model' and 'accuracy'. Without a single summary, stop here with an
# actionable message instead of exporting empty CSVs and failing later inside
# the plotting code.
if not summaries:
    raise RuntimeError(
        "No evaluation summaries could be loaded; run the evaluation cells first "
        "or check the result files under data/results."
    )

# Create dataframes: one row per evaluation run, and one row per evaluated question
summary_df = pd.DataFrame(summaries)
details_df = pd.DataFrame(details)

# Export to CSV for Tableau; the timestamp keeps successive exports apart
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
export_dir = "data/exports"
os.makedirs(export_dir, exist_ok=True)

summary_csv = os.path.join(export_dir, f"evaluation_summary_{timestamp}.csv")
details_csv = os.path.join(export_dir, f"evaluation_details_{timestamp}.csv")

summary_df.to_csv(summary_csv, index=False)
details_df.to_csv(details_csv, index=False)

print(f"✓ Exported evaluation summary to: {summary_csv}")
print(f"✓ Exported evaluation details to: {details_csv}")

# Create visualizations
def _save_grouped_barplot(metric, title, file_stem, y_limits=None):
    """Plot `metric` per model, grouped by prompt strategy, then save and show it.

    Reads ``summary_df``, ``export_dir`` and ``timestamp`` from the notebook
    namespace at call time.

    Args:
        metric: Column of ``summary_df`` drawn on the y axis.
        title: Figure title.
        file_stem: File name prefix; the PNG is written to
            ``<export_dir>/<file_stem>_<timestamp>.png``.
        y_limits: Optional ``(low, high)`` pair passed to ``plt.ylim``.

    Returns:
        Path of the saved PNG.
    """
    plt.figure(figsize=(14, 10))
    sns.barplot(x='model', y=metric, hue='prompt_strategy', data=summary_df)
    plt.title(title, fontsize=16)
    plt.xticks(rotation=45)
    if y_limits is not None:
        plt.ylim(*y_limits)
    # Legend sits outside the axes so it does not cover the bars
    plt.legend(title='Prompt Strategy', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()

    # Save the plot
    saved_to = os.path.join(export_dir, f"{file_stem}_{timestamp}.png")
    plt.savefig(saved_to, dpi=300)
    plt.show()
    return saved_to


# Accuracy is plotted on a fixed 0-1 axis
plot_path = _save_grouped_barplot(
    'accuracy', 'Model Accuracy by Prompt Strategy', 'accuracy_comparison', y_limits=(0, 1)
)

# Additional visualization: Total scores
score_plot_path = _save_grouped_barplot(
    'total_score', 'Total Score by Model and Prompt Strategy', 'score_comparison'
)

# Display best performing model-strategy combinations
print("\n----- BEST PERFORMING COMBINATIONS -----")


def _as_percent(fraction):
    """Render a 0-1 fraction as a percentage string with two decimals."""
    return f"{fraction*100:.2f}%"


def _tableau_record(metric_type, row):
    """Build one Tableau export row of type `metric_type` from a summary row."""
    return {
        'Metric Type': metric_type,
        'Model': row['model'],
        'Prompt Strategy': row['prompt_strategy'],
        'Value': row['accuracy'],
        'Display Value': _as_percent(row['accuracy']),
        'Count': row['total_questions'],
        'Score': row['total_score'],
    }


summary_df['formatted_accuracy'] = summary_df['accuracy'].apply(_as_percent)
ranked_by_accuracy = summary_df.sort_values('accuracy', ascending=False)
best_combos = ranked_by_accuracy[['model', 'prompt_strategy', 'formatted_accuracy', 'total_score']].head(5)
print(best_combos)

# Export enhanced data for Tableau
# Create additional metrics and aggregations
models_in_order = summary_df['model'].unique()
strategies_in_order = summary_df['prompt_strategy'].unique()
tableau_data = []

# Model performance by prompt strategy: every run, grouped by model
for model in models_in_order:
    runs_for_model = summary_df[summary_df['model'] == model]
    tableau_data.extend(
        _tableau_record('Model Performance', row) for _, row in runs_for_model.iterrows()
    )

# Best prompt strategy by model: the highest-accuracy run of each model
for model in models_in_order:
    runs_for_model = summary_df[summary_df['model'] == model]
    top_run = runs_for_model.loc[runs_for_model['accuracy'].idxmax()]
    tableau_data.append(_tableau_record('Best Prompt Strategy', top_run))

# Best model by prompt strategy: the highest-accuracy run of each strategy
for strategy in strategies_in_order:
    runs_for_strategy = summary_df[summary_df['prompt_strategy'] == strategy]
    top_run = runs_for_strategy.loc[runs_for_strategy['accuracy'].idxmax()]
    tableau_data.append(_tableau_record('Best Model', top_run))

# Overall best combination
best_overall = summary_df.loc[summary_df['accuracy'].idxmax()]
tableau_data.append(_tableau_record('Best Overall Combination', best_overall))

# Export the enhanced data
tableau_df = pd.DataFrame(tableau_data)
tableau_enhanced_csv = os.path.join(export_dir, f"tableau_enhanced_{timestamp}.csv")
tableau_df.to_csv(tableau_enhanced_csv, index=False)
print(f"✓ Exported enhanced Tableau data to: {tableau_enhanced_csv}")

# Prepare results for a push to the GitHub repository. Nothing is pushed
# here: the files are copied into one folder and the git commands to run are
# printed.
try:
    # Imported before any copy so it is available even when there are no
    # result files to iterate over.
    import shutil

    # Define GitHub repository and target directory
    # (informational only; not used by the code below)
    github_repo = "armelida/MELIDA"
    target_path = "data/results/prompting-strategy"

    # Copy files to a directory that will be pushed
    github_export_dir = os.path.join("data", "github_export")
    os.makedirs(github_export_dir, exist_ok=True)

    # Copy result files
    for result_file in result_files:
        shutil.copy2(result_file, github_export_dir)

    # Copy CSV files for Tableau
    for csv_path in (summary_csv, details_csv, tableau_enhanced_csv):
        shutil.copy2(csv_path, github_export_dir)

    print(f"\n✓ Prepared {len(os.listdir(github_export_dir))} files for GitHub export")
    print("To push results to GitHub, run the following commands:")
    print(f"!git add {github_export_dir}")
    print("!git commit -m \"Add evaluation results from $(date)\"")
    print("!git push origin main")

except Exception as e:
    # Export preparation is best-effort; report the problem and carry on
    print(f"Error preparing GitHub export: {e}")