In [None]:
import sys, os
# add code root to python path
sys.path.append('code/')
import numpy as np
from eval.eval_contrastive_answer import ContrastiveAnswerEvaluator
from eval.eval_contrastive_full import ContrastiveFullEvaluator
from core.utils import ResourceManager
from core.models import CostManager
from core.card import GenerativeCard
from core.data import MMLUBatch, load_mmlu_batches, load_batches
import time
import re
from tqdm.notebook import tqdm, trange
import json
import pandas as pd

# html display
from IPython.display import display, HTML

## EXPERIMENT SETUP

In [None]:

# Full roster of models that can take part in the pairwise comparison.
models = [
    "gpt-4o",
    "claude-3-opus-20240229",
    "gpt-4-turbo",
    "Meta-Llama-3-70B-Instruct",
    "gpt-3.5-turbo",
    "Meta-Llama-3-8B-Instruct",
    "Mixtral-8x7B-Instruct-v0.1",
    "Mistral-7B-Instruct-v0.2",
    "gemma-1.1-7b-it",
]

# Topic catalogues, grouped by dataset family. They are kept for reference;
# the list that is actually evaluated is `topics` below.
mmlu_topics = [
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "high_school_world_history",
    "machine_learning",
    "college_mathematics",
]

anthropic_topics = [
    "corrigible-less-HHH",
    "myopic-reward",
    "power-seeking-inclination",
    "survival-instinct",
    "self-awareness-general-ai",
]

# The strings are kept exactly as they are (spelling included) because topic
# names are interpolated into file paths elsewhere in this notebook.
openend_topics = [
    "Writing efficient code for solving concrete algorthimic problems",
    "Crafting engaging and contextually appropriate jokes",
    "Providing dating advice",
    "Roleplaying as a fictional character",
]

# Dataset family; becomes the sub-folder under `datasets/` when batches are loaded.
meta = "gsm8k"

# Topics that the experiment loop iterates over.
topics = ["gsm8k"]

# Guesser model handed to the evaluator (alternative tried: 'gpt-4o').
guesser = "meta-llama/Meta-Llama-3-70B-Instruct"
# Evaluator name; part of the folder path used to locate generated cards.
evaluator = "gpt"
# Whether the contrastive evaluation is run with chain-of-thought.
use_cot = False

## Working directory (switch to the parent folder)

In [None]:
# Move the working directory to the parent of the folder the notebook started in.
#
# The starting directory is remembered in `_NOTEBOOK_START_DIR` the first time
# this cell runs. Without that, every re-execution of the cell would call
# `os.getcwd()` again and climb one more level up the directory tree.
try:
    current_dir = _NOTEBOOK_START_DIR
except NameError:
    # First execution in this kernel: record where we started.
    current_dir = _NOTEBOOK_START_DIR = os.getcwd()
print("Current Directory:", current_dir)

# Change the current working directory to the parent directory
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)

## Helper

In [None]:
def load_all_batches(meta, topic, models, fraction='test'):
    """Load one batch per model for ``topic`` from ``datasets/{meta}/``.

    Args:
        meta: Dataset family; used as the sub-folder of ``datasets/``.
        topic: Topic name forwarded to ``load_batches``.
        models: Iterable of model names.
        fraction: Data split to load. ``'test'`` requests size 60, any other
            value requests size 40 (presumably the batch size — confirm
            against ``load_batches``).

    Returns:
        dict mapping each model name to the first element returned by
        ``load_batches`` for that model.
    """
    size = 40 if fraction != 'test' else 60
    return {
        model: load_batches(f"datasets/{meta}/", topic, model, fraction, [size], False)[0]
        for model in models
    }


def get_latest_folder(topic, optim_method, card_format, evaluator, model, generation_method='generative'):
    """Return the path of the most recent run folder for one model, or ``None``.

    Looks inside
    ``outputs/{generation_method}/{topic}/{optim_method}/{card_format}/{evaluator}/{model}``
    and considers only entries whose name starts with a
    ``dd-dd_dd-dd-dd_`` prefix (presumably a month-day_hour-minute-second
    stamp). "Most recent" means the lexicographically greatest such name.

    Raises whatever ``os.listdir`` raises when the directory does not exist.
    """
    folder_root = f'outputs/{generation_method}/{topic}/{optim_method}/{card_format}/{evaluator}/{model}'
    stamped = re.compile(r"\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_")

    # Descending name order puts the newest stamp first.
    newest = next(
        (name for name in sorted(os.listdir(folder_root), reverse=True) if stamped.match(name)),
        None,
    )
    return None if newest is None else f'{folder_root}/{newest}'

def load_cards(topic,
               generation_method,
               card_format,
               models,
               evaluator,
               method='contrastive'):
    """Load the generated card of every model as a string.

    For each model the most recent run folder is located with
    ``get_latest_folder`` (``generation_method`` is passed as that function's
    ``optim_method``; the top-level output folder is always ``'generative'``).
    The card is read from ``<run folder>/cards/epoch_<N>_card.json``, where
    ``N`` is 4 for ``generation_method == "prog-reg"`` and 0 otherwise.

    Args:
        topic: Topic whose cards are loaded.
        generation_method: Optimisation method used to produce the cards.
        card_format: Card format folder name (e.g. ``'bullet_point'``).
        models: Iterable of model names.
        evaluator: Evaluator folder name.
        method: Unused; kept for backward compatibility.

    Returns:
        dict mapping model name to ``str(GenerativeCard(d=<loaded json>))``.

    Raises:
        FileNotFoundError: if no run folder exists for a model, or the card
            file is missing.
    """
    epoch = 4 if generation_method == "prog-reg" else 0

    cards = {}
    for model in models:
        path = get_latest_folder(topic, generation_method, card_format, evaluator, model, 'generative')
        if path is None:
            # Previously this fell through to `None + str` and failed with an
            # uninformative TypeError.
            raise FileNotFoundError(
                f"No run folder found for model '{model}' "
                f"(topic='{topic}', method='{generation_method}', "
                f"format='{card_format}', evaluator='{evaluator}')"
            )
        with open(f'{path}/cards/epoch_{epoch}_card.json', 'r') as f:
            cards[model] = str(GenerativeCard(d=json.load(f)))

    return cards

def format_few_shot_str(batch, index, cnt):
    """Render one few-shot example (question plus the model's completion).

    Args:
        batch: Object exposing ``get_question(index)`` and
            ``get_model_reasoning(index)``, both returning strings.
        index: Position of the example inside ``batch``.
        cnt: Zero-based example counter; shown one-based in the text.

    Returns:
        Two newline-terminated lines: the question and the student's completion.
    """
    number = cnt + 1
    pieces = (
        f'Question {number}: ',
        batch.get_question(index),
        '\n',
        f"Student's Completion of Question {number}: ",
        batch.get_model_reasoning(index),
        '\n',
    )
    return ''.join(pieces)
def load_few_shot(batches,
                models,
                k_shots=5,
                seed=42):
    """Build a few-shot prompt string for every model.

    The same ``k_shots`` example indices are used for all models. They are
    drawn without replacement from ``range(len(batches[models[0]]))``.

    NOTE: this reseeds NumPy's *global* random generator with ``seed`` as a
    side effect, so any later ``np.random`` call in the session is affected.

    Args:
        batches: Mapping from model name to a batch object.
        models: Sequence of model names; the first one fixes the index range.
        k_shots: Number of examples per model.
        seed: Seed applied to ``np.random`` before sampling.

    Returns:
        dict mapping model name to the concatenated ``format_few_shot_str``
        output of the selected examples.
    """
    np.random.seed(seed)
    pool_size = len(batches[models[0]])
    indices = np.random.choice(pool_size, k_shots, replace=False)

    return {
        model: ''.join(
            format_few_shot_str(batches[model], example_idx, position)
            for position, example_idx in enumerate(indices)
        )
        for model in models
    }
    

def create_experiment(all_batches, topic, all_cards, models, evaluators, cot, cm, k_shots=3, few_shot=False, paraphrase=False):
    """Build a ``ContrastiveFullEvaluator`` for one ordered pair of models.

    Args:
        all_batches: Mapping model name -> batch.
        topic: Topic being evaluated.
        all_cards: Mapping model name -> card (or few-shot) string.
        models: Pair ``(model_1, model_2)``; only the first two entries are used
            for look-ups, the whole object is forwarded to the evaluator.
        evaluators: Forwarded unchanged to the evaluator.
        cot: Selects the resource experiment name
            (``contrastive-full-cot`` vs ``contrastive-full-no-cot``) and is
            forwarded as ``cot``.
        cm: Cost manager forwarded to the evaluator.
        k_shots, few_shot, paraphrase: Forwarded as keyword arguments.

    Notes:
        The dataset label passed to the evaluator is always ``"mmlu"`` and
        ``max_workers`` is fixed at 60, regardless of the caller's settings.

    Returns:
        The constructed ``ContrastiveFullEvaluator``.
    """
    pair = (0, 1)
    cards = tuple(all_cards[models[i]] for i in pair)
    batches = tuple(all_batches[models[i]] for i in pair)

    if cot:
        exp = 'contrastive-full-cot'
    else:
        exp = 'contrastive-full-no-cot'

    resources = ResourceManager(exp_name=exp, name=f"{models[0]}-{models[1]}")
    options = dict(
        cot=cot,
        k_shots=k_shots,
        max_workers=60,
        few_shot=few_shot,
        paraphrase=paraphrase,
    )

    return ContrastiveFullEvaluator(
        "mmlu", topic, batches, models, cards, evaluators, resources, cm, **options
    )

In [None]:
def _load_recorded_runs(file_name):
    """Return the rows already saved in ``file_name`` as a list of dicts (empty if the file is absent)."""
    if not os.path.exists(file_name):
        return []
    saved = pd.read_csv(file_name)
    print(saved.head())
    return saved.to_dict('records')


def run_experiments(meta, topics, models, guesser,  evaluator='gpt', use_cot=True, cm=None, k_shots=3,
                    few_shot=False, max_worker=60,  generation_method='prog-reg', experiment_name='mmlu_baseline'):
    """Run the contrastive evaluation for every ordered pair of distinct models.

    Results for each topic are stored in
    ``exp_results/{experiment_name}/full_baseline_{topic}[_few_shot].csv``.
    The function is resumable: pairs already present in that file are skipped,
    and every row already in the file is preserved when it is rewritten (also
    rows for models that are not part of the current ``models`` list).

    Args:
        meta: Dataset family, forwarded to ``load_all_batches``.
        topics: Iterable of topic names.
        models: Sequence of model names to compare pairwise.
        guesser: Guesser model name; passed to the evaluator and recorded in
            each result row.
        evaluator: Evaluator folder name used to locate cards.
        use_cot: Whether the evaluation runs with chain-of-thought.
        cm: Cost manager. When ``None`` a new ``CostManager()`` is created per
            call (a default built in the signature would be created once at
            definition time and silently shared by all calls).
        k_shots: Forwarded to ``create_experiment``.
        few_shot: Use few-shot examples from the train split instead of cards.
            The number of examples is 4 (2 for ``high_school_world_history``).
        max_worker: Forwarded to the evaluator's ``main`` as ``max_workers``.
        generation_method: Card generation method, used to locate cards.
        experiment_name: Sub-folder of ``exp_results/`` for the output files.

    Returns:
        None. Results are written to CSV.
    """
    if cm is None:
        cm = CostManager()

    out_dir = f'exp_results/{experiment_name}'
    n_models = len(models)

    for topic in tqdm(topics):
        os.makedirs(out_dir, exist_ok=True)
        file_name = f'{out_dir}/full_baseline_{topic}{"_few_shot" if few_shot else ""}.csv'

        # Start from everything already on disk so a rewrite never drops rows.
        run_records = _load_recorded_runs(file_name)
        recorded_pairs = {(rec['model_1'], rec['model_2']) for rec in run_records}

        # Skip the topic (before any expensive loading) when every pair of the
        # *current* models is already recorded.
        wanted_pairs = [
            (models[i], models[j])
            for i in range(n_models)
            for j in range(n_models)
            if i != j
        ]
        if all(pair in recorded_pairs for pair in wanted_pairs):
            continue

        all_batches = load_all_batches(meta, topic, models)

        if few_shot:
            shots = 4 if topic != 'high_school_world_history' else 2
            all_training_batches = load_all_batches(meta, topic, models, 'train')
            all_cards = load_few_shot(all_training_batches, models, k_shots=shots)
        else:
            all_cards = load_cards(topic, generation_method, 'bullet_point', models, evaluator)

        for index_1 in trange(n_models):
            for index_2 in trange(n_models):
                if index_1 == index_2:
                    continue
                model_1 = models[index_1]
                model_2 = models[index_2]
                if (model_1, model_2) in recorded_pairs:
                    print(f"Skipping {model_1} vs {model_2}")
                    continue

                exp = create_experiment(all_batches,
                                        topic,
                                        all_cards,
                                        (model_1, model_2),
                                        [guesser],
                                        use_cot,
                                        cm,
                                        k_shots,
                                        few_shot,
                                        paraphrase=False)
                # The first element of the returned list holds
                # (overall accuracy, model_1 accuracy, model_2 accuracy).
                metrics = exp.main(num_samples=120,
                                   max_workers=max_worker,)[0]

                run_records.append({'model_1': model_1, 'model_2': model_2, 'accuracy': metrics[0],
                                    'model_1_accuracy': metrics[1], 'model_2_accuracy': metrics[2],
                                    'guesser': guesser})

                # Persist right away so an interruption during the pause below
                # cannot lose the result that was just computed.
                pd.DataFrame(run_records).to_csv(file_name, index=False)

                # Pause between runs (presumably to stay under API rate limits).
                time.sleep(10)

In [None]:
# Notebook-level CostManager instance. `run_experiments` only uses this object
# when it is passed in through its `cm` argument.
cm = CostManager()

## Run

In [None]:
# Guesser model handed to `run_experiments`.
# Alternatives tried: 'HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1', 'gpt-4o'.
guesser = "meta-llama/Meta-Llama-3-70B-Instruct"


In [None]:
# Settings for this run. They override the values from the setup cell.
meta = "openend"                 # dataset family (sub-folder of `datasets/`)
experiment_name = "openend_cot"  # results folder; earlier: 'mmlu_claude_cot_llama'
use_cot = True

# Models compared in this run. Switched off for now:
#   claude-3-opus-20240229, gpt-4-turbo, Meta-Llama-3-70B-Instruct,
#   gpt-3.5-turbo, Meta-Llama-3-8B-Instruct, Mixtral-8x7B-Instruct-v0.1,
#   gemma-1.1-7b-it
models = [
    "gpt-4o",
    "Mistral-7B-Instruct-v0.2",
]

# Topics evaluated in this run. Earlier alternatives:
#   - 'corrigible-less-HHH', 'myopic-reward', 'power-seeking-inclination',
#     'survival-instinct', 'self-awareness-general-ai'
#   - 'high_school_mathematics', 'high_school_physics',
#     'high_school_chemistry', 'machine_learning'
#   - 'Providing dating advice'
topics = ["Analyzing financial Reports"]

In [None]:
# from core.models import select_model
# model = select_model(system_prompt='You are a helpful assistant', model_name=guesser)
# model('Who are you?')

In [None]:
# Evaluator name for this run; it is a path component when generated cards are located.
# Alternative guesser tried at this point: 'qwen2-72b-instruct'.
evaluator = "claude"

In [None]:
# Launch the pairwise evaluation with the settings chosen above.
# `cm` is passed explicitly so the notebook-level CostManager is the instance
# used by the run; without it the function falls back to its own default.
run_experiments(
    meta, topics, models, guesser, evaluator, use_cot,
    cm=cm,
    few_shot=False,
    k_shots=3,
    generation_method='prog-reg',
    experiment_name=experiment_name,
    max_worker=10,
)