In [None]:
import sys, os
# add code root to python path
sys.path.append('code/')
import numpy as np
from eval.eval_preferential import PreferentialEvaluator
from core.utils import ResourceManager
from core.models import CostManager
from core.card import GenerativeCard
from core.data import MMLUBatch, load_mmlu_batches, load_batches
import time
import re
from tqdm.notebook import tqdm, trange
import json
import pandas as pd

# html display
from IPython.display import display, HTML

## Experiment helper + Setup

In [None]:

# Identifiers of the models being compared.
models = [
    'gpt-4o',
    'claude-3-opus-20240229',
    'gpt-4-turbo',
    'Meta-Llama-3-70B-Instruct',
    'gpt-3.5-turbo',
    'Meta-Llama-3-8B-Instruct',
    'Mixtral-8x7B-Instruct-v0.1',
    'Mistral-7B-Instruct-v0.2',
    'gemma-1.1-7b-it',
]

# Topic names, grouped by dataset family.
mmlu_topics = [
    'high_school_mathematics',
    'high_school_physics',
    'high_school_chemistry',
    'high_school_biology',
    'high_school_world_history',
    'machine_learning',
    'college_mathematics',
]

anthropic_topics = [
    'corrigible-less-HHH',
    'myopic-reward',
    'power-seeking-inclination',
    'survival-instinct',
    'self-awareness-general-ai',
]

# NOTE(review): the spelling 'algorthimic' is kept verbatim -- topic strings are
# presumably used as lookup keys / folder names, so confirm before correcting it.
openend_topics = [
    'Writing efficient code for solving concrete algorthimic problems',
    'Crafting engaging and contextually appropriate jokes',
    'Providing dating advice',
    'Roleplaying as a fictional character',
]

# Model that acts as the guesser (alternative tried earlier: 'gpt-4o').
guesser = 'meta-llama/Meta-Llama-3-70B-Instruct'
evaluator = 'gpt'
use_cot = False

In [None]:
def change_dir():
    """Print the current working directory, then move the process to its parent.

    Every call climbs one more level, so calling it twice leaves the process
    two directories above where it started.
    """
    here = os.getcwd()
    print("Current Directory:", here)
    os.chdir(os.path.dirname(here))

In [None]:
def load_all_batches(meta, topic, models, fraction='test'):
    """Load one batch per model for `topic` and return them keyed by model name.

    Args:
        meta: Dataset family; batches are read from ``datasets/{meta}/``.
        topic: Topic name forwarded to ``load_batches``.
        models: Iterable of model names.
        fraction: Data split name. ``'test'`` requests a batch size of 60,
            anything else a batch size of 40.

    Returns:
        dict mapping each model name to the first batch returned by
        ``load_batches`` for that model.
    """
    batch_size = 60 if fraction == 'test' else 40
    dataset_root = f"datasets/{meta}/"
    # The trailing False is forwarded untouched; its meaning is defined by load_batches.
    return {
        model: load_batches(dataset_root, topic, model, fraction, [batch_size], False)[0]
        for model in models
    }


def get_latest_folder(topic, optim_method, card_format, evaluator, model, generation_method='generative'):
    """Return the path of the newest timestamp-named entry of a run directory.

    The directory searched is
    ``outputs/{generation_method}/{topic}/{optim_method}/{card_format}/{evaluator}/{model}``.
    Its entries are ordered by name, descending, and the first one whose name
    starts with ``DD-DD_DD-DD-DD_`` (D = digit; presumably a month-day_time
    stamp) is selected.

    Returns:
        ``'{directory}/{entry}'`` for the selected entry, or None when no entry
        name matches the pattern.
    """
    folder_root = f'outputs/{generation_method}/{topic}/{optim_method}/{card_format}/{evaluator}/{model}'
    stamp = re.compile(r"\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_")
    newest_first = sorted(os.listdir(folder_root), reverse=True)
    latest = next((entry for entry in newest_first if stamp.match(entry)), None)
    if latest is None:
        return None
    return f'{folder_root}/{latest}'

def load_cards(topic,
               generation_method,
               card_format,
               models,
               evaluator,
               epoch=4):
    """Load the saved card of a given epoch for every model.

    For each model the newest run folder is located with ``get_latest_folder``
    and ``cards/epoch_{epoch}_card.json`` inside it is parsed into a
    ``GenerativeCard``.

    Args:
        topic: Topic name used in the output path.
        generation_method: Passed to ``get_latest_folder`` as its
            ``optim_method`` argument (the folder lookup always uses
            ``'generative'`` as the top-level generation method).
        card_format: Card format name used in the output path.
        models: Iterable of model names.
        evaluator: Evaluator name used in the output path.
        epoch: Epoch whose card file is read.

    Returns:
        dict mapping each model name to its ``GenerativeCard``.

    Raises:
        FileNotFoundError: if no run folder is found for a model, or if the
            card file for the requested epoch does not exist.
    """
    cards = {}
    for model in models:
        path = get_latest_folder(topic, generation_method, card_format, evaluator, model, 'generative')
        if path is None:
            # get_latest_folder returns None when nothing matches; fail with a
            # readable message instead of a TypeError from `None + str`.
            raise FileNotFoundError(
                f'No run folder found for model {model!r} '
                f'(topic={topic!r}, method={generation_method!r}, '
                f'card_format={card_format!r}, evaluator={evaluator!r})'
            )
        card_file = path + f'/cards/epoch_{epoch}_card.json'
        with open(card_file, 'r') as f:
            cards[model] = GenerativeCard(d=json.load(f))

    return cards

def format_few_shot_str(batch, index, cnt):
    """Render one few-shot example: the question followed by the student's completion.

    Args:
        batch: Object exposing ``get_question(index)`` and
            ``get_model_reasoning(index)``, both returning strings.
        index: Position of the example inside ``batch``.
        cnt: Zero-based example counter; it is shown one-based in the text.

    Returns:
        Two newline-terminated lines as a single string.
    """
    number = cnt + 1
    question_line = f'Question {number}: ' + batch.get_question(index) + '\n'
    completion_line = f"Student's Completion of Question {number}: " + batch.get_model_reasoning(index) + '\n'
    return question_line + completion_line
def load_few_shot(batches,
                models,
                k_shots=5,
                seed=42):
    """Build a few-shot prompt string for every model from shared example indices.

    NumPy's global random state is reseeded with ``seed``. ``k_shots`` distinct
    indices are then drawn once, using the length of the first model's batch,
    and the same indices are reused for every model.

    Args:
        batches: Mapping from model name to its batch.
        models: Sequence of model names; ``models[0]`` determines the index range.
        k_shots: Number of examples per prompt.
        seed: Seed applied to NumPy's global random state.

    Returns:
        dict mapping each model name to the concatenation of its formatted examples.
    """
    np.random.seed(seed)
    picked = np.random.choice(len(batches[models[0]]), k_shots, replace=False)

    return {
        model: ''.join(
            format_few_shot_str(batches[model], example_idx, position)
            for position, example_idx in enumerate(picked)
        )
        for model in models
    }
    

def create_experiment(meta, topic, model, batch, guesser_name, cot, k_shots=3, few_shot=False, paraphrase=False):
    """Build the PreferentialEvaluator used for one card-vs-card comparison.

    Args:
        meta: Dataset family, forwarded as ``meta``.
        topic: Topic name, forwarded as ``topic``.
        model: Name of the model under evaluation, forwarded as ``model``.
        guesser_name: Forwarded as the evaluator's ``evaluator_name``.
        batch, cot, k_shots, few_shot, paraphrase: Accepted so existing call
            sites keep working, but not used by this function.

    Returns:
        A new ``PreferentialEvaluator`` backed by a fresh ``ResourceManager``.
    """
    return PreferentialEvaluator(meta=meta,
                                 topic=topic,
                                 model=model,
                                 rm=ResourceManager(),
                                 evaluator_name=guesser_name)

In [None]:
# Move the working directory up one level so the relative 'datasets/' and
# 'outputs/' paths used by the helpers resolve against the parent directory.
# Run this cell once per kernel: every re-run climbs one more level.
change_dir()

## Setup

In [None]:
# Dataset family (selects the datasets/{meta}/ folder) and the topic list to draw from.
meta = 'mmlu'
topics = mmlu_topics

In [None]:
# One batch per model, for the first topic only (fraction defaults to 'test').
all_batches = load_all_batches(meta, topics[0], models)

In [None]:
# Default card settings. NOTE(review): the cells shown after this one take their
# settings from `cards_setups` and do not read these names -- confirm whether
# they are still needed.
generation_method = 'prog-reg'
card_format = 'bullet_point'
evaluator = 'gpt'
epoch = 4

In [None]:

# Card configurations to compare against each other. Each dict is later passed
# as player_a / player_b, and its str() form is used as a lookup key, so the
# key order (evaluator, card_format, iterative_method, epoch) must stay fixed.
# The prog-reg setups contrast epoch 0 with epoch 4.
cards_setups = [
    {'evaluator': 'gpt', 'card_format': fmt, 'iterative_method': method, 'epoch': ep}
    for fmt, method, ep in [
        ('bullet_point', 'prog-reg', 0),
        ('bullet_point', 'prog-reg', 4),
        ('dict', 'prog-reg', 0),
        ('dict', 'prog-reg', 4),
        ('bullet_point', 'one-pass', 0),
        ('dict', 'one-pass', 0),
    ]
]

In [None]:
# Cards for every setup, keyed by the setup's str() form and then by model name.
all_cards = {}
for setup in cards_setups:
    cards = load_cards(
        topic=topics[0],
        # load_cards calls this argument `generation_method`; the setups call it `iterative_method`.
        generation_method=setup['iterative_method'],
        card_format=setup['card_format'],
        models=models,
        evaluator=setup['evaluator'],
        epoch=setup['epoch'],
    )
    all_cards[str(setup)] = cards

In [None]:
# Compare every ordered pair of distinct card setups for a single model.
model = models[0]
batch = all_batches[model]  # identical for every comparison, so look it up once

# Results keyed by run name, so each comparison stays inspectable after the
# loop instead of being overwritten by the next iteration.
pairwise_results = {}

for setup1 in tqdm(cards_setups):
    for setup2 in tqdm(cards_setups):
        if setup1 == setup2:
            continue

        cards = (all_cards[str(setup1)][model], all_cards[str(setup2)][model])
        # Bound to its own name so the `evaluator` config string defined in the
        # setup cells is not replaced by an evaluator object.
        pref_evaluator = create_experiment(meta, topics[0], model, batch, guesser, use_cot)
        name = f'{model}_{str(setup1)}_{str(setup2)}'

        rslt = pref_evaluator.main(name=name,
                                   batch=batch,
                                   card1=cards[0],
                                   card2=cards[1],
                                   player_a=setup1,
                                   player_b=setup2,)
        pairwise_results[name] = rslt

        # Pause between comparisons (presumably to stay under API rate limits).
        time.sleep(5)
        
