In [None]:
from datetime import datetime
import json
from utils import load_all_llm_answers_from_json
import pandas as pd

## TO DO
- Heatmap across models showing whether they get the same questions wrong
- Add probabilistic outputs to each model answer
- Add "abstain" option for all models

In [None]:
# Directory holding the auto-eval output JSON files for this benchmark run.
answers_save_path = './2024-07-20-Multi-Benchmark/auto_eval_outputs'

# Load the per-model answers, passing the 'auto_eval-' prefix to the loader's
# `prefix_replace` argument.
all_llm_answers = load_all_llm_answers_from_json(
    answers_save_path,
    prefix_replace='auto_eval-',
)

all_llm_answers

In [None]:
# Peek at the first row of the first model's answers.
list(all_llm_answers.values())[0].head(1)

In [None]:
# # Sanity check to see if the scores are being calculated correctly in the cells below after a lot of manipulation
# model_scores = []
# for llm_model in list(all_llm_answers.keys()):
#     correct_answer = all_llm_answers[llm_model]['correct_letter'] == all_llm_answers[llm_model]['json_answer_letter']
#     percentage_correct = correct_answer.value_counts(normalize=True)[True] * 100
#     initial_score = all_llm_answers[llm_model]['score'].mean()
#     dict_results = {
#         'model': llm_model,
#         'percentage_correct': percentage_correct,
#         'initial_score': initial_score
#     }
#     model_scores.append(dict_results)

# model_scores_series = pd.DataFrame(model_scores).sort_values(by='percentage_correct', ascending=False)
# model_scores_series

In [None]:
# For every model, convert each answered question into a record whose answer
# letters are "normalised": letters are assigned by the alphabetical order of
# the choice texts rather than by the order in which the choices were shown.
moe_llm_answers = {}
for llm_model in list(all_llm_answers.keys()):
    answer_subset = all_llm_answers[llm_model]
    # Selecting the columns up front also fails fast if any of them is missing.
    data = answer_subset[[
        'index', 'question', 'multiple_choice', 'correct_answer',
        'multi_choice_question', 'correct_letter', 'json_answer_letter',
    ]]
    model_records = moe_llm_answers[llm_model] = []
    for idx, row in data.iterrows():
        letters = ['A', 'B', 'C', 'D']
        # choice text -> letter by sorted choice text
        text_to_normalized_letter = dict(zip(sorted(row['multiple_choice']), letters))
        # letter -> choice text in the order the choices were presented
        letter_to_presented_text = dict(zip(letters, row['multiple_choice']))

        correct_answer = letter_to_presented_text[row['correct_letter']]
        is_correct = row['correct_letter'] == row['json_answer_letter']
        # The stored correct answer must agree with the one derived from the letter.
        assert row['correct_answer'] == correct_answer
        normalized_correct_letter = text_to_normalized_letter[correct_answer]

        answered_letter = row['json_answer_letter']
        if answered_letter in letter_to_presented_text:
            selected_answer = letter_to_presented_text[answered_letter]
            normalized_choice_letter = text_to_normalized_letter[selected_answer]
        else:
            # The model's answer is not one of the presented letters.
            selected_answer, normalized_choice_letter = None, None

        model_records.append({
            'question': row['question'],
            'correct': is_correct,
            'normalized_correct_letter': normalized_correct_letter,
            'normalized_choice_letter': normalized_choice_letter,
        })

In [None]:
# Combine the per-model records into one wide frame with two-level columns:
# shared benchmark columns live under 'bench', each model's columns under the
# model's own name.
#
# `None` marks "nothing merged yet". An emptiness check is not safe here: if a
# merge ever produced zero rows, the loop would silently start over from the
# next model and the length assertion below could still pass.
moe_llm_answers_comb_df = None
for llm_model in list(all_llm_answers.keys()):
    # reset_index exposes the positional row number as an 'index' column,
    # which is then used as part of the merge key.
    data = pd.DataFrame(moe_llm_answers[llm_model]).reset_index(drop=False)
    columns_mapping = {
        'index': ('bench', 'index'),
        'question': ('bench', 'question'),
        'normalized_correct_letter': ('bench', 'normalized_correct_letter'),
        'normalized_choice_letter': (llm_model, 'normalized_choice_letter'),
        'correct': (llm_model, 'correct'),
    }
    grouped_columns_map = [columns_mapping[col] for col in data.columns if col in columns_mapping]
    data.columns = pd.MultiIndex.from_tuples(grouped_columns_map)
    if moe_llm_answers_comb_df is None:
        moe_llm_answers_comb_df = data
    else:
        moe_llm_answers_comb_df = pd.merge(moe_llm_answers_comb_df, data, on=[
            ('bench', 'index'), ('bench', 'question'), ('bench', 'normalized_correct_letter')
        ])

assert moe_llm_answers_comb_df is not None, 'No model answers were loaded, so there is nothing to merge'
assert len(moe_llm_answers_comb_df) == len(all_llm_answers[list(all_llm_answers.keys())[0]]), 'Unable to reliably merge based on index, question, and normalized_correct_letter. This might suggest that the data is inconsistent or inaccurate'

moe_llm_answers_comb_df

In [None]:
# Two-level column keys, (model, field), for every model's chosen letter and
# correctness flag in the combined frame.
llm_models = list(all_llm_answers.keys())
norm_choice_cols = [(model, 'normalized_choice_letter') for model in llm_models]
norm_correct_cols = [(model, 'correct') for model in llm_models]
norm_choice_cols

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise agreement between models: the fraction of rows on which two models
# picked the same normalised letter. Missing answers are filled with 0 so that
# two missing answers compare as equal.
# Note: this is an agreement rate in [0, 1], not a statistical correlation.
answer_correlations = {}
for (model_base, col_base) in norm_choice_cols:
    base_choices = moe_llm_answers_comb_df[model_base][col_base].fillna(0)
    model_correlations = {}
    for (model_test, col_test) in norm_choice_cols:
        test_choices = moe_llm_answers_comb_df[model_test][col_test].fillna(0)
        # The mean of the boolean mask is the share of matches; unlike
        # value_counts(normalize=True)[True] it yields 0.0 instead of raising
        # KeyError when two models never agree.
        model_correlations[model_test] = (base_choices == test_choices).mean()
    answer_correlations[model_base] = model_correlations

correlation_matrix = pd.DataFrame(answer_correlations)

plt.figure(figsize=(6, 4))
ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=0, vmax=1,
                 cbar_kws={"shrink": 0.5}, fmt='.2f', annot_kws={'size': 7})
plt.xticks(rotation=45, ha='right')
plt.title('Correlation Matrix')
plt.show()

### Run test to see how MOE performs with various combinations of models

In [None]:
# Sanity check: per-model accuracy recomputed from the normalised letters. It should match the
# scores calculated earlier (a lot of reshaping has happened since then).
# It can be used as a baseline to see whether MOE is improving results or not.
# The mean of the boolean mask is used so that a model with zero correct answers scores 0.0
# rather than raising KeyError on a missing `True` bucket.
result_series = pd.Series({
    col[0]: (moe_llm_answers_comb_df[col] == moe_llm_answers_comb_df['bench']['normalized_correct_letter']).mean()
    for col in norm_choice_cols
})

result_series.sort_values(ascending=False)

In [None]:
import numpy as np

def _lookup_model_choice(row, model):
    """Return `model`'s answer letter from `row`, or None when the model is not in the row."""
    if model not in row.index:
        return None
    value = row[model]
    # Rows keyed by (model, 'normalized_choice_letter') yield a sub-Series here;
    # rows with a flat model index yield the letter directly.
    if isinstance(value, pd.Series):
        value = value.get('normalized_choice_letter')
    return value


def get_most_common_or_best_model(row, best_model_rank=None, rng=None):
    """Pick the majority answer letter from one row of model answers.

    Parameters
    ----------
    row : pandas.Series
        One answer letter per model. The index may be flat (model name) or a
        two-level MultiIndex of (model name, 'normalized_choice_letter').
        Missing answers (NaN/None) are ignored.
    best_model_rank : list of str, optional
        Model names ordered from most to least trusted. On a tie, the first
        ranked model whose answer is one of the tied letters decides. Ranked
        models that are absent from `row` are skipped.
    rng : numpy.random.Generator, optional
        Source of randomness for random picks; pass a seeded generator for
        reproducible results. Defaults to the global `np.random` state.

    Returns
    -------
    str
        The most common letter. Ties are broken by `best_model_rank` when it
        resolves them, otherwise at random among the tied letters. If every
        answer is missing, a random letter from A-D is returned.
    """
    choose = np.random.choice if rng is None else rng.choice

    # value_counts drops missing values, so an empty result means no model answered.
    value_counts = row.value_counts()
    if value_counts.empty:
        return choose(['A', 'B', 'C', 'D'])

    max_count = value_counts.max()
    most_common_choices = value_counts[value_counts == max_count].index.tolist()
    if len(most_common_choices) == 1:
        return most_common_choices[0]

    if best_model_rank is not None:
        for best_model in best_model_rank:
            choice = _lookup_model_choice(row, best_model)
            if choice in most_common_choices:
                return choice

    # Either no ranking was supplied or no ranked model voted for a tied letter:
    # fall back to a random pick instead of implicitly returning None.
    return choose(most_common_choices)

# # Remove '['normalized_choice_letter']' from return function to run mock test below        
# model_data_test = pd.DataFrame([
#     {'One': 'A', 'Two': 'A', 'Three': 'B', 'Four': 'B'},
#     {'One': 'C', 'Two': 'B', 'Three': 'C', 'Four': 'B'},
#     {'One': 'A', 'Two': 'B', 'Three': 'C', 'Four': 'D'},
#     ])

# best_model_rank = ['Four', 'Three', 'Two', 'One']
# most_common_answer = model_data_test.apply(axis='columns', 
#     func=lambda x: get_most_common_or_best_model(x, best_model_rank), result_type='expand')

# most_common_answer

In [None]:
# Print one example prompt (a bridge-and-torch puzzle) to show the layout of a
# question: QUESTION text, lettered ANSWERS, and the JSON answer instruction.
print("QUESTION\nA group of four people needs to cross a bridge at night. The bridge is very old and rickety. They have only one torch and because it's night-time, the torch is necessary to cross the bridge. Each person walks at a different speed: - A takes 1 minute to cross, - B takes 2 minutes, - C takes 5 minutes, and - D takes 10 minutes. What is the fastest time they can all get across the bridge?\n\nANSWERS\nA. 17 minutes\nB. 14 minutes\nC. There is not enough information to determine the fastest time.\nD. 10 minutes\n\nProvide an explanation for your thinking and then select a single choice from ANSWERS that answer the QUESTION. Return in JSON format, for example:\n{\"ANSWER\": \"A\"}\n")

In [None]:
# Models that take part in the vote below; uncomment an entry to include it.
subset_of_models = [
    # 'Meta-Llama-3-1-405B-Instruct-jjo_eastus_models_ai_azure_com',
    # 'claude-3-5-sonnet-20240620',
    'claude-3-opus-20240229',
    'o1-preview',
    # 'gemini-1_5-pro',
    # 'gpt-4o',
    # 'gpt-4-turbo-preview',
]

# Keep only the (model, field) column keys that belong to the selected models.
subset_norm_choice_cols = [
    (model, field) for (model, field) in norm_choice_cols if model in subset_of_models
]
subset_norm_correct_cols = [
    (model, field) for (model, field) in norm_correct_cols if model in subset_of_models
]

# subset_norm_choice_cols = norm_choice_cols
# subset_norm_correct_cols = norm_correct_cols

In [None]:
# Per-question correctness for the selected models.
# Rows are grouped by question text; each model's 'correct' flag is averaged
# within the group and the smallest ('bench', 'index') of the group is kept,
# then used to order the rows and as the resulting index.
question_correct_df = (
    moe_llm_answers_comb_df.reset_index()[subset_norm_correct_cols + [('bench', 'question'), ('bench', 'index')]]
    .groupby([('bench', 'question')])
    .agg({
        **{(col, 'correct') : 'mean' for col in subset_of_models},
        ('bench', 'index'): 'min',
    })
    .sort_values(('bench', 'index'))
    .set_index(('bench', 'index'))
)
# Shift the index by one so numbering starts at 1 instead of 0.
question_correct_df.index += 1

# Heatmap with one row per model and one column per question (note the
# transpose), drawn with the reversed grey palette and no colour bar.
plt.figure(figsize=(6, 4))
cmap = sns.color_palette("Greys_r", as_cmap=True)
sns.heatmap(question_correct_df.T, cmap=cmap, cbar=False, linewidths=1, linecolor='grey')
#plt.xticks(rotation=0, ha='right')
# Trailing None keeps the heatmap's return value from being echoed as cell output.
None

In [None]:
# Show the per-question mean correctness table that backs the heatmap.
question_correct_df

In [None]:

# Majority vote over the selected models.
# best_model_rank: optional tie-break order (best model first); None means
# ties are broken at random by get_most_common_or_best_model.
best_model_rank = None #list(result_series.sort_values(ascending=False).index.values)

# Per row: how many of the selected models chose each letter.
answer_values = moe_llm_answers_comb_df.apply(axis='columns',
    func=lambda x: x[subset_norm_choice_cols].value_counts())
# Per row: the letter the vote settles on.
most_common_answer = moe_llm_answers_comb_df.apply(axis='columns',
    func=lambda x: get_most_common_or_best_model(x[subset_norm_choice_cols], best_model_rank), result_type='expand')
# Per row: the number of votes received by the leading letter.
largest_common_answer = moe_llm_answers_comb_df.apply(axis='columns',
    func=lambda x: x[subset_norm_choice_cols].value_counts().max())
answer_values['correct_letter'] = moe_llm_answers_comb_df['bench']['normalized_correct_letter']
answer_values['most_common_answer'] = most_common_answer
answer_values['largest_common_answer'] = largest_common_answer
display(answer_values)

# Share of rows where the voted letter equals the correct letter (a fraction
# in [0, 1]); it is formatted as a percentage so the printed value matches
# its label.
correct = (answer_values['correct_letter'] == answer_values['most_common_answer']).mean()
print(f'Score: {correct:.2%}')

In [None]:
# Accuracy of the vote as a function of how many models backed the winning
# letter ('largest_common_answer').
answer_values['correct'] = (answer_values['correct_letter'] == answer_values['most_common_answer']).astype(int)
accuracy_by_vote_agg = answer_values[['largest_common_answer', 'correct']].groupby('largest_common_answer')
# First chart: mean correctness per vote size.
accuracy_by_vote_agg.mean().plot(kind='bar', title='Accuracy by Most Common Answer')
# Second chart: number of rows per vote size. It gets its own title so the
# two charts are not mistaken for one another.
accuracy_by_vote_agg.count().plot(kind='bar', title='Number of Questions by Most Common Answer')
print(f'Highest accuracy: {accuracy_by_vote_agg.mean().max().values[0]:.2%}')

In [None]:
from itertools import combinations

def all_combinations(arr):
    """Return every combination of at least two items from `arr`.

    Combinations are listed as tuples, grouped by size from 2 up to
    len(arr), in the order produced by itertools.combinations. Inputs with
    fewer than two items give an empty list.
    """
    return [
        combo
        for size in range(2, len(arr) + 1)
        for combo in combinations(arr, size)
    ]

# Every group of two or more models; the list grows exponentially with the
# number of models.
all_combos = all_combinations(llm_models)
all_combos

In [None]:
# Deliberate stop so that "Run All" halts here. A bare `raise` with no active
# exception fails with the opaque "No active exception to reraise"; raising the
# same exception type with a message makes the intent clear.
# NOTE(review): presumably this guards the exhaustive combination sweep below - confirm.
raise RuntimeError('Execution stopped on purpose: run the cells below manually.')

In [None]:
from tqdm import tqdm

# Score the majority vote for every combination of models.
# Loop-local names are used on purpose so this sweep does not overwrite
# `subset_norm_choice_cols`, `answer_values`, `most_common_answer` or `correct`
# from the single-subset cells above. Only the voted letter is computed per
# combination; the per-letter counts and vote sizes are not needed for the score.
# Ties are broken at random (no ranking is passed), so scores can vary slightly
# between runs.
bench_correct_letter = moe_llm_answers_comb_df['bench']['normalized_correct_letter']

all_combo_results = {}
for models in tqdm(all_combos):
    combo_choice_cols = [col for col in norm_choice_cols if col[0] in models]
    combo_vote = moe_llm_answers_comb_df.apply(axis='columns',
        func=lambda x: get_most_common_or_best_model(x[combo_choice_cols]), result_type='expand')
    # Fraction of rows on which the voted letter is the correct one.
    all_combo_results[models] = (bench_correct_letter == combo_vote).mean()

In [None]:
# Walk the results in insertion order and report every combination that beats
# the best score seen so far; the last line printed is the overall best.
# Scores are fractions in [0, 1], so they are formatted as percentages.
max_score = 0
for (models, score) in all_combo_results.items():
    if score > max_score:
        max_score = score
        print(f'Models: {models} | Score: {score:.2%}')

In [None]:
# Tabulate every combination's score, best first (one row per combination
# after the transpose). The commented-out tail would export the table to CSV.
pd.DataFrame(all_combo_results, index=['score']).T.sort_values(by='score', ascending=False)#.to_csv('./all_combo_results.csv')