In [76]:
import numpy as np
import json

In [73]:
def load_json_file(file_path):
    """Read and parse a JSON file, returning ``None`` when it cannot be loaded.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` if the file does not exist or
        does not contain valid JSON. An error line is printed in both cases.
    """
    try:
        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None
    
# Every benchmark ships two files: the per-entry results and the matching
# resilience metrics. Each pair is loaded together, results first.
com2sense_results, com2sense_resilience = (
    load_json_file('data/com2sense_results.json'),
    load_json_file('data/com2sense_resilience.json'),
)

cause_effect_results, cause_effect_resilience = (
    load_json_file('data/cause_effect_results.json'),
    load_json_file('data/cause_effect_resilience.json'),
)

arithmetic_results, arithmetic_resilience = (
    load_json_file('data/arithmetic_results.json'),
    load_json_file('data/arithmetic_resilience.json'),
)

web_results, web_resilience = (
    load_json_file('data/web_of_lies_results.json'),
    load_json_file('data/web_of_lies_resilience.json'),
)

In [83]:
# Score each predicted answer against its true answer; the resulting 0/1
# lists are the inputs for the Pearson coefficient computed later on.
models_to_evaluate = [
    'gpt-3.5-turbo',
    'gpt-4-turbo',
    'gemini-1.5-flash',
]
# Stride used when walking a results list: the scoring loops read only the
# first len(models_to_evaluate) entries of every STEP_SIZE-long group.
# NOTE(review): the factor 3 presumably reflects three records per model for
# each question in the results files -- confirm against the data format.
STEP_SIZE = 3 * len(models_to_evaluate)

def initialize_answers(models=('gpt-3.5-turbo', 'gpt-4-turbo', 'gemini-1.5-flash')):
    """Create an empty per-model answer table.

    Args:
        models: Iterable of model names to create entries for. Defaults to
            the three models evaluated in this notebook, so zero-argument
            calls return the same keys as before.

    Returns:
        A dict mapping each model name to its own new, empty list.
    """
    return {model: [] for model in models}

def _true_false_correct(entry):
    """Check a true/false prediction against ``entry['true_answer']``.

    The prediction is read as True when its text contains 'true'
    (case-insensitive), otherwise as False.
    """
    parsed_pred = 'true' in entry['pred_answer'].lower()
    return entry['true_answer'] == parsed_pred


def _yes_no_correct(entry):
    """Check a yes/no prediction against ``entry['true_answer']``.

    The prediction is read as True when its text contains 'yes'
    (case-insensitive). A missing (None) prediction is read as False; the
    None test happens before ``.lower()`` so it can actually take effect.
    """
    pred = entry['pred_answer']
    parsed_pred = pred is not None and 'yes' in pred.lower()
    return entry['true_answer'] == parsed_pred


def _arithmetic_correct(entry):
    """Check an integer prediction against ``entry['true_answer']``.

    '.' and ',' are removed from the prediction text before ``int()`` is
    applied. A prediction that is not a string or cannot be parsed counts as
    incorrect.
    """
    try:
        # NOTE(review): removing '.' also turns a decimal such as '3.5' into
        # 35 -- confirm predictions are always whole numbers.
        parsed_pred = int(entry['pred_answer'].replace('.', '').replace(',', ''))
    except (AttributeError, ValueError):
        # Only parse failures are treated as a wrong answer; other errors
        # (e.g. a missing key) are no longer silently swallowed.
        return False
    return entry['true_answer'] == parsed_pred


def _score_results(results, is_correct):
    """Flag scored entries with 'correct' and collect 0/1 answers per model.

    Walks ``results`` in strides of STEP_SIZE and scores the first
    len(models_to_evaluate) entries of each stride; the entry at offset j is
    attributed to models_to_evaluate[j]. Each scored entry gets a 'correct'
    key written in place.

    Returns:
        Dict mapping model name to a list of 0/1 correctness values.
    """
    answers = initialize_answers()
    for start in range(0, len(results), STEP_SIZE):
        for offset, model in enumerate(models_to_evaluate):
            entry = results[start + offset]
            entry['correct'] = is_correct(entry)
            answers[model].append(int(entry['correct']))
    return answers


# com2sense
answers_com2sense = _score_results(com2sense_results, _true_false_correct)

# cause and effect
answers_cause_effect = _score_results(cause_effect_results, _true_false_correct)

# web of lies
answers_web = _score_results(web_results, _yes_no_correct)

# arithmetic
answers_arithmetic = _score_results(arithmetic_results, _arithmetic_correct)

In [25]:
def calculate_quality(correctness_list, relevance_list, alignment):
    """Summarise one entry's annotations as three quality scores.

    Args:
        correctness_list: Annotation labels; items equal to 'correct' count.
        relevance_list: Annotation labels; items equal to 'relevant' count.
        alignment: Text flag; 'yes' in any letter case means aligned.

    Returns:
        Tuple ``(correctness, relevance, alignment)`` where the first two are
        the fraction of matching labels in each list and the last is 1 or 0.

    Raises:
        ZeroDivisionError: If either annotation list is empty.
    """
    hits = correctness_list.count('correct')
    correctness_share = hits / len(correctness_list)

    hits = relevance_list.count('relevant')
    relevance_share = hits / len(relevance_list)

    if alignment.lower() == 'yes':
        alignment_score = 1
    else:
        alignment_score = 0
    return correctness_share, relevance_share, alignment_score

In [30]:
def calc_qualities_by_model(results_data, resilience_data):
    """Collect per-model quality scores and resilience metrics.

    Args:
        results_data: Indexable sequence of result dicts. Entries are read in
            groups of ``3 * number_of_models`` and only the first
            ``number_of_models`` entries of each group are scored. Each
            scored entry must provide 'model', 'correctness_annotations',
            'relevance_annotations' and 'answer_in_alignment'.
        resilience_data: Iterable of dicts providing 'model' and
            'resilience_metric'.

    Returns:
        Dict mapping each evaluated model name to a dict with the lists
        'correctness_quality', 'relevance_quality', 'alignment_quality' and
        'resilience', filled in input order.

    Raises:
        KeyError: If an entry lacks a required key or names a model outside
            the evaluated set.
    """
    models = ['gpt-3.5-turbo', 'gpt-4-turbo', 'gemini-1.5-flash']
    qualities_by_model = {
        model: {
            'correctness_quality': [],
            'relevance_quality': [],
            'alignment_quality': [],
            'resilience': []
        }
        for model in models
    }

    # NOTE(review): the factor 3 presumably matches three records per model
    # for each question in the results files -- confirm against the data.
    step = len(models) * 3
    for start in range(0, len(results_data), step):
        for offset in range(len(models)):
            original_entry = results_data[start + offset]
            model = original_entry['model']
            correctness_quality, relevance_quality, alignment_quality = calculate_quality(
                original_entry['correctness_annotations'],
                original_entry['relevance_annotations'],
                original_entry['answer_in_alignment'],
            )

            scores = qualities_by_model[model]
            scores['correctness_quality'].append(correctness_quality)
            scores['relevance_quality'].append(relevance_quality)
            scores['alignment_quality'].append(alignment_quality)

    # Each resilience entry names its own model, so a plain pass over the
    # list records every entry. Index-based grouping added nothing and raised
    # IndexError when the length was not a multiple of the model count.
    for entry in resilience_data:
        model = entry['model']
        resilience_metric = entry['resilience_metric']
        qualities_by_model[model]['resilience'].append(resilience_metric)
    return qualities_by_model

In [None]:
# Build the per-model quality tables, one benchmark at a time.
qualities_by_model_com2sense = calc_qualities_by_model(
    results_data=com2sense_results,
    resilience_data=com2sense_resilience,
)
qualities_by_model_cause_effect = calc_qualities_by_model(
    results_data=cause_effect_results,
    resilience_data=cause_effect_resilience,
)
qualities_by_model_arithmetic = calc_qualities_by_model(
    results_data=arithmetic_results,
    resilience_data=arithmetic_resilience,
)
qualities_by_model_web = calc_qualities_by_model(
    results_data=web_results,
    resilience_data=web_resilience,
)

0.9954166666666666

In [39]:
def print_averages(qualities_by_model, models_to_evaluate):
    """Print the arithmetic mean of every recorded metric for each model.

    Args:
        qualities_by_model: Dict mapping model name to a dict holding the
            lists 'correctness_quality', 'relevance_quality',
            'alignment_quality' and 'resilience'.
        models_to_evaluate: Model names to report, in print order.

    Raises:
        ZeroDivisionError: If one of the metric lists is empty.
    """
    metric_names = (
        'correctness_quality',
        'relevance_quality',
        'alignment_quality',
        'resilience',
    )
    for model in models_to_evaluate:
        print('Model:', model)
        for metric in metric_names:
            values = qualities_by_model[model][metric]
            print(metric + ' average:', sum(values) / len(values))

# Same model list as defined earlier in the notebook, repeated in this cell.
models_to_evaluate = ['gpt-3.5-turbo', 'gpt-4-turbo', 'gemini-1.5-flash']

# The reports for com2sense, cause_effect and arithmetic below are disabled;
# only the web of lies averages are printed by this cell.
# print('Printing results for com2sense:')
# print_averages(qualities_by_model_com2sense, models_to_evaluate)

# print('\nPrinting results for cause_effect:')
# print_averages(qualities_by_model_cause_effect, models_to_evaluate)

# print('\nPrinting results for arithmetic:')
# print_averages(qualities_by_model_arithmetic, models_to_evaluate)

print('\nPrinting results for web of lies:')
print_averages(qualities_by_model_web, models_to_evaluate)


Printing results for web of lies:
Model: gpt-3.5-turbo
correctness_quality average: 0.8767976190476188
relevance_quality average: 0.9936666666666666
alignment_quality average: 0.78
resilience average: 0.8448713159462602
Model: gpt-4-turbo
correctness_quality average: 0.9943333333333332
relevance_quality average: 1.0
alignment_quality average: 1.0
resilience average: 0.9760852448744575
Model: gemini-1.5-flash
correctness_quality average: 0.9919999999999999
relevance_quality average: 1.0
alignment_quality average: 1.0
resilience average: 0.9780006085616687


In [None]:
# Pearson correlation (not used)
def correlations(qualities_by_model, answers, models=None):
    """Print, per model, the Pearson correlation of correctness quality with answers.

    Args:
        qualities_by_model: Dict mapping model name to a dict that holds a
            'correctness_quality' list.
        answers: Dict mapping model name to a list of 0/1 answer-correctness
            values, the same length as the model's quality list.
        models: Model names to report. Defaults to the notebook-level
            ``models_to_evaluate`` when omitted.

    Note:
        ``np.corrcoef`` yields ``nan`` (with a RuntimeWarning) when either
        series has zero variance, e.g. when every value is identical.
    """
    if models is None:
        models = models_to_evaluate
    for model in models:
        print('Model:', model)
        correctness = np.array(qualities_by_model[model]['correctness_quality'])
        ans = np.array(answers[model])

        # The label says "Correctness", so correlate the correctness scores;
        # the relevance array was passed here by mistake.
        print('Correctness coefficient:', np.corrcoef(correctness, ans)[0, 1])

# Correlation report for the com2sense benchmark only.
correlations(qualities_by_model_com2sense, answers_com2sense)

  c /= stddev[:, None]
  c /= stddev[None, :]


array([[ 1., nan],
       [nan, nan]])