In [None]:
import util
from tqdm import tqdm
import preprocess
import prompt
import postprocess
import evaluation
import visualization
import json
import os
from collections import Counter
import random
import numpy as np

In [None]:
# Shared inputs used by every experiment cell below.
# Image paths matching the case*.png pattern in the evaluation dataset.
image_filenames = preprocess.get_all_image_paths('evaluation_dataset/images/case*.png')
# Three variants of the question list; which one is used depends on the experiment.
questions = util.load_json('evaluation_dataset/questions.json')
questions_with_choices = util.load_json('evaluation_dataset/questions_with_choices.json')
questions_with_choices_explained = util.load_json('evaluation_dataset/questions_with_choices_explained.json')
# Only the first element of examples.json is kept; later cells index it by
# 'fewshots', 'cot', 'debate' and 'critique'.
examples = util.load_json('evaluation_dataset/examples.json')[0]
# Passed to evaluation.compute_feature_accuracies when scoring an experiment.
with open("evaluation_dataset/choices_per_feature.json", "r") as f:
    choices_per_feature_freq = json.load(f)
# Passed to the visualization helpers in the plotting cells.
with open("evaluation_dataset/class_support.json", "r") as f:
    class_support = json.load(f)

In [None]:
def analysis_pipeline(experiment_name: str):
    """
    Score one experiment's extracted answers against the reference annotations.

    The reference file ``evaluation_dataset/doc_anns.json`` and the
    experiment's ``gpt_anns.json`` are handed to
    ``postprocess.extract_annotations`` together with the path
    ``anns_comparison.xlsx`` inside the experiment folder. Per-feature scores
    are then computed with ``average='weighted'`` and saved as
    ``features_acc.json`` in the same folder. Nothing is plotted here.

    The module-level ``choices_per_feature_freq`` must already be loaded.

    Parameters
    ----------
    experiment_name:
        Folder name under ``evaluation_results`` that already contains
        ``gpt_anns.json``.
    """
    reference_path = 'evaluation_dataset/doc_anns.json'
    prediction_path = f'evaluation_results/{experiment_name}/gpt_anns.json'
    comparison_path = f'evaluation_results/{experiment_name}/anns_comparison.xlsx'
    scores_path = f'evaluation_results/{experiment_name}/features_acc.json'

    doc_anns, gpt_anns, features = postprocess.extract_annotations(
        reference_path, prediction_path, comparison_path
    )
    features_acc = evaluation.compute_feature_accuracies(
        doc_anns,
        gpt_anns,
        features,
        choices_per_feature_freq,
        average='weighted',
    )
    util.save_as_json(features_acc, scores_path)

# Zeroshot

In [None]:
# Folder name under evaluation_results/ used by the following Zeroshot cells.
exp_name_bs = 'Zeroshot'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# One zero-shot response per image, in image_filenames order.
ann_bs = [prompt.zeroshot(img, query) for img in tqdm(image_filenames)]
util.save_as_json(ann_bs, f'evaluation_results/{exp_name_bs}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_bs = util.load_json(f'evaluation_results/{exp_name_bs}/annotations.json')
answers_bs = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_bs]
util.save_as_json(answers_bs, f'evaluation_results/{exp_name_bs}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_bs)

# Zeroshot_Grok

In [None]:
# Folder name under evaluation_results/ used by the following Zeroshot_Grok cells.
# NOTE(review): reuses the variable name of the Zeroshot section, so cell execution order matters.
exp_name_bs = 'Zeroshot_Grok'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# One response per image from prompt.zeroshot_grok, in image_filenames order.
ann_bs = [prompt.zeroshot_grok(img, query) for img in tqdm(image_filenames)]
util.save_as_json(ann_bs, f'evaluation_results/{exp_name_bs}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
# The output keeps the file name gpt_anns.json because analysis_pipeline reads that name.
anns_bs = util.load_json(f'evaluation_results/{exp_name_bs}/annotations.json')
answers_bs = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_bs]
util.save_as_json(answers_bs, f'evaluation_results/{exp_name_bs}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_bs)

# Zeroshot_Claude

In [None]:
# Folder name under evaluation_results/ used by the following Zeroshot_Claude cells.
# NOTE(review): reuses the variable name of the Zeroshot section, so cell execution order matters.
exp_name_bs = 'Zeroshot_Claude'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# One response per image from prompt.zeroshot_claude, in image_filenames order.
ann_bs = [prompt.zeroshot_claude(img, query) for img in tqdm(image_filenames)]
util.save_as_json(ann_bs, f'evaluation_results/{exp_name_bs}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
# The output keeps the file name gpt_anns.json because analysis_pipeline reads that name.
anns_bs = util.load_json(f'evaluation_results/{exp_name_bs}/annotations.json')
answers_bs = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_bs]
util.save_as_json(answers_bs, f'evaluation_results/{exp_name_bs}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_bs)

# Zeroshot Var

In [None]:
# Folder name under evaluation_results/ for the repeated zero-shot runs.
exp_name_var = 'Zeroshot_Var'
# Number of repetitions; run index i is appended to every per-run file name.
exp_num = 5

In [None]:
# Same query for every repetition; only the model's responses can differ between runs.
query = ' '.join(map(str, questions_with_choices))
for run in range(exp_num):
    # One zero-shot response per image for this repetition.
    ann_var = [prompt.zeroshot(img, query) for img in tqdm(image_filenames)]
    util.save_as_json(ann_var, f'evaluation_results/{exp_name_var}/annotations{run}.json')

In [None]:
# Per repetition: reload the raw responses, extract the answers, save them as gpt_anns<run>.json.
for run in range(exp_num):
    anns_var = util.load_json(f'evaluation_results/{exp_name_var}/annotations{run}.json')
    answers_var = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_var]
    util.save_as_json(answers_var, f'evaluation_results/{exp_name_var}/gpt_anns{run}.json')

In [None]:
# Score every repetition separately; results go to features_acc<i>.json.
for i in range(exp_num):
    doc_anns, gpt_anns, features = postprocess.extract_annotations(
        'evaluation_dataset/doc_anns.json',
        f'evaluation_results/{exp_name_var}/gpt_anns{i}.json',
        f'evaluation_results/{exp_name_var}/annotations_comparison{i}.xlsx'
    )

    # NOTE(review): average='macro' here, while analysis_pipeline uses average='weighted'
    # and the other repeated-run cells omit the argument — confirm this difference is intended.
    features_acc = evaluation.compute_feature_accuracies(doc_anns, gpt_anns, features, choices_per_feature_freq, average='macro')
    util.save_as_json(features_acc, f'evaluation_results/{exp_name_var}/features_acc{i}.json')

# Zeroshot Var Explained

In [None]:
# Folder name under evaluation_results/ for the repeated zero-shot runs with explained choices.
# NOTE(review): reuses exp_name_var / exp_num from the Zeroshot Var section, so cell order matters.
exp_name_var = 'Zeroshot_Var_Explained'
# Number of repetitions; run index i is appended to every per-run file name.
exp_num = 5

In [None]:
# This variant builds the query from questions_with_choices_explained.
query = ' '.join(map(str, questions_with_choices_explained))
for run in range(exp_num):
    # One zero-shot response per image for this repetition.
    ann_var = [prompt.zeroshot(img, query) for img in tqdm(image_filenames)]
    util.save_as_json(ann_var, f'evaluation_results/{exp_name_var}/annotations{run}.json')

In [None]:
# Per repetition: reload the raw responses, extract the answers, save them as gpt_anns<run>.json.
# NOTE(review): extraction uses questions_with_choices although the prompts were built from
# questions_with_choices_explained — confirm extract_answers does not need the explained list.
for run in range(exp_num):
    anns_var = util.load_json(f'evaluation_results/{exp_name_var}/annotations{run}.json')
    answers_var = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_var]
    util.save_as_json(answers_var, f'evaluation_results/{exp_name_var}/gpt_anns{run}.json')

In [None]:
# Score every repetition separately; results go to features_acc<i>.json.
for i in range(exp_num):
    doc_anns, gpt_anns, features = postprocess.extract_annotations(
        'evaluation_dataset/doc_anns.json',
        f'evaluation_results/{exp_name_var}/gpt_anns{i}.json',
        f'evaluation_results/{exp_name_var}/annotations_comparison{i}.xlsx'
    )

    # NOTE(review): no `average` argument here, whereas the Zeroshot Var cell passes 'macro'
    # and analysis_pipeline passes 'weighted' — confirm which default applies and that it is intended.
    features_acc = evaluation.compute_feature_accuracies(doc_anns, gpt_anns, features, choices_per_feature_freq)
    util.save_as_json(features_acc, f'evaluation_results/{exp_name_var}/features_acc{i}.json')

# Zeroshot Free

In [None]:
# Folder name under evaluation_results/ used by the following Zeroshot_free cells.
exp_name_bsf = 'Zeroshot_free'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# One response per image from prompt.zeroshot_free, in image_filenames order.
ann_bsf = [prompt.zeroshot_free(img, query) for img in tqdm(image_filenames)]
util.save_as_json(ann_bsf, f'evaluation_results/{exp_name_bsf}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_bsf = util.load_json(f'evaluation_results/{exp_name_bsf}/annotations.json')
answers_bsf = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_bsf]
util.save_as_json(answers_bsf, f'evaluation_results/{exp_name_bsf}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_bsf)

# Fewshots

In [None]:
# Folder name under evaluation_results/ used by the following Fewshots cells.
exp_name_fs = 'Fewshots'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# All few-shot examples concatenated into a single space-separated string.
fewshot_ex = ' '.join(examples['fewshots'])
# One few-shot response per image, in image_filenames order.
ann_fs = [prompt.fewshots(img, query, fewshot_ex) for img in tqdm(image_filenames)]
util.save_as_json(ann_fs, f'evaluation_results/{exp_name_fs}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_fs = util.load_json(f'evaluation_results/{exp_name_fs}/annotations.json')
answers_fs = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_fs]
util.save_as_json(answers_fs, f'evaluation_results/{exp_name_fs}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_fs)

# Fewshots with Images

In [None]:
# Folder name under evaluation_results/ used by the following FewshotsImg cells.
exp_name_fsimg = 'FewshotsImg'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# Two fixed demonstrations: case1.png with the first few-shot example and
# case5.png with the second.
# NOTE(review): both demonstration images match the case*.png pattern used for
# image_filenames, so they appear to be evaluated as well — confirm this is intended.
ann_fsimg = [
    prompt.fewshotsImg(
        img,
        query,
        'evaluation_dataset/images/case1.png',
        examples['fewshots'][0],
        'evaluation_dataset/images/case5.png',
        examples['fewshots'][1],
    )
    for img in tqdm(image_filenames)
]
util.save_as_json(ann_fsimg, f'evaluation_results/{exp_name_fsimg}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_fsimg = util.load_json(f'evaluation_results/{exp_name_fsimg}/annotations.json')
answers_fsimg = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_fsimg]
util.save_as_json(answers_fsimg, f'evaluation_results/{exp_name_fsimg}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_fsimg)

# CoT + Fewshots

In [None]:
# Folder name under evaluation_results/ used by the following CoTFewshots cells.
exp_name_cot = 'CoTFewshots'

In [None]:
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# All chain-of-thought examples concatenated into a single space-separated string.
cot_ex = ' '.join(examples['cot'])
# One CoT few-shot response per image, in image_filenames order.
ann_cot = [prompt.CoTFewshots(img, query, cot_ex) for img in tqdm(image_filenames)]
util.save_as_json(ann_cot, f'evaluation_results/{exp_name_cot}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_cot = util.load_json(f'evaluation_results/{exp_name_cot}/annotations.json')
answers_cot = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_cot]
util.save_as_json(answers_cot, f'evaluation_results/{exp_name_cot}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_cot)

# ToT + Fewshots

In [None]:
# Folder name under evaluation_results/ used by the following ToTFewshots cells.
exp_name_tot = 'ToTFewshots'

In [None]:
# Examples are taken from examples['cot'].
# NOTE(review): the ToT run reuses the CoT examples — confirm no dedicated ToT examples exist.
tot_ex = ' '.join(examples['cot'])
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# The positional arguments 3 and 5 are forwarded unchanged to prompt.ToTFewshots.
ann_tot = [prompt.ToTFewshots(img, query, 3, 5, tot_ex) for img in tqdm(image_filenames)]
util.save_as_json(ann_tot, f'evaluation_results/{exp_name_tot}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_tot = util.load_json(f'evaluation_results/{exp_name_tot}/annotations.json')
answers_tot = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_tot]
util.save_as_json(answers_tot, f'evaluation_results/{exp_name_tot}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_tot)

# Self Consistency

In [None]:
# Self-consistency: repeat the CoT few-shot experiment exp_num times and score each run.
exp_name_var = 'Consistency'
exp_num = 5
cot_ex = ' '.join(examples['cot'])
query = ' '.join(map(str, questions_with_choices))

# Stage 1: raw responses, one file per repetition.
for run in range(exp_num):
    ann_var = [prompt.CoTFewshots(img, query, cot_ex) for img in tqdm(image_filenames)]
    util.save_as_json(ann_var, f'evaluation_results/{exp_name_var}/annotations{run}.json')

# Stage 2: extracted answers, one file per repetition.
for run in range(exp_num):
    anns_var = util.load_json(f'evaluation_results/{exp_name_var}/annotations{run}.json')
    answers_var = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_var]
    util.save_as_json(answers_var, f'evaluation_results/{exp_name_var}/gpt_anns{run}.json')

# Stage 3: per-repetition scores (no `average` argument is passed here).
for run in range(exp_num):
    doc_anns, gpt_anns, features = postprocess.extract_annotations(
        'evaluation_dataset/doc_anns.json',
        f'evaluation_results/{exp_name_var}/gpt_anns{run}.json',
        f'evaluation_results/{exp_name_var}/annotations_comparison{run}.xlsx',
    )
    features_acc = evaluation.compute_feature_accuracies(
        doc_anns, gpt_anns, features, choices_per_feature_freq
    )
    util.save_as_json(features_acc, f'evaluation_results/{exp_name_var}/features_acc{run}.json')

In [None]:
# Majority vote over the repeated Consistency runs: for every case and every
# answer position, keep the value that occurs most often across the runs.
num_runs = 5  # number of gpt_anns<j>.json files written by the Consistency runs

# Read each run file once (the previous version re-read all files for every case).
runs = [
    util.load_json(f"evaluation_results/Consistency/gpt_anns{j}.json")
    for j in range(num_runs)
]

final_answer = []
# zip(*runs) yields, per case, the tuple of that case's answer string in every run,
# so the number of cases follows the files instead of a hard-coded 25.
for case_runs in zip(*runs):
    # Each stored answer is a JSON-encoded object.
    case_answers = [json.loads(raw) for raw in case_runs]
    consistent_ans = {}
    for item, key in enumerate(case_answers[0]):
        # Values are matched by position, as before; this relies on every run
        # listing its answers in the same order as the first run.
        values = [list(answer.values())[item] for answer in case_answers]
        # most_common(1) keeps the first-seen value when counts tie.
        value, _ = Counter(values).most_common(1)[0]
        consistent_ans[key] = value
    final_answer.append(json.dumps(consistent_ans))
util.save_as_json(final_answer, 'evaluation_results/Consistency/gpt_anns.json')

In [None]:
# Score the majority-voted gpt_anns.json against doc_anns.json; writes features_acc.json.
analysis_pipeline('Consistency')

# Self-debate

In [None]:
# Folder name under evaluation_results/ used by the following Debate cells.
# NOTE(review): reuses the variable name of the ToT section, so cell execution order matters.
exp_name_tot = 'Debate'

In [None]:
# All debate examples concatenated into a single space-separated string.
tot_ex = ' '.join(examples['debate'])
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# One self-debate response per image, in image_filenames order.
ann_tot = [prompt.self_debate(img, query, tot_ex) for img in tqdm(image_filenames)]
util.save_as_json(ann_tot, f'evaluation_results/{exp_name_tot}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_tot = util.load_json(f'evaluation_results/{exp_name_tot}/annotations.json')
answers_tot = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_tot]
util.save_as_json(answers_tot, f'evaluation_results/{exp_name_tot}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_tot)

# Self-Critique

In [None]:
# Folder name under evaluation_results/ used by the following Critique cells.
# NOTE(review): reuses the variable name of the ToT section, so cell execution order matters.
exp_name_tot = 'Critique'

In [None]:
# All critique examples concatenated into a single space-separated string.
cri_ex = ' '.join(examples['critique'])
# Join the string form of every questions_with_choices entry into one space-separated query.
query = ' '.join(map(str, questions_with_choices))
# One self-critique response per image, in image_filenames order.
ann_tot = [prompt.self_critique(img, query, cri_ex) for img in tqdm(image_filenames)]
util.save_as_json(ann_tot, f'evaluation_results/{exp_name_tot}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_tot = util.load_json(f'evaluation_results/{exp_name_tot}/annotations.json')
answers_tot = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_tot]
util.save_as_json(answers_tot, f'evaluation_results/{exp_name_tot}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_tot)

# Temperature Test

In [None]:
# Temperature sweep: run the CoT few-shot experiment at five evenly spaced
# temperatures between 0 and 0.6 and score each setting in its own folder.
cot_ex = ' '.join(item for item in examples['cot'])
query = ' '.join(str(item) for item in questions_with_choices)

for temp in np.linspace(0, 0.6, 5):
    # linspace yields 0.44999999999999996 for the fourth step, which used to end
    # up verbatim in the folder name. Round to two decimals and convert to a
    # plain float so folders read Temperature_0.0 / 0.15 / 0.3 / 0.45 / 0.6.
    temp = round(float(temp), 2)
    exp_name_temp = f'Temperature_{temp}'

    # Raw responses for this temperature.
    ann_temp = []
    for image_path in tqdm(image_filenames):
        ann_temp.append(prompt.CoTFewshots(image_path, query, cot_ex, temperature=temp))
    util.save_as_json(ann_temp, f'evaluation_results/{exp_name_temp}/annotations.json')

    # Extracted answers, read back from the file that was just written.
    anns_temp = util.load_json(f'evaluation_results/{exp_name_temp}/annotations.json')
    answers_temp = []
    for ann in anns_temp:
        answer_temp = postprocess.extract_answers(questions_with_choices, ann)
        answers_temp.append(answer_temp)
    util.save_as_json(answers_temp, f'evaluation_results/{exp_name_temp}/gpt_anns.json')

    analysis_pipeline(exp_name_temp)

# Order

In [None]:
# Question-order experiment: five CoT few-shot runs, each with a query built
# from the list returned by preprocess.randomized_list.
cot_ex = ' '.join(examples['cot'])
for order_idx in range(5):
    # NOTE(review): the second argument is the constant 13 in every iteration; if it is a
    # random seed, all five runs get the same question order — confirm its meaning.
    shf_qs = preprocess.randomized_list(questions_with_choices, 13)
    query_qs = ' '.join(map(str, shf_qs))
    exp_name_order = f'Order{order_idx}'

    # Raw responses for this ordering.
    anns_order = [prompt.CoTFewshots(img, query_qs, cot_ex) for img in tqdm(image_filenames)]
    util.save_as_json(anns_order, f'evaluation_results/{exp_name_order}/annotations.json')

    # Extraction uses the original (unshuffled) questions_with_choices list.
    anns_order = util.load_json(f'evaluation_results/{exp_name_order}/annotations.json')
    answers_order = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_order]
    util.save_as_json(answers_order, f'evaluation_results/{exp_name_order}/gpt_anns.json')

    analysis_pipeline(exp_name_order)

# Single

In [None]:
# Folder name under evaluation_results/ used by the following Single cells.
exp_name_sg = f'Single'

In [None]:
# Ask every question in its own model call and concatenate the responses per image.
anns_sg = []
cot_ex = ' '.join(examples['cot'])
for image_path in tqdm(image_filenames):
    # Each questions_with_choices entry is passed to the prompt as-is (not str()-joined).
    per_question = [
        prompt.CoTFewshots(image_path, question, cot_ex)
        for question in questions_with_choices
    ]
    anns_sg.append(''.join(per_question))
util.save_as_json(anns_sg, f'evaluation_results/{exp_name_sg}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_sg = util.load_json(f'evaluation_results/{exp_name_sg}/annotations.json')
answers_sg = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_sg]
util.save_as_json(answers_sg, f'evaluation_results/{exp_name_sg}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_sg)

# Leave one out

In [None]:
# Folder name under evaluation_results/ used by the following Leave_one_out cells.
exp_name_loo = f'Leave_one_out'
# Reference annotations, indexed by image position in the generation cell.
doc_anns = util.load_json('evaluation_dataset/doc_anns.json')
# Overwrites the `questions` loaded with util.load_json in the data-loading cell;
# the entries are used below as keys into each reference annotation.
questions = preprocess.extract_questions('evaluation_dataset/questions.json')

In [None]:
# Leave-one-out: for every question, give the model the remaining reference
# answers as context and ask only for the one that was removed.
anns_loo = []
cot_ex = ' '.join(examples['cot'])
for i, image_path in enumerate(image_filenames):
    print(i)
    doc_ann = doc_anns[i]
    responses = []
    for question, qa in tqdm(zip(questions, questions_with_choices)):
        remaining = doc_ann.copy()
        remaining.pop(question)
        # The last remaining entry is always excluded from the context.
        # NOTE(review): presumably that entry is not a regular feature — confirm.
        context_items = list(remaining.items())[:-1]
        information = "\n".join(f"{key}{value}" for key, value in context_items)
        responses.append(prompt.leave_one_out(image_path, information, qa, cot_ex))
    # Responses for all questions of one image are stored as one concatenated string.
    anns_loo.append(''.join(responses))
util.save_as_json(anns_loo, f'evaluation_results/{exp_name_loo}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_loo = util.load_json(f'evaluation_results/{exp_name_loo}/annotations.json')
answers_loo = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_loo]
util.save_as_json(answers_loo, f'evaluation_results/{exp_name_loo}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_loo)

# Leave one out txt

In [None]:
# Folder name under evaluation_results/ used by the following Leave_one_out_txt cells.
# NOTE(review): reuses exp_name_loo from the Leave one out section, so cell order matters.
exp_name_loo = f'Leave_one_out_txt'
# Reference annotations, indexed by image position in the generation cell.
doc_anns = util.load_json('evaluation_dataset/doc_anns.json')
# Entries are used below as keys into each reference annotation.
questions = preprocess.extract_questions('evaluation_dataset/questions.json')

In [None]:
# Text-only leave-one-out: same context construction as the image variant, but
# prompt.leave_one_out_txt is called without the image path.
anns_loo = []
cot_ex = ' '.join(examples['cot'])
for i, image_path in enumerate(image_filenames):
    print(i)
    doc_ann = doc_anns[i]
    responses = []
    for question, qa in tqdm(zip(questions, questions_with_choices)):
        remaining = doc_ann.copy()
        remaining.pop(question)
        # The last remaining entry is always excluded from the context.
        # NOTE(review): presumably that entry is not a regular feature — confirm.
        context_items = list(remaining.items())[:-1]
        information = "\n".join(f"{key}{value}" for key, value in context_items)
        responses.append(prompt.leave_one_out_txt(information, qa, cot_ex))
    # Responses for all questions of one image are stored as one concatenated string.
    anns_loo.append(''.join(responses))
util.save_as_json(anns_loo, f'evaluation_results/{exp_name_loo}/annotations.json')

In [None]:
# Reload the saved raw responses and run answer extraction on each of them.
anns_loo = util.load_json(f'evaluation_results/{exp_name_loo}/annotations.json')
answers_loo = [postprocess.extract_answers(questions_with_choices, raw) for raw in anns_loo]
util.save_as_json(answers_loo, f'evaluation_results/{exp_name_loo}/gpt_anns.json')

In [None]:
# Score the saved gpt_anns.json against doc_anns.json; writes features_acc.json to the experiment folder.
analysis_pipeline(exp_name_loo)

# Correct One Known

In [None]:
# Folder name under evaluation_results/ used by the following Correct_one_known cells.
exp_name_cone = f'Correct_one_known'
# Reference annotations, indexed by image position in the experiment cell.
doc_anns = util.load_json('evaluation_dataset/doc_anns.json')
# Entries are used below as keys into each reference annotation.
questions = preprocess.extract_questions('evaluation_dataset/questions.json')
# Looked up by question text with newlines removed; the result is passed to util.random_answer.
qa_pairs = util.load_json('evaluation_dataset/questions_with_choices_correct_mistakes.json')
# Per-question counters {'total': ..., 'correct': ...} filled by the experiment cell.
correction_rec = {}

In [None]:
# Correct-one-known: replace one reference answer with a random wrong choice,
# tell the model which question is wrong, and count how often its corrected
# answer equals the reference answer.

# Upper bound on model calls per (image, question). The previous unbounded
# `while response == {}` loop could call the model forever when no answer was
# ever extracted.
MAX_ATTEMPTS = 10

# Reset in this cell so that re-running it does not double-count the totals.
correction_rec = {}
correction_attempts = []
cot_ex = ' '.join(item for item in examples['cot'])
anns_cone = []
for i, image_path in enumerate(image_filenames):
    print(i)
    doc_ann = doc_anns[i]
    for question in tqdm(questions):
        question_stats = correction_rec.setdefault(question, {'total': 0, 'correct': 0})
        question_stats['total'] += 1

        doc_ann_copy = doc_ann.copy()

        # Swap the reference answer for a randomly chosen replacement.
        original_answer = doc_ann_copy[question]
        fake_ans = util.random_answer(original_answer, qa_pairs[question.replace('\n', '')])
        doc_ann_copy[question] = fake_ans

        information = "\n".join(f"{key} {value}" for key, value in doc_ann_copy.items())

        # Retry until an answer can be extracted, at most MAX_ATTEMPTS times.
        # Every raw response, including failed attempts, is kept in anns_cone.
        response = {}
        for _ in range(MAX_ATTEMPTS):
            ans = prompt.correct_one_mistake_known(image_path, information, question, cot_ex)
            anns_cone.append(ans)
            response = postprocess.extract_answers_specific_question(questions_with_choices, ans, question)
            response = postprocess.process_response(response)
            if response != {}:
                break
        print(response)
        # None when no attempt produced an answer; this counts as "not corrected".
        corrected_answer = next(iter(response.values()), None)

        if corrected_answer == original_answer:
            question_stats['correct'] += 1

        correction_attempts.append(f"Question: {question}, Correct answer: {original_answer}, Fake answer: {fake_ans}, Answer after GPT correction: {corrected_answer}")


util.save_as_json(correction_attempts, f'evaluation_results/{exp_name_cone}/correction_attempts.json')
util.save_as_json(correction_rec, f'evaluation_results/{exp_name_cone}/correction_rec.json')
util.save_as_json(anns_cone, f'evaluation_results/{exp_name_cone}/annotations.json')

In [None]:
# Turn the saved per-question total/correct counters into scores and store them as
# features_acc.json in the experiment folder.
correction_result = util.load_json(f'evaluation_results/{exp_name_cone}/correction_rec.json')
acc_after_corr = evaluation.compute_feature_acc_from_dict(correction_result)
util.save_as_json(acc_after_corr, f'evaluation_results/{exp_name_cone}/features_acc.json')

# Correct One Unknown

In [None]:
from evaluation import sparse_embed, sparse_cosine_similarity
# Cache of sparse embeddings keyed by the embedded text, shared across is_match calls.
embedded = {}
def is_match(pred: str, true: str) -> bool:
    """
    Decide whether two texts match by sparse cosine similarity.

    Each text is embedded with ``sparse_embed`` the first time it is seen and
    the result is kept in the module-level ``embedded`` cache.

    Parameters
    ----------
    pred:
        The predicted text.
    true:
        The reference text.

    Returns
    -------
    bool
        True when the similarity of the two embeddings is at least 0.8.
    """
    for text in (pred, true):
        if text not in embedded:
            embedded[text] = sparse_embed(text)
    similarity = sparse_cosine_similarity(embedded[pred], embedded[true])
    return similarity >= 0.8

In [None]:
# Folder name under evaluation_results/ used by the following Correct_one_unknown cells.
# NOTE(review): reuses exp_name_cone from the Correct One Known section, so cell order matters.
exp_name_cone = f'Correct_one_unknown'
# Reference annotations, indexed by image position in the experiment cell.
doc_anns = util.load_json('evaluation_dataset/doc_anns.json')
# Entries are used below as keys into each reference annotation.
questions = preprocess.extract_questions('evaluation_dataset/questions.json')
# Looked up by question text with newlines removed; the result is passed to util.random_answer.
qa_pairs = util.load_json('evaluation_dataset/questions_with_choices_correct_mistakes.json')
# Per-question counters for answer correction, filled by the experiment cell.
correction_rec = {}
# Per-question counters for locating the altered question, filled by the experiment cell.
questions_loc = {}

In [None]:
# Correct-one-unknown: replace one reference answer with a random wrong choice
# WITHOUT telling the model which question is affected. Two things are counted
# per question: whether the model pointed at the altered question, and whether
# its answer for that question equals the reference answer.

# Reset in this cell so that re-running it does not double-count the totals.
correction_rec = {}
questions_loc = {}
correction_attempts = []
anns_cone = []
identified_questions = []
cot_ex = ' '.join(item for item in examples['cot'])

for i, image_path in enumerate(image_filenames):
    print(i)
    doc_ann = doc_anns[i]
    for question in tqdm(questions):
        correction_stats = correction_rec.setdefault(question, {'total': 0, 'correct': 0})
        correction_stats['total'] += 1

        location_stats = questions_loc.setdefault(question, {'total': 0, 'correct': 0})
        location_stats['total'] += 1

        doc_ann_copy = doc_ann.copy()

        # Swap the reference answer for a randomly chosen replacement.
        original_answer = doc_ann_copy[question]
        fake_ans = util.random_answer(original_answer, qa_pairs[question.replace('\n', '')])
        doc_ann_copy[question] = fake_ans

        information = "\n".join(f"{key} {value}" for key, value in doc_ann_copy.items())
        ans = prompt.correct_one_mistake_unknown(image_path, information, cot_ex)
        anns_cone.append(ans)
        response = postprocess.extract_answers_specific_question(questions_with_choices, ans, question)
        response = postprocess.process_response(response)
        identified_question = postprocess.question_locate(questions_with_choices, ans)

        identified_questions.append(identified_question)
        # NOTE(review): assumes question_locate always returns a string that sparse_embed accepts.
        if is_match(identified_question, question):
            location_stats['correct'] += 1

        print(response)
        # The model may correct a different question, leaving nothing extracted for
        # this one. Use None instead of letting next() raise StopIteration, so the
        # attempt is recorded as "not corrected" and the run continues.
        corrected_answer = next(iter(response.values()), None)

        if corrected_answer == original_answer:
            correction_stats['correct'] += 1

        correction_attempts.append(f"Question: {question}, Correct answer: {original_answer}, Fake answer: {fake_ans}, Answer after GPT correction: {corrected_answer}")


util.save_as_json(correction_attempts, f'evaluation_results/{exp_name_cone}/correction_attempts.json')
util.save_as_json(correction_rec, f'evaluation_results/{exp_name_cone}/correction_rec.json')
util.save_as_json(questions_loc, f'evaluation_results/{exp_name_cone}/questions_loc.json')
util.save_as_json(anns_cone, f'evaluation_results/{exp_name_cone}/annotations.json')

In [None]:
# Turn both saved counter files into scores:
# correction counters -> features_acc.json, location counters -> q_id_rate.json.
correction_result = util.load_json(f'evaluation_results/{exp_name_cone}/correction_rec.json')
acc_after_corr = evaluation.compute_feature_acc_from_dict(correction_result)
id_result = util.load_json(f'evaluation_results/{exp_name_cone}/questions_loc.json')
question_identify_rate = evaluation.compute_feature_acc_from_dict(id_result)
util.save_as_json(question_identify_rate, f'evaluation_results/{exp_name_cone}/q_id_rate.json')
util.save_as_json(acc_after_corr, f'evaluation_results/{exp_name_cone}/features_acc.json')

# Visualization

## Majority Baseline Visualization

In [None]:
# Star plot with an empty list of experiment results, so only what the function derives
# from class_support can be drawn; saved as plots/Major.png.
# NOTE(review): the third argument is a plain string here, while the other
# comparison_star_plot calls in this notebook pass a list of names — confirm both are accepted.
visualization.comparison_star_plot([], f'evaluation_results/plots/Major.png', 'Majority Class Baseline', class_support, figsize=(12,10))

## PE Visualization

In [None]:
# Star plot comparing the per-feature scores of the listed experiments.
# Add more folder names to exp_names to overlay several experiments.
exp_names = ['Zeroshot']
features_accs = []
for exp in exp_names:
    file_path = os.path.join(f'evaluation_results/{exp}/', "features_acc.json")
    feature_acc = util.load_json(file_path)
    features_accs.append(feature_acc)
# Plot the scores that were just loaded, labelled with the experiment names.
# The previous call passed None with a misspelled 'Marjority-Class' label and a
# majority-baseline title, so the loaded scores were never drawn and the
# baseline plot was written to the experiment's file name.
visualization.comparison_star_plot(features_accs, f'evaluation_results/plots/{exp_names[0]}.png', exp_names, class_support, figsize=(12,10))

## Var Visualization

Single run (first repetition of Zeroshot_Var)

In [None]:
# Plot the scores of the first Zeroshot_Var repetition (features_acc0.json) to plots/test.png.
# NOTE(review): relies on `exp_names` defined in an earlier cell; the loop variable `exp`
# is not used in the path, so the same file is loaded once per entry of exp_names.
features_accs = []
for exp in exp_names:
    file_path = os.path.join(f'evaluation_results/Zeroshot_Var/', "features_acc0.json")
    feature_acc = util.load_json(file_path)
    features_accs.append(feature_acc)
visualization.comparison_star_plot(features_accs, f'evaluation_results/plots/test.png', exp_names, class_support)

Variance across the five repeated runs

In [None]:
# Collect the per-feature scores of the five Zeroshot_Var_Explained repetitions
# and plot their spread.
feature_accs = []
for i in range(5):
    # The folder name must match the one the experiment wrote to
    # ('Zeroshot_Var_Explained'); the former lowercase 'explained' spelling does
    # not resolve on case-sensitive file systems.
    feature_acc = util.load_json(f'evaluation_results/Zeroshot_Var_Explained/features_acc{i}.json')
    feature_accs.append(feature_acc)
visualization.create_star_variance(feature_accs, f'evaluation_results/plots/Zeroshot_Var_Explained.png', class_support, figsize=(12, 8))

## Predictions Distribution

In [None]:
# Compare the answer distributions of the listed experiments with the reference annotations.
experiment_names = ['Leave_one_out', 'Leave_one_out_txt']
# NOTE(review): doc_anns is loaded here but the call below receives the file path, not this object.
doc_anns = util.load_json('evaluation_dataset/doc_anns.json') # List of 25 dicts
# Output file name is built from the experiment names joined with '+'.
path = f"evaluation_results/question_compare/q_{'+'.join(experiment_names)}.png"
# NOTE(review): the meaning of the positional argument 12 is not visible here — confirm against
# visualization.question_lables_distribution.
visualization.question_lables_distribution('evaluation_dataset/doc_anns.json', experiment_names, 12, path)

# Wrong Question Location

In [None]:
# Compare, for the Correct_one_unknown experiment, how often the altered question
# was located with how often its answer was corrected.
exp_names = ['Wrong Answers Location Accuracy', 'Wrong Answers Correction Accuracy']
file_path = os.path.join(f'evaluation_results/Correct_one_unknown/', "q_id_rate.json")
question_acc = util.load_json(file_path)
# Load the correction scores of the same experiment explicitly. The previous
# version used `feature_acc`, a variable left over from an unrelated plotting
# cell, so the second series did not show the correction accuracy.
correction_acc = util.load_json(os.path.join(f'evaluation_results/Correct_one_unknown/', "features_acc.json"))
question_acc_list = [question_acc, correction_acc]
visualization.comparison_star_plot(question_acc_list, f'evaluation_results/plots/{exp_names[0]}.png', exp_names, None, figsize=(12,10), title='Wrong Answers Location Accuracy VS Wrong Answers Correction Accuracy')

# Avg Recall Calculator

In [None]:
# Print the unweighted mean of the per-feature scores, first for Debate, then for Fewshots.
for exp_name in ('Debate', 'Fewshots'):
    scores = util.load_json(f'evaluation_results/{exp_name}/features_acc.json')
    tot = sum(scores.values())
    avg = tot / len(scores)
    print(avg)