In [1]:
import json
import plotly.express as px
import pandas as pd

In [2]:
def get_count_correct(path):
    """Return the percentage of correct answers stored in a results JSON file.

    The file is expected to contain a top-level ``"stats"`` object with
    numeric ``"count_correct"`` and ``"count_total"`` entries.

    Args:
        path: Path of the JSON file to read.

    Returns:
        ``count_correct / count_total * 100`` as a float, or ``None`` when the
        file cannot be read, is not valid JSON, lacks the expected keys, or
        has a ``count_total`` of zero. In every failure case a line
        ``Error reading <path>`` is printed so the caller can see which
        result files were unavailable.
    """
    try:
        # JSON text is UTF-8 by specification; do not rely on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            stats = json.load(f)['stats']
        return stats['count_correct'] / stats['count_total'] * 100
    except (OSError, ValueError, KeyError, TypeError, ZeroDivisionError):
        # OSError: missing/unreadable file; ValueError: malformed JSON or bad
        # encoding; KeyError/TypeError: unexpected structure;
        # ZeroDivisionError: empty run. Anything else (notably
        # KeyboardInterrupt) is deliberately allowed to propagate.
        print(f"Error reading {path}")
        return None


In [3]:
# read in json file

'''
  - ```baseline``` for direct prompting,
  - ```zs_cot``` for zero-shot chain-of-thought (CoT) prompting,
  - ```os``` for one-shot in-context learning (ICL) prompting with the original Linda Problem (default),
  - ```os_cot``` for one-shot ICL plus CoT prompting,
  - ```os_bob``` for one-shot ICL prompting but with a rephrased Bob Problem,
  - ```os_bob_cot``` for one-shot ICL plus CoT prompting but with a rephrased Bob Problem,
  - ```os_incorrect``` for one-shot ICL prompting but with an incorrect answer,
  - ```os_incorrect_cot``` for one-shot ICL plus CoT prompting but with an incorrect answer,
  - ```fs``` for few-shot ICL prompting,
  - ```fs_cot``` for few-shot ICL plus CoT prompting,
  - ```weak_control_zs_cot``` for weakly controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```weak_control_os_cot``` for weakly controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```control_zs_cot``` for controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully curated explanations,
  - ```control_os_cot``` for controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully curated explanations.
'''


def draw(variant, model="meta-llama-3-8b-instruct"):
    """Build a grouped bar chart of accuracy per prompting method and data type.

    For every combination of data type (``gold``, ``random``, ``baseline``)
    and prompting method, the percentage of correct answers is read with
    ``get_count_correct`` from
    ``outputs/<model>/responses_<method>_synthetic_dataset_linda_variant_<variant>_<data_type>.json``.

    Args:
        variant: Variant identifier inserted into the result file names and
            the figure title (e.g. ``"two_to"``).
        model: Name of the sub-directory of ``outputs/`` to read from.
            Defaults to the directory the notebook originally hard-coded.

    Returns:
        The plotly figure. Combinations whose result file could not be read
        have no value and therefore no bar.
    """
    # Single source of truth for the x-axis categories: the same list drives
    # both the file lookup and the 'prompting_method' column.
    prompting_methods = [
        'baseline', 'zs_cot', 'os', 'os_cot', 'os_bob', 'os_bob_cot',
        'os_incorrect', 'os_incorrect_cot', 'fs', 'fs_cot',
        'weak_control_zs_cot', 'weak_control_os_cot',
        'control_zs_cot', 'control_os_cot',
    ]
    data_types = ['gold', 'random', 'baseline']

    # One column of percentages per data type, rows ordered like prompting_methods.
    data = {
        data_type: [
            get_count_correct(
                f'outputs/{model}/responses_{method}_synthetic_dataset_linda_variant_{variant}_{data_type}.json'
            )
            for method in prompting_methods
        ]
        for data_type in data_types
    }

    # draw a comparison plot
    df = pd.DataFrame(data)
    df['prompting_method'] = prompting_methods
    df = df.melt(id_vars='prompting_method', var_name='data_type', value_name='count_correct')
    # Unreadable files yield None; force a float column so they become NaN
    # instead of leaving an object-dtype column.
    df['count_correct'] = pd.to_numeric(df['count_correct'], errors='coerce')

    fig = px.bar(df, x='prompting_method', y='count_correct', color='data_type', barmode='group')
    fig.update_layout(
        title=f'Percentage of correct answers for different prompting methods - Variant {variant}',
        xaxis_title='Prompting Method',
        yaxis_title='Percentage of correct answers',
    )
    # fix y axis to 0 to 100 so figures for different variants are comparable
    fig.update_yaxes(range=[0, 100])
    # resize the plot
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
    )
    return fig

In [4]:
# Plot accuracy per prompting method, grouped by data type, for the "two_to" variant.
fig = draw("two_to")
fig.show()

Error reading outputs/meta-llama-3-8b-instruct/responses_baseline_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_zs_cot_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_os_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_os_cot_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_os_bob_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_os_bob_cot_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_os_incorrect_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_os_incorrect_cot_synthetic_dataset_linda_variant_two_to_random.json
Error reading outputs/meta-llama-3-8b-instruct/responses_fs_sy

In [5]:
# read in json file

'''
  - ```baseline``` for direct prompting,
  - ```zs_cot``` for zero-shot chain-of-thought (CoT) prompting,
  - ```os``` for one-shot in-context learning (ICL) prompting with the original Linda Problem (default),
  - ```os_cot``` for one-shot ICL plus CoT prompting,
  - ```os_bob``` for one-shot ICL prompting but with a rephrased Bob Problem,
  - ```os_bob_cot``` for one-shot ICL plus CoT prompting but with a rephrased Bob Problem,
  - ```os_incorrect``` for one-shot ICL prompting but with an incorrect answer,
  - ```os_incorrect_cot``` for one-shot ICL plus CoT prompting but with an incorrect answer,
  - ```fs``` for few-shot ICL prompting,
  - ```fs_cot``` for few-shot ICL plus CoT prompting,
  - ```weak_control_zs_cot``` for weakly controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```weak_control_os_cot``` for weakly controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```control_zs_cot``` for controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully curated explanations,
  - ```control_os_cot``` for controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully curated explanations.
'''


def compare_models(variant):
    """Build a grouped bar chart comparing models across prompting methods.

    For every model and prompting method, the percentage of correct answers
    is read with ``get_count_correct`` from
    ``outputs/<model>/responses_<method>_synthetic_dataset_linda_variant_<variant>_gold.json``
    (only the ``gold`` result files are used).

    Args:
        variant: Variant identifier inserted into the result file names and
            the figure title (e.g. ``"two_sothat"``).

    Returns:
        The plotly figure, including a horizontal reference line at 50%.
        Combinations whose result file could not be read have no value and
        therefore no bar.
    """
    # Each list is defined once so that dict keys, file lookups, the x-axis
    # labels and the reference line below cannot drift out of sync.
    models = [
        'gpt-4-turbo',
        'gpt-3.5-turbo',
        'meta-llama-3-8b-instruct',
        'meta-llama-3-70b-instruct',
        'claude-3-opus-20240229',
    ]
    prompting_methods = [
        'baseline', 'zs_cot', 'os', 'os_cot', 'os_bob', 'os_bob_cot',
        'os_incorrect', 'os_incorrect_cot', 'fs', 'fs_cot',
        'weak_control_zs_cot', 'weak_control_os_cot',
        'control_zs_cot', 'control_os_cot',
    ]

    # One column of percentages per model, rows ordered like prompting_methods.
    data = {
        model: [
            get_count_correct(
                f'outputs/{model}/responses_{method}_synthetic_dataset_linda_variant_{variant}_gold.json'
            )
            for method in prompting_methods
        ]
        for model in models
    }

    # draw a comparison plot
    df = pd.DataFrame(data)
    df['prompting_method'] = prompting_methods
    df = df.melt(id_vars='prompting_method', var_name='model', value_name='count_correct')
    # Unreadable files yield None; force a float column so they become NaN
    # instead of leaving an object-dtype column.
    df['count_correct'] = pd.to_numeric(df['count_correct'], errors='coerce')

    fig = px.bar(df, x='prompting_method', y='count_correct', color='model', barmode='group')
    fig.update_layout(
        title=f'Percentage of correct answers for different models - Variant {variant}',
        xaxis_title='Prompting Method',
        yaxis_title='Percentage of correct answers',
    )
    # fix y axis to 0 to 100
    fig.update_yaxes(range=[0, 100])
    # draw horizontal line at 50%; categories sit at integer positions
    # 0..n-1, so -0.5..n-0.5 spans the full width of all bar groups
    fig.add_shape(
        type="line",
        x0=-0.5,
        y0=50,
        x1=len(prompting_methods) - 0.5,
        y1=50,
        line=dict(color="black", width=3),
    )
    # resize the plot
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
    )
    return fig

In [7]:
# Plot the cross-model accuracy comparison for the "two_sothat" variant.
compare_models("two_sothat").show()