In [1]:
import json
import plotly.express as px
import pandas as pd

In [2]:
def get_percent_correct(path):
    """Return the percentage of correct answers stored in a results JSON file.

    The file is expected to contain a top-level ``"stats"`` object with the
    numeric fields ``"count_correct"`` and ``"count_total"``.

    Args:
        path: Location of the JSON results file.

    Returns:
        ``count_correct / count_total * 100``, or ``None`` when the file cannot
        be read, is not valid JSON, lacks the expected fields, or reports a
        total of zero. In those cases ``"Error reading <path>"`` is printed so
        missing results stay visible in the notebook output.
    """
    try:
        with open(path, 'r') as f:
            data = json.load(f)
        stats = data['stats']
        return stats['count_correct'] / stats['count_total'] * 100
    # Catch only the failures a missing or malformed results file can produce
    # (json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses).
    # A bare ``except:`` would also swallow KeyboardInterrupt / SystemExit,
    # making a long batch of plots impossible to interrupt.
    except (OSError, ValueError, KeyError, TypeError, ZeroDivisionError):
        print(f"Error reading {path}")
        return None


In [3]:


'''
  - ```baseline``` for directly prompting, 
  - ```zs_cot``` for zero-shot chain-of-thought (CoT) prompting, 
  - ```os``` for one-shot in-context learning (ICL) prompting with the original Linda Problem (default), 
  - ```os_cot``` for one-shot ICL plus COT prompting , 
  - ```os_bob``` for one-shot ICL prompting but with a rephrased Bob Problem, 
  - ```os_bob_cot``` for one-shot ICL prompting plus COT but with a rephrased Bob Problem, 
  - ```os_incorrect``` for one-shot ICL but with an incorrect answer, 
  - ```os_incorrect_cot``` for one-shot ICL plus COT but with an incorrect answer,
  - ```fs``` for few-shot ICL prompting,
  - ```fs_cot``` for few-shot ICL plus COT prompting,
  - ```weak_control_zs_cot``` for weakly controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```weak_control_os_cot``` for weakly controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```control_zs_cot``` for controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully-curated explanations,
  - ```control_os_cot``` for controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully-curated explanations.
'''


def draw(variant, model='meta-llama-3-70b-instruct'):
    """Build a grouped bar chart of accuracy per prompting method for one variant.

    For every combination of data type (``gold``, ``random``, ``baseline``) and
    prompting method, the matching results file under ``outputs/<model>/`` is
    read with ``get_percent_correct``. Files that cannot be read contribute
    ``None``, so the corresponding bar is simply missing from the chart.

    Args:
        variant: Dataset variant name used in the results file name
            (e.g. ``"four"``).
        model: Name of the sub-directory of ``outputs/`` to read from. Defaults
            to the model this function was originally hard-wired to.

    Returns:
        A plotly figure (800x600, y axis fixed to 0-100) with one bar group per
        prompting method, coloured by data type.
    """
    # Single source of truth for the method order: it is used both to build the
    # file paths and to label the rows, so the two can never drift apart.
    prompting_methods = [
        'baseline', 'zs_cot', 'os', 'os_cot', 'os_bob', 'os_bob_cot',
        'os_incorrect', 'os_incorrect_cot', 'fs', 'fs_cot',
        'weak_control_zs_cot', 'weak_control_os_cot',
        'control_zs_cot', 'control_os_cot',
    ]
    data_types = ['gold', 'random', 'baseline']

    data = {
        data_type: [
            get_percent_correct(f'outputs/{model}/responses_{prompting_method}_synthetic_dataset_linda_variant_{variant}_{data_type}.json')
            for prompting_method in prompting_methods
        ]
        for data_type in data_types
    }

    # Reshape to long form: one row per (prompting_method, data_type) pair.
    df = pd.DataFrame(data)
    df['prompting_method'] = prompting_methods
    df = df.melt(id_vars='prompting_method', var_name='data_type', value_name='count_correct')

    fig = px.bar(df, x='prompting_method', y='count_correct', color='data_type', barmode='group')
    fig.update_layout(title=f'Percentage of correct answers for different prompting methods - Variant {variant}', xaxis_title='Prompting Method', yaxis_title='Percentage of correct answers')
    # Fix the y axis to 0-100 so charts for different variants are comparable.
    fig.update_yaxes(range=[0, 100])
    # Use a fixed plot size.
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
    )
    return fig

In [4]:
import seaborn as sns
def draw_baseline_accuracy(type = "baseline"):
    """Plot per-dataset accuracy of each model for a single prompting method.

    Args:
        type: Prompting-method tag embedded in the results file names
            (for example ``"baseline"`` or ``"zs_cot"``).

    Returns:
        A plotly grouped bar chart (800x600, y axis fixed to 0-100) with one
        group per dataset variant, coloured by model, and a dotted reference
        line at 50% labelled "random guessing".
    """
    datasets = ['one_because', 'one_sothat', 'one_to', 'two_because', 'two_sothat', 'two_to', 'three', 'four']
    models = ['meta-llama-3-70b-instruct', 'gpt-4-turbo', 'claude-3-opus-20240229']

    # One column of percentages per model, rows ordered like ``datasets``.
    percent_by_model = {}
    for model_name in models:
        percent_by_model[model_name] = [
            get_percent_correct(f'outputs/{model_name}/responses_{type}_synthetic_dataset_linda_variant_{dataset_name}_gold.json')
            for dataset_name in datasets
        ]

    wide = pd.DataFrame(percent_by_model)
    wide['dataset'] = datasets
    # Long form: one row per (dataset, model) pair, as plotly express expects.
    long = wide.melt(id_vars='dataset', var_name='model', value_name='percent_correct')

    fig = px.bar(long, x='dataset', y='percent_correct', color='model', barmode='group')
    fig.update_layout(
        title=f'Percentage of correct answers for different models - {type}',
        xaxis_title='Dataset',
        yaxis_title='Percentage of correct answers',
        # Fixed plot size.
        autosize=False,
        width=800,
        height=600,
    )
    # Keep the y axis on a fixed 0-100 scale.
    fig.update_yaxes(range=[0, 100])
    # Mark the 50% level as the random-guessing reference.
    fig.add_hline(y=50, line_dash="dot", annotation_text="random guessing")
    return fig

In [5]:
draw_baseline_accuracy("baseline").show()
# future work: include original linda problem set
# message: when the problem is posed in another form, models barely perform better than random guessing, if at all

In [6]:
draw_baseline_accuracy("zs_cot").show()
# future work: include original linda problem set
# message: even with a universal zero-shot CoT prompt, models barely perform better than random guessing, if at all

Error reading outputs/claude-3-opus-20240229/responses_zs_cot_synthetic_dataset_linda_variant_two_to_gold.json


In [7]:
fig = draw("four")
fig.show()

Error reading outputs/meta-llama-3-70b-instruct/responses_baseline_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_zs_cot_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_os_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_os_cot_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_os_bob_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_os_bob_cot_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_os_incorrect_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/responses_os_incorrect_cot_synthetic_dataset_linda_variant_four_baseline.json
Error reading outputs/meta-llama-3-70b-instruct/respon

In [8]:
# read in json file

'''
  - ```baseline``` for directly prompting, 
  - ```zs_cot``` for zero-shot chain-of-thought (CoT) prompting, 
  - ```os``` for one-shot in-context learning (ICL) prompting with the original Linda Problem (default), 
  - ```os_cot``` for one-shot ICL plus COT prompting , 
  - ```os_bob``` for one-shot ICL prompting but with a rephrased Bob Problem, 
  - ```os_bob_cot``` for one-shot ICL prompting plus COT but with a rephrased Bob Problem, 
  - ```os_incorrect``` for one-shot ICL but with an incorrect answer, 
  - ```os_incorrect_cot``` for one-shot ICL plus COT but with an incorrect answer,
  - ```fs``` for few-shot ICL prompting,
  - ```fs_cot``` for few-shot ICL plus COT prompting,
  - ```weak_control_zs_cot``` for weakly controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```weak_control_os_cot``` for weakly controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem but without detailed explanations,
  - ```control_zs_cot``` for controlled zero-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully-curated explanations,
  - ```control_os_cot``` for controlled one-shot CoT prompting, leaking the hint that it is a Linda Problem with detailed and carefully-curated explanations.
'''


def compare_models(variant, prompting_methods = ['baseline', 'zs_cot', 'os', 'os_cot', 'os_bob', 'os_bob_cot', 'os_incorrect', 'os_incorrect_cot', 'fs', 'fs_cot', 'weak_control_zs_cot', 'weak_control_os_cot', 'control_zs_cot', 'control_os_cot']):
    """Build a grouped bar chart comparing models across prompting methods.

    For each model and each prompting method, the ``gold`` results file for
    ``variant`` is read with ``get_percent_correct``. Files that cannot be read
    contribute ``None``, so the corresponding bar is missing from the chart.

    Args:
        variant: Dataset variant name used in the results file name
            (e.g. ``"one_because"``).
        prompting_methods: Prompting-method tags to include, in x-axis order.
            The default list is only read, never mutated.

    Returns:
        A plotly figure (800x600, y axis fixed to 0-100) with one bar group per
        prompting method, coloured by model, and a solid black line at 50%.
    """
    models = ['meta-llama-3-70b-instruct', 'gpt-4-turbo', 'claude-3-opus-20240229']
    data = {
        model: [
            get_percent_correct(f'outputs/{model}/responses_{prompting_method}_synthetic_dataset_linda_variant_{variant}_gold.json')
            for prompting_method in prompting_methods
        ]
        for model in models
    }

    # Reshape to long form: one row per (prompting_method, model) pair.
    df = pd.DataFrame(data)
    df['prompting_method'] = prompting_methods
    df = df.melt(id_vars='prompting_method', var_name='model', value_name='count_correct')
    fig = px.bar(df, x='prompting_method', y='count_correct', color='model', barmode='group')
    fig.update_layout(title=f'Percentage of correct answers for different models - Variant {variant}', xaxis_title='Prompting Method', yaxis_title='Percentage of correct answers')
    # Fix the y axis to 0-100 so charts for different variants are comparable.
    fig.update_yaxes(range=[0, 100])
    # Horizontal reference line at 50%. Categories sit at integer positions
    # 0..n-1, so the line runs from -0.5 to n-0.5 to span exactly the plotted
    # groups; a fixed end of 13.5 is only right when all 14 methods are shown.
    fig.add_shape(type="line", x0=-0.5, y0=50, x1=len(prompting_methods) - 0.5, y1=50, line=dict(color="black", width=3))
    # Use a fixed plot size.
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,
    )
    return fig

# Results

1. LLMs in the wild: except for variant 6, where the problem can be explicitly solved using math, no LLM does substantially better than random guessing.
2. "Universal prompt engineering" is a myth: improvements are marginal, if any (some methods are even worse than no engineering).
3. **Except for variant 6, Llama shows a significant decrease; GPT-4 Turbo shows a consistent decrease; Claude-3 is interesting; variant 6 is a Linda problem in mathematical disguise** -- **giving the hint somehow hurts performance**.
4. GPT-4 and Claude-3 are not learning the wrong answer from the hint; llama is not consistent; maybe worth doing a **few-shot** with incorrect answers (include incorrect reasoning)
5. future work
6. few shot learning doesn't necessarily help, despite model size (future work, test for saturation)
7. future work
8. It's worth checking the outputs of llama-8B; otherwise, almost consistent increase in performance
9. For bigger models, fail to reject null hypothesis for all models except for variant 6; for smaller models, reject null hypothesis 
10. abstraction: fs -> control, we observe improvement in performance (fails at abstraction); future work (variant 6 strong control)???

In [9]:
# Bob 

def compare_(prompting_methods):
    """Show the model-comparison chart for every dataset variant.

    Calls ``compare_models`` once per variant, in a fixed order, and displays
    each resulting figure immediately.

    Args:
        prompting_methods: Prompting-method tags forwarded unchanged to
            ``compare_models``.
    """
    variant_names = (
        "one_because", "one_sothat", "one_to",
        "two_because", "two_sothat", "two_to",
        "three", "four", "six",
    )
    for variant_name in variant_names:
        compare_models(variant_name, prompting_methods).show()

compare_(prompting_methods = ['baseline', 'zs_cot', 'os', 'os_cot', 'os_bob', 'os_bob_cot'])


Error reading outputs/claude-3-opus-20240229/responses_os_bob_synthetic_dataset_linda_variant_one_because_gold.json


Error reading outputs/claude-3-opus-20240229/responses_os_cot_synthetic_dataset_linda_variant_one_sothat_gold.json


Error reading outputs/claude-3-opus-20240229/responses_os_bob_synthetic_dataset_linda_variant_one_to_gold.json


Error reading outputs/claude-3-opus-20240229/responses_zs_cot_synthetic_dataset_linda_variant_two_to_gold.json
Error reading outputs/claude-3-opus-20240229/responses_os_synthetic_dataset_linda_variant_two_to_gold.json
Error reading outputs/claude-3-opus-20240229/responses_os_bob_synthetic_dataset_linda_variant_two_to_gold.json
Error reading outputs/claude-3-opus-20240229/responses_os_bob_cot_synthetic_dataset_linda_variant_two_to_gold.json


In [10]:
compare_(prompting_methods = ['baseline', 'zs_cot', 'os', 'os_cot', 'os_incorrect', 'os_incorrect_cot'])

Error reading outputs/claude-3-opus-20240229/responses_os_cot_synthetic_dataset_linda_variant_one_sothat_gold.json


Error reading outputs/claude-3-opus-20240229/responses_os_incorrect_synthetic_dataset_linda_variant_one_to_gold.json


Error reading outputs/claude-3-opus-20240229/responses_zs_cot_synthetic_dataset_linda_variant_two_to_gold.json
Error reading outputs/claude-3-opus-20240229/responses_os_synthetic_dataset_linda_variant_two_to_gold.json
Error reading outputs/claude-3-opus-20240229/responses_os_incorrect_synthetic_dataset_linda_variant_two_to_gold.json
Error reading outputs/claude-3-opus-20240229/responses_os_incorrect_cot_synthetic_dataset_linda_variant_two_to_gold.json
