In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

### How do the Claude 3 model family and GPT-4 perform on zero-shot MMLU College Math?

In [2]:
# Load the zero-shot four-option responses; the first CSV column becomes the
# DataFrame index. The trailing bare name displays the frame as the cell output.
ZeroShotFourOptionResponses = pd.read_csv(
    filepath_or_buffer="../data/ZeroShotFourOptionResponses_20240505205946.csv",
    index_col=0,
)
ZeroShotFourOptionResponses

Unnamed: 0_level_0,question_id,response,model_id,is_correct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
89b40ce8-6eeb-4890-8a2c-2339e0aa7d13,0,B,gpt-4-turbo-2024-04-09,True
8ef0d0ab-3e7f-4e93-af0e-c18adfd01981,1,D,gpt-4-turbo-2024-04-09,True
addc10b8-0c0f-407b-a68b-e406a6ab0699,2,A,gpt-4-turbo-2024-04-09,False
35d854e4-c202-4d41-9c4b-78c4356e1271,3,D,gpt-4-turbo-2024-04-09,False
f027e705-89c7-4228-8ac4-a37795c96033,4,C,gpt-4-turbo-2024-04-09,True
...,...,...,...,...
f6a2de3b-0a16-4a71-bcab-34486a7c92bb,95,D,claude-3-opus-20240229,True
d0b4096d-339c-49e4-879e-6b15327e34fa,96,D,claude-3-opus-20240229,False
ede61d4a-09a2-4db4-9038-18c102b42b87,97,C,claude-3-opus-20240229,True
40cdf72a-b615-4161-96b5-0b77dc19dd9c,98,C,claude-3-opus-20240229,False


In [3]:
# Count correct responses per model, then order the rows GPT-4 first followed
# by the Claude 3 models (haiku, sonnet, opus) via an ordered categorical.
correct_answers_by_model_df = (
    ZeroShotFourOptionResponses.groupby("model_id")["is_correct"]
    .sum()
    .reset_index()
    .assign(
        model_id=lambda counts: pd.Categorical(
            counts["model_id"],
            categories=[
                "gpt-4-turbo-2024-04-09",
                "claude-3-haiku-20240307",
                "claude-3-sonnet-20240229",
                "claude-3-opus-20240229",
            ],
            ordered=True,
        )
    )
    # Sorting an ordered categorical follows the category order given above.
    .sort_values("model_id")
)
correct_answers_by_model_df

Unnamed: 0,model_id,is_correct
3,gpt-4-turbo-2024-04-09,54
0,claude-3-haiku-20240307,35
2,claude-3-sonnet-20240229,30
1,claude-3-opus-20240229,45


In [9]:
# Bar chart of the per-model correct-answer counts, one coloured bar per model.
fig = px.bar(
    data_frame=correct_answers_by_model_df,
    x="model_id",
    y="is_correct",
    orientation="v",
    color="model_id",
    labels=dict(
        model_id="Model",
        is_correct="Number of correct responses out of 100",
    ),
    title="Number of correct responses by model for MMLU College Math Test",
)

# The x axis is hidden; bars are coloured by model_id instead.
fig.update_xaxes(showticklabels=False, visible=False)
fig.update_yaxes(title_text="Number of correct responses out of 100")
fig.update_layout(autosize=True, height=500)

# Render inline and also write a standalone HTML copy of the figure.
fig.show()
fig.write_html("../plots/correct_responses_by_model.html")

### Can we select a subset of 10 questions which maximises the gap between the expert model (GPT-4) and the weakest non-expert model (Claude 3 Haiku)?

In [10]:
# Load the selected question subset; the first CSV column becomes the index.
MMLUMathQuestionsSubset = pd.read_csv(
    filepath_or_buffer="../data/MMLUMathQuestionsSubset_20240505210028.csv",
    index_col=0,
)

In [11]:
# Keep only the responses whose question_id appears in the subset's index.
# NOTE(review): assumes the subset CSV is indexed by question id — confirm.
questions_subset = ZeroShotFourOptionResponses.loc[
    ZeroShotFourOptionResponses["question_id"].isin(MMLUMathQuestionsSubset.index)
]

# Count correct responses per model on the subset, ordered GPT-4 first and
# then the Claude 3 models (haiku, sonnet, opus).
correct_answers_on_subset_by_model_df = (
    questions_subset.groupby("model_id")["is_correct"]
    .sum()
    .reset_index()
    .assign(
        model_id=lambda counts: pd.Categorical(
            counts["model_id"],
            categories=[
                "gpt-4-turbo-2024-04-09",
                "claude-3-haiku-20240307",
                "claude-3-sonnet-20240229",
                "claude-3-opus-20240229",
            ],
            ordered=True,
        )
    )
    .sort_values("model_id")
)
correct_answers_on_subset_by_model_df

Unnamed: 0,model_id,is_correct
3,gpt-4-turbo-2024-04-09,10
0,claude-3-haiku-20240307,0
2,claude-3-sonnet-20240229,2
1,claude-3-opus-20240229,6


In [12]:
# Bar chart of the per-model correct-answer counts on the selected subset.
fig = px.bar(
    data_frame=correct_answers_on_subset_by_model_df,
    x="model_id",
    y="is_correct",
    orientation="v",
    color="model_id",
    labels=dict(
        model_id="Model",
        is_correct="Number of correct responses out of 10",
    ),
    title="Number of correct responses by model for selected subset of MMLU College Math Test",
)

# The x axis is hidden; bars are coloured by model_id instead.
fig.update_xaxes(showticklabels=False, visible=False)
fig.update_yaxes(title_text="Number of correct responses out of 10")
fig.update_layout(autosize=True, height=500)

# Render inline and also write a standalone HTML copy of the figure.
fig.show()
fig.write_html("../plots/correct_responses_on_subset_by_model.html")

### What are the distributions of argument lengths for each requested response length?

In [13]:
# Load the generated arguments; the first CSV column becomes the index.
SelectedOptionArgumentResponses = pd.read_csv(
    filepath_or_buffer="../data/SelectedOptionArgumentResponses_20240505212905.csv",
    index_col=0,
)

In [14]:
# For each requested length ("short", "medium", "long") build a histogram of
# the argument lengths in characters plus a dashed vertical marker at the
# median of that group. Medians are stored by requested length for reuse.
argument_length_traces = []
argument_length_medians = {}
for length_label in ["short", "medium", "long"]:
    in_group = SelectedOptionArgumentResponses["requested_response_length"] == length_label
    argument_lengths = SelectedOptionArgumentResponses.loc[in_group, "argument"].str.len()
    median_length = np.median(argument_lengths)
    argument_length_medians[length_label] = median_length
    argument_length_traces.extend(
        [
            go.Histogram(
                x=argument_lengths,
                name=length_label,
                opacity=0.75,
                nbinsx=10,
            ),
            # Vertical line at the median.
            go.Scatter(
                x=[median_length, median_length],
                # NOTE(review): the line height is a fixed 0..25, independent
                # of the actual histogram bar heights.
                y=[0, 25],
                mode="lines",
                name=f"{length_label} median",
                showlegend=True,
                line=dict(color="black", width=1, dash="dash"),
            ),
        ]
    )

# Overlay the three histograms on shared axes.
layout = go.Layout(
    title="Distributions of the lengths of arguments generated by GPT-4",
    xaxis=dict(title="Argument length in characters"),
    barmode="overlay",
    autosize=True,
    height=500,
)
fig = go.Figure(data=argument_length_traces, layout=layout)

# Render inline and also write a standalone HTML copy of the figure.
fig.show()
fig.write_html("../plots/argument_length_distributions.html")

### How does argument classification accuracy vary across the Claude 3 model family?

In [15]:
# Load the argument classification responses; the first CSV column becomes
# the index.
ArgumentClassificationResponses = pd.read_csv(
    filepath_or_buffer="../data/ArgumentClassificationResponses_20240505220601.csv",
    index_col=0,
)

In [16]:
# Attach the requested response length to every classification by joining on
# the argument id (the argument table's index is exposed as an "id" column via
# reset_index). pandas merges default to an inner join, so classifications
# without a matching argument are dropped.
classifications_with_argument_length = ArgumentClassificationResponses.merge(
    SelectedOptionArgumentResponses.reset_index()[["id", "requested_response_length"]],
    left_on="argument_id",
    right_on="id",
)[["argument_id", "model_id", "is_correct", "requested_response_length"]]

# Plot orderings shared by the cells below.
requested_response_lengths = [
    "short",
    "medium",
    "long",
]
model_ids = [
    "claude-3-haiku-20240307",
    "claude-3-sonnet-20240229",
    "claude-3-opus-20240229",
]

In [17]:
# Number of correct classifications for each (model, requested length) pair.
(
    classifications_with_argument_length
    .groupby(["model_id", "requested_response_length"])["is_correct"]
    .sum()
)

model_id                  requested_response_length
claude-3-haiku-20240307   long                         13
                          medium                       13
                          short                        15
claude-3-opus-20240229    long                         19
                          medium                       20
                          short                        25
claude-3-sonnet-20240229  long                         17
                          medium                       18
                          short                        22
Name: is_correct, dtype: int64

In [18]:
# Split the classifications into one frame per requested argument length.
data_by_requested_response_length = {}
for length_label in requested_response_lengths:
    data_by_requested_response_length[length_label] = classifications_with_argument_length[
        classifications_with_argument_length["requested_response_length"] == length_label
    ]

# One line per requested length: the number of correct judgments for each
# judge model, in `model_ids` order.
scatter_plots = [
    go.Scatter(
        name=length_label,
        x=model_ids,
        y=[
            rows.loc[rows["model_id"] == model_id, "is_correct"].sum()
            for model_id in model_ids
        ],
        mode="markers+lines",
        marker=dict(symbol="cross"),
    )
    for length_label, rows in data_by_requested_response_length.items()
]

# (A dashed black reference line at y=40, named "total number of arguments",
# was previously drawn here and is currently disabled.)

fig = go.Figure(data=scatter_plots)
fig.update_layout(
    title="Judge accuracy across the Claude 3 model family for different argument lengths",
    yaxis_title="Number of correct judgments",
    xaxis_title="Model",
    autosize=True,
    height=500,
)

# Render inline and also write a standalone HTML copy of the figure.
fig.show()
fig.write_html("../plots/judge_accuracy_by_model.html")

### How does argument classification accuracy vary with argument length?

In [19]:
# Split the classifications into one frame per judge model.
scatter_plots = []
data_by_model_id = {
    model_id: classifications_with_argument_length[
        classifications_with_argument_length["model_id"] == model_id
    ]
    for model_id in model_ids
}

# x positions: the median argument length for each requested length, looked up
# by key in the same `requested_response_lengths` order used for the y values.
# Relying on the dict's insertion order instead would silently mispair x and y
# if the list that filled `argument_length_medians` were ever reordered.
median_argument_lengths = [
    argument_length_medians[requested_response_length]
    for requested_response_length in requested_response_lengths
]

# One line per judge model: the number of correct judgments at each median
# argument length.
for model_id, data in data_by_model_id.items():
    scatter_plots.append(
        go.Scatter(
            name=model_id,
            x=median_argument_lengths,
            y=[
                data.loc[
                    data["requested_response_length"] == requested_response_length,
                    "is_correct",
                ].sum()
                for requested_response_length in requested_response_lengths
            ],
            mode="markers+lines",
            marker=dict(symbol="cross"),
        )
    )

# (A dashed black reference line at y=40, named "total number of arguments",
# was previously drawn here and is currently disabled.)

fig = go.Figure(data=scatter_plots)
fig.update_layout(
    title="Judge accuracy for different argument lengths across the Claude 3 model family",
    yaxis_title="Number of correct judgments",
    xaxis_title="Median argument length in characters",
    autosize=True,
    height=500,
)

# Render inline and also write a standalone HTML copy of the figure.
fig.show()
fig.write_html("../plots/judge_accuracy_by_argument_length.html")

### Does debate improve zero-shot performance?

In [31]:
# Claude 3 model ids in the order haiku, sonnet, opus; used below as the
# category order when sorting the per-model result frames.
# NOTE(review): same contents and order as `model_ids` defined earlier in this
# notebook — consider keeping a single list.
ordered_claude_model_ids = [
    "claude-3-haiku-20240307",
    "claude-3-sonnet-20240229",
    "claude-3-opus-20240229",
]

In [32]:
# Load the two-option zero-shot responses; the first CSV column becomes the
# index.
ZeroShotTwoOptionResponses = pd.read_csv(
    filepath_or_buffer="../data/ZeroShotTwoOptionResponses_20240505220731.csv",
    index_col=0,
)

# Correct answers per model, with rows sorted in `ordered_claude_model_ids`
# order through an ordered categorical.
two_option_zero_shot_by_model_df = (
    ZeroShotTwoOptionResponses.groupby("model_id")["is_correct"]
    .sum()
    .reset_index()
    .assign(
        model_id=lambda counts: pd.Categorical(
            counts["model_id"],
            categories=ordered_claude_model_ids,
            ordered=True,
        )
    )
    .sort_values("model_id")
)
two_option_zero_shot_by_model_df

Unnamed: 0,model_id,is_correct
0,claude-3-haiku-20240307,12
2,claude-3-sonnet-20240229,10
1,claude-3-opus-20240229,16


In [33]:
# Load the two-option debate responses; the first CSV column becomes the index.
TwoOptionDebateResponses = pd.read_csv(
    filepath_or_buffer="../data/TwoOptionDebateResponses_20240505220907.csv",
    index_col=0,
)

# Correct answers per model, with rows sorted in `ordered_claude_model_ids`
# order through an ordered categorical.
two_option_debate_by_model_df = (
    TwoOptionDebateResponses.groupby("model_id")["is_correct"]
    .sum()
    .reset_index()
    .assign(
        model_id=lambda counts: pd.Categorical(
            counts["model_id"],
            categories=ordered_claude_model_ids,
            ordered=True,
        )
    )
    .sort_values("model_id")
)
two_option_debate_by_model_df

Unnamed: 0,model_id,is_correct
0,claude-3-haiku-20240307,11
2,claude-3-sonnet-20240229,10
1,claude-3-opus-20240229,15


In [37]:
# Grouped bar chart comparing the number of correct judgments per model with
# and without debate.
fig = go.Figure()

# Each trace takes its x labels and y counts from the SAME frame. Pairing the
# counts with a separately maintained list of model ids would silently shift
# bars under the wrong label if a model were missing from either result file.
for trace_name, counts_by_model_df in (
    ("Zero-shot", two_option_zero_shot_by_model_df),
    ("Debate", two_option_debate_by_model_df),
):
    fig.add_trace(
        go.Bar(
            # The frames are already sorted by model, so this keeps that order.
            x=counts_by_model_df["model_id"].tolist(),
            y=counts_by_model_df["is_correct"],
            name=trace_name,
        )
    )

fig.update_layout(
    barmode="group",
    title="Judge accuracy across the Claude 3 model family with/without debate",
    yaxis_title="Number of correct judgments",
    xaxis_title="Model",
    autosize=True,
    height=500,
)

# Render inline and also write a standalone HTML copy of the figure.
fig.show()
fig.write_html("../plots/judge_accuracy_by_model_with_debate.html")