# Analysis of Model judgments for robustness to question variation

In [1]:
import numpy as np
import pandas as pd

import output_processing as op

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifiers whose result CSVs are loaded by
# read_and_organize_model_results. The full identifier is used in the CSV
# path; only the part after the last "/" ends up in the "model_name" column.
model_list = [
    # Main set of models with instruct divide and size variety
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    # Small reference model - would allow for pretraining variation
    "gpt2-medium",
    # Other open models
    "allenai/OLMo-2-1124-7B",
    "allenai/OLMo-2-1124-7B-Instruct",
    "mistralai/Ministral-8B-Instruct-2410",
    "google/gemma-7b",
    "google/gemma-7b-it",
    # Large platform model
    "openai-gpt-4",
]

def read_and_organize_model_results(model_name):
    """Load one model's result CSV and attach its judgment columns.

    Reads ``runs/runs-42_07_16/{model_name}-results.csv``, passes the frame
    through ``op.organize_distribution`` (project helper; presumably the
    source of the ``Covered``/``NotCovered`` flags and ``*_prob`` columns
    used below -- confirm there), and then adds:

    * ``model_name``    -- the part of ``model_name`` after the last "/".
    * ``Judgment``      -- "Covered" or "NotCovered" for rows whose matching
      flag is True; rows with neither flag set are left without a value.
    * ``Judgment_prob`` -- the probability column belonging to that label.

    Returns the organized DataFrame.
    """
    results = op.organize_distribution(
        pd.read_csv(f"runs/runs-42_07_16/{model_name}-results.csv")
    )
    results["model_name"] = model_name.split("/")[-1]

    # NotCovered is applied second, so it wins should both flags ever be set.
    for label, prob_column in (("Covered", "Covered_prob"), ("NotCovered", "NotCovered_prob")):
        flagged = results[label].eq(True)
        results.loc[flagged, "Judgment"] = label
        results.loc[flagged, "Judgment_prob"] = results[prob_column]
    return results

In [3]:
# TODO organize the output for better clarity
# Load every model's results and stack them into one long frame.
# reset_index() keeps each per-model row number as an "index" column.
combined_results = list(map(read_and_organize_model_results, model_list))
combined_df = pd.concat(combined_results).reset_index()
# "item" is title and version glued together without a separator.
combined_df["item"] = combined_df["title"] + combined_df["version"]
combined_df.shape

(17388, 28)

In [4]:
# Print summary of the experimental results: the distinct model names loaded
print(combined_df["model_name"].unique())

['Llama-3.2-1B' 'Llama-3.2-1B-Instruct' 'Llama-3.2-3B'
 'Llama-3.2-3B-Instruct' 'Llama-3.1-8B' 'Llama-3.1-8B-Instruct'
 'Llama-3.3-70B-Instruct' 'gpt2-medium' 'OLMo-2-1124-7B'
 'OLMo-2-1124-7B-Instruct' 'Ministral-8B-Instruct-2410' 'gemma-7b'
 'gemma-7b-it' 'openai-gpt-4']


In [5]:
# Looking at one of the result samples.
# random_state is pinned so Restart & Run All shows the same row every time.
combined_df.sample(n=1, random_state=42)

Unnamed: 0,index,title,prompt_type,prompt,version,output,output_text,cum_logprob,YES_probs,Yes_probs,...,UnAff_prob,Covered_prob,NotCovered_prob,Covered,NotCovered,entropy,model_name,Judgment,Judgment_prob,item
2150,908,Wind Damage,disagreement_negation,Tom's home insurance policy includes coverage ...,controversial,yes,yes.,,0.072278,0.440817,...,0.302091,0.440817,0.302091,True,False,1.071909,Llama-3.2-1B-Instruct,Covered,0.440817,Wind Damagecontroversial


## Prepare the table with Categorical counts and Distributional Spread

In [6]:
# Categorical Counts for Yes/No question variant
# Keep only the "yes_or_no" prompts, then count per model how many responses
# were flagged Covered and how many NotCovered (summing the boolean flags).
question_variant_mask = combined_df["prompt_type"].eq("yes_or_no")
yes_or_no_df = combined_df.loc[question_variant_mask]
count_labels = (
    yes_or_no_df
    .groupby('model_name', as_index=False, sort=False)[['Covered', 'NotCovered']]
    .sum()
    .rename(columns={'model_name': 'Model'})
)
count_labels

Unnamed: 0,Model,Covered,NotCovered
0,Llama-3.2-1B,138,0
1,Llama-3.2-1B-Instruct,138,0
2,Llama-3.2-3B,127,11
3,Llama-3.2-3B-Instruct,53,85
4,Llama-3.1-8B,80,58
5,Llama-3.1-8B-Instruct,0,138
6,Llama-3.3-70B-Instruct,59,79
7,gpt2-medium,5,133
8,OLMo-2-1124-7B,70,68
9,OLMo-2-1124-7B-Instruct,53,85


In [7]:
# Latex table just for categorical counts (the integer row index is exported too)
count_labels.to_latex(
    "reports/yes_or_no_categorical_counts.tex",
    index=True,
    float_format="%.2f",
)

In [8]:
# Distributional spread for Yes/No question variant
question_variant_mask = combined_df["prompt_type"].eq("yes_or_no")
yes_or_no_df = combined_df.loc[question_variant_mask]

# Pool each model's Covered_prob and NotCovered_prob values into one long
# "value" column, then report the smallest and largest pooled probability per
# model, rounded to three decimals. Groups come out sorted by model name.
yes_or_no_judgement_range = (
    yes_or_no_df[['model_name', 'Covered_prob', 'NotCovered_prob']]
    .melt(id_vars='model_name', value_vars=['Covered_prob', 'NotCovered_prob'])
    .groupby('model_name', as_index=False)
    .agg(Min=('value', 'min'), Max=('value', 'max'))
    .round(3)
    .rename(columns={"model_name": "Model"})
)

yes_or_no_judgement_range

Unnamed: 0,Model,Min,Max
0,Llama-3.1-8B,0.138,0.367
1,Llama-3.1-8B-Instruct,0.11,0.742
2,Llama-3.2-1B,0.064,0.291
3,Llama-3.2-1B-Instruct,0.151,0.592
4,Llama-3.2-3B,0.093,0.517
5,Llama-3.2-3B-Instruct,0.159,0.69
6,Llama-3.3-70B-Instruct,0.022,0.849
7,Ministral-8B-Instruct-2410,0.211,0.595
8,OLMo-2-1124-7B,0.19,0.564
9,OLMo-2-1124-7B-Instruct,0.001,0.991


In [9]:
# Separate latex table for yes_or_no judgement range
yes_or_no_judgement_range.to_latex(
    "reports/yes_or_no_distributional_spread.tex",
    index=True,
    float_format="%.2f",
)

In [10]:
# Combined table
# Put the per-model counts and the per-model probability range side by side,
# matching rows on the model name and keeping only models present in both.
yes_or_no_table = pd.concat(
    [frame.set_index('Model') for frame in (count_labels, yes_or_no_judgement_range)],
    axis=1,
    join='inner',
)
yes_or_no_table

Unnamed: 0_level_0,Covered,NotCovered,Min,Max
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Llama-3.2-1B,138,0,0.064,0.291
Llama-3.2-1B-Instruct,138,0,0.151,0.592
Llama-3.2-3B,127,11,0.093,0.517
Llama-3.2-3B-Instruct,53,85,0.159,0.69
Llama-3.1-8B,80,58,0.138,0.367
Llama-3.1-8B-Instruct,0,138,0.11,0.742
Llama-3.3-70B-Instruct,59,79,0.022,0.849
gpt2-medium,5,133,0.129,0.308
OLMo-2-1124-7B,70,68,0.19,0.564
OLMo-2-1124-7B-Instruct,53,85,0.001,0.991


In [11]:
# Export the combined Yes/No table (counts plus probability range) to LaTeX
yes_or_no_table.to_latex(
    "reports/yes_or_no_table.tex",
    index=True,
    float_format="%.2f",
)

## Prepare the majority votes frequency table

In [12]:
# Calculate the majority votes
# For every (title, version, model) group, count how many responses were
# flagged Covered and how many NotCovered; the larger of the two counts is the
# size of the majority for that group.
majority_vote_by_model = combined_df.groupby(
    ['title', 'version', 'model_name'], as_index=False, sort=False
).aggregate(
    {
        'Covered': 'sum',
        'NotCovered': 'sum',
    }
)
majority_vote_by_model["majority_count"] = majority_vote_by_model[['Covered', 'NotCovered']].max(axis=1)

# Frequency table: how often each majority size occurs per model.
# The pivot is still built with margins and the "All" row/column dropped
# afterwards, so the resulting column index is exactly as before.
majority_vote_table_df = (
    majority_vote_by_model[["model_name", "majority_count"]]
    .value_counts()
    .reset_index(name="Frequency")
    .pivot_table(columns=['majority_count'], index="model_name", aggfunc="sum", margins=True)
    .fillna(0)  # (model, majority size) combinations that never occur
    .drop(["All"], axis=0)
    .drop([("Frequency", "All")], axis=1)
)

# Presentation names for the row index and the second column level.
majority_vote_table_df.index = majority_vote_table_df.index.rename('Model')
majority_vote_table_df.columns.names = [None, 'Majority Count']
majority_vote_table_df

Unnamed: 0_level_0,Frequency,Frequency,Frequency,Frequency,Frequency
Majority Count,5,6,7,8,9
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Llama-3.1-8B,40.0,52.0,46.0,0.0,0.0
Llama-3.1-8B-Instruct,6.0,39.0,59.0,31.0,3.0
Llama-3.2-1B,12.0,57.0,69.0,0.0,0.0
Llama-3.2-1B-Instruct,129.0,9.0,0.0,0.0,0.0
Llama-3.2-3B,95.0,40.0,3.0,0.0,0.0
Llama-3.2-3B-Instruct,75.0,48.0,15.0,0.0,0.0
Llama-3.3-70B-Instruct,25.0,33.0,78.0,2.0,0.0
Ministral-8B-Instruct-2410,24.0,65.0,30.0,18.0,1.0
OLMo-2-1124-7B,46.0,65.0,20.0,7.0,0.0
OLMo-2-1124-7B-Instruct,51.0,57.0,30.0,0.0,0.0


In [13]:
# Export the majority-size frequency table; frequencies are printed without decimals
majority_vote_table_df.to_latex(
    "reports/majority-votes-freq-table.tex",
    float_format="%.0f",
)

## Minority responses

In [14]:
# Minority responses
# Label every response with the judgment it expresses; rows flagged as neither
# Covered nor NotCovered keep a missing judgment.
combined_df.loc[combined_df.Covered == True, "judgment"] = "Covered"
combined_df.loc[combined_df.NotCovered == True, "judgment"] = "Not Covered"

# Majority judgment per (title, version, model) group; ties count as "Covered".
majority_vote_by_model["majority"] = np.where(
    majority_vote_by_model["Covered"] >= majority_vote_by_model["NotCovered"],
    "Covered",
    "Not Covered",
)

# Look up each response's group majority with a single left join instead of
# rebuilding three full-length boolean masks for every group. The left join
# keeps the rows of combined_df in order, and validate= guarantees that no row
# is duplicated, so the result can be re-labelled with combined_df's index.
row_majority = (
    combined_df[["title", "version", "model_name"]]
    .merge(
        majority_vote_by_model[["title", "version", "model_name", "majority"]],
        on=["title", "version", "model_name"],
        how="left",
        validate="many_to_one",
    )["majority"]
    .set_axis(combined_df.index)
)

# A response is in the minority when its judgment differs from its group's
# majority. As before, a missing judgment counts as differing, and rows that
# belong to no group stay False.
combined_df["in_minority"] = row_majority.notna() & combined_df["judgment"].ne(row_majority)

combined_df.in_minority.sum(), combined_df.shape[0]

## Minority responses by question variants for each model

(5565, 17388)

In [15]:
# Keep only the responses that disagree with their group's majority judgment
minority_responses = combined_df.loc[combined_df["in_minority"].eq(True)]

In [16]:
# Minority responses per model (rows) and question variant (columns).
# Each (model, variant) pair occurs once after value_counts(), so the pivot's
# default aggregation just carries the count through (as a float).
minority_pivot = (
    minority_responses[["model_name", "prompt_type"]]
    .value_counts()
    .reset_index()
    .pivot_table(index="model_name", columns="prompt_type")
    .fillna(0)  # pairs without any minority response
)
minority_pivot

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count
prompt_type,agreement,agreement_negation,disagreement,disagreement_negation,negation,no_or_yes,options,options_flipped,yes_or_no
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Llama-3.1-8B,21.0,101.0,112.0,37.0,29.0,16.0,33.0,30.0,29.0
Llama-3.1-8B-Instruct,5.0,29.0,58.0,74.0,32.0,1.0,1.0,89.0,1.0
Llama-3.2-1B,0.0,138.0,138.0,0.0,0.0,16.0,48.0,17.0,0.0
Llama-3.2-1B-Instruct,13.0,125.0,125.0,13.0,0.0,125.0,116.0,13.0,13.0
Llama-3.2-3B,123.0,137.0,104.0,2.0,1.0,1.0,127.0,1.0,10.0
Llama-3.2-3B-Instruct,47.0,74.0,74.0,64.0,41.0,29.0,65.0,57.0,23.0
Llama-3.3-70B-Instruct,14.0,57.0,46.0,89.0,99.0,11.0,19.0,18.0,4.0
Ministral-8B-Instruct-2410,10.0,48.0,75.0,70.0,61.0,7.0,72.0,17.0,9.0
OLMo-2-1124-7B,68.0,63.0,65.0,61.0,27.0,21.0,44.0,66.0,11.0
OLMo-2-1124-7B-Instruct,34.0,68.0,106.0,70.0,48.0,9.0,26.0,57.0,17.0


## Minority responses by question variants

In [17]:
# Minority responses
# The "judgment" and "in_minority" columns of combined_df and the "majority"
# column of majority_vote_by_model were already computed from the same inputs
# in the "Minority responses" section above, so that computation is not
# repeated here; this cell only shows the summary again.
combined_df.in_minority.sum(), combined_df.shape[0]

(5565, 17388)

In [18]:
# Minority responses pooled over all models, counted per question variant
minority_responses = combined_df.loc[combined_df["in_minority"].eq(True)]
minority_responses[["prompt_type"]].value_counts().reset_index()

Unnamed: 0,prompt_type,count
0,disagreement,1185
1,agreement_negation,947
2,disagreement_negation,845
3,options,713
4,agreement,486
5,options_flipped,476
6,negation,475
7,no_or_yes,261
8,yes_or_no,177


In [19]:
# Share of all minority responses that each question variant accounts for
minority_responses[["prompt_type"]].value_counts().div(minority_responses.shape[0]).reset_index()

Unnamed: 0,prompt_type,count
0,disagreement,0.212938
1,agreement_negation,0.170171
2,disagreement_negation,0.151842
3,options,0.128122
4,agreement,0.087332
5,options_flipped,0.085535
6,negation,0.085355
7,no_or_yes,0.0469
8,yes_or_no,0.031806


In [20]:
# Counts and proportions of minority responses per question variant, side by side
minority_counts = (
    minority_responses[["prompt_type"]]
    .value_counts()
    .reset_index()
    .set_index("prompt_type")
    .set_axis(['Count'], axis=1)
)
# Proportions are taken over all minority responses.
minority_props = minority_counts.div(minority_responses.shape[0]).set_axis(['Proportion'], axis=1)
pd.concat([minority_counts, minority_props], axis=1, join='inner')

Unnamed: 0_level_0,Count,Proportion
prompt_type,Unnamed: 1_level_1,Unnamed: 2_level_1
disagreement,1185,0.212938
agreement_negation,947,0.170171
disagreement_negation,845,0.151842
options,713,0.128122
agreement,486,0.087332
options_flipped,476,0.085535
negation,475,0.085355
no_or_yes,261,0.0469
yes_or_no,177,0.031806


In [21]:
# Sanity check: column labels of the combined count/proportion table
pd.concat(
    [minority_counts, minority_props],
    axis=1,
    join='inner',
).columns

Index(['Count', 'Proportion'], dtype='object')

## Make the table of Jensen-Shannon distances for the non-Yes/No question variants

In [22]:
# Divergence measures computed by the project helper from the combined
# results; the displayed sample shows js_dist and kl_div per
# (title, version, model, prompt_type).
divergences = op.calculate_relative_measures(combined_df)
# random_state is pinned so Restart & Run All shows the same row every time.
divergences.sample(n=1, random_state=42)

Unnamed: 0,title,version,model_name,prompt_type,js_dist,kl_div
10075,Escape of Water II,controversial,OLMo-2-1124-7B-Instruct,agreement_negation,0.056223,0.011105


In [23]:
# Distribution of Jensen-Shannon distances for the GPT-4 rows only.
# NOTE(review): the recorded output lists many `inf` distances for this
# model -- confirm how they should be treated before averaging js_dist.
gpt4_divergences = divergences.loc[divergences["model_name"].eq("openai-gpt-4")]
gpt4_divergences["js_dist"].value_counts()

js_dist
inf         443
0.044400      1
0.797847      1
0.015075      1
0.832481      1
           ... 
0.000320      1
0.000182      1
0.464500      1
0.464500      1
0.464499      1
Name: count, Length: 662, dtype: int64

In [24]:
# Mean and standard deviation of the Jensen-Shannon distance for every
# (model, question variant) pair.
max_distance_variant_for_model = divergences.groupby(
    ['model_name', 'prompt_type'], as_index=False, sort=False
).aggregate(
    Mean=pd.NamedAgg('js_dist', "mean"),
    Std=pd.NamedAgg('js_dist', "std"),
)

# Keep, for each model, the variant with the largest mean distance.
# idxmax() on the grouped "Mean" column returns the row label of each group's
# maximum, so the rows are selected directly with .loc. This avoids
# groupby(...).apply(...) running on the grouping column and keeps Mean/Std
# as float columns.
max_distance_variant_for_model = (
    max_distance_variant_for_model
    .loc[max_distance_variant_for_model.groupby('model_name', sort=False)["Mean"].idxmax()]
    .reset_index(drop=True)
    .rename(columns={'model_name': 'Model', 'prompt_type': 'Variant'})
)

# Short display labels for the report; only the Variant column is relabelled.
# NOTE(review): "agreement", "disagreement" and "yes_or_no" have no short
# label here and would be shown verbatim if one of them were a model's maximum.
max_distance_variant_for_model["Variant"] = max_distance_variant_for_model["Variant"].replace(
    {
        "agreement_negation": "Agr. w/ Neg.",
        "no_or_yes": "N/Y",
        "disagreement_negation": "Disagr. w/ Neg.",
        "options_flipped": "Options F.",
        "options": "Options",
        "negation": "Negation",
    }
)
max_distance_variant_for_model

  ).groupby(['model_name'], as_index=False, sort=False).apply(lambda x: x.loc[x["Mean"].idxmax(), :])


Unnamed: 0,Model,Variant,Mean,Std
0,Llama-3.2-1B,Agr. w/ Neg.,0.239636,0.03391
1,Llama-3.2-1B-Instruct,N/Y,0.256461,0.032199
2,Llama-3.2-3B,Agr. w/ Neg.,0.190244,0.05674
3,Llama-3.2-3B-Instruct,Options F.,0.295296,0.048751
4,Llama-3.1-8B,Options F.,0.1138,0.054546
5,Llama-3.1-8B-Instruct,Options F.,0.335724,0.027631
6,Llama-3.3-70B-Instruct,Negation,0.33564,0.168559
7,gpt2-medium,Disagr. w/ Neg.,0.166641,0.019487
8,OLMo-2-1124-7B,Disagr. w/ Neg.,0.373452,0.049581
9,OLMo-2-1124-7B-Instruct,Options F.,0.533304,0.115831


In [25]:
# Export the per-model maximum-distance table without the row index
max_distance_variant_for_model.to_latex(
    "reports/robustness-prompt-type-distance.tex",
    float_format="%0.2f",
    index=False,
)