# Analysis of Model judgments for robustness to question variation

In [1]:
import numpy as np
import pandas as pd

import output_processing as op

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_list = [
    # Main set of models with instruct divide and size variety
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    # Small reference model - would allow for pretraining variation
    "gpt2-medium",
    # Other open models
    "allenai/OLMo-2-1124-7B",
    "allenai/OLMo-2-1124-7B-Instruct",
    "mistralai/Ministral-8B-Instruct-2410",
    "google/gemma-7b",
    "google/gemma-7b-it",
    # Large platform model
    "openai-gpt-4",
]

def read_and_organize_model_results(model_name):
    model_results = pd.read_csv(f"runs/results_09_29/{model_name}-results.csv")
    model_results.replace([0.0], -65504, inplace=True)
    model_results = op.organize_distribution(model_results)
    model_results["model_name"] = model_name.split("/")[-1]
    model_results.loc[model_results["Covered"] == True, "Judgment"] = "Covered"
    model_results.loc[model_results["Covered"] == True, "Judgment_prob"] = model_results["Covered_prob"]
    model_results.loc[model_results["NotCovered"] == True, "Judgment"] = "NotCovered"
    model_results.loc[model_results["NotCovered"] == True, "Judgment_prob"] = model_results["NotCovered_prob"]
    return model_results

In [3]:
# TODO organize the output for better clarity
combined_results = [read_and_organize_model_results(model_name) for model_name in model_list]
combined_df = pd.concat(combined_results).reset_index()
combined_df.loc[:, "item"] = combined_df["title"] + combined_df["version"]
combined_df.shape

(16146, 28)

In [4]:
# Print summary of the experimental results
print(combined_df.model_name.unique())

['Llama-3.2-1B' 'Llama-3.2-1B-Instruct' 'Llama-3.2-3B'
 'Llama-3.2-3B-Instruct' 'Llama-3.1-8B' 'Llama-3.1-8B-Instruct'
 'gpt2-medium' 'OLMo-2-1124-7B' 'OLMo-2-1124-7B-Instruct'
 'Ministral-8B-Instruct-2410' 'gemma-7b' 'gemma-7b-it' 'openai-gpt-4']


In [5]:
# Looking at one of the result samples
combined_df.sample()

Unnamed: 0,index,title,prompt_type,prompt,version,output,output_text,cum_logprob,YES_probs,Yes_probs,...,UnAff_prob,Covered_prob,NotCovered_prob,Covered,NotCovered,entropy,model_name,Judgment,Judgment_prob,item
15112,208,Personal Accident I,no_or_yes,Gwen's car insurance policy includes coverage ...,unambiguous_covered,Yes,Yes,,-15.260053,4.3e-05,...,1.670954e-08,4.3e-05,1.670954e-08,True,False,0.000476,openai-gpt-4,Covered,4.3e-05,Personal Accident Iunambiguous_covered


## Prepare the table with Categorical counts and Distributional Spread

In [6]:
# Categorical Counts for Yes/No question variant
question_variant_mask = combined_df["prompt_type"] == "yes_or_no"
yes_or_no_df = combined_df[question_variant_mask]
count_labels = yes_or_no_df.groupby('model_name', as_index=False, sort=False).aggregate(
    {
        'Covered': 'sum',
        'NotCovered': 'sum',
    }
)
count_labels = count_labels.rename({'model_name' : 'Model'}, axis=1)
count_labels

Unnamed: 0,Model,Covered,NotCovered
0,Llama-3.2-1B,138,0
1,Llama-3.2-1B-Instruct,138,0
2,Llama-3.2-3B,129,9
3,Llama-3.2-3B-Instruct,51,87
4,Llama-3.1-8B,75,63
5,Llama-3.1-8B-Instruct,0,138
6,gpt2-medium,5,133
7,OLMo-2-1124-7B,73,65
8,OLMo-2-1124-7B-Instruct,53,85
9,Ministral-8B-Instruct-2410,87,51


In [7]:
# Latex table just for categorical counts
count_labels.to_latex("reports/yes_or_no_categorical_counts.tex", index=True, float_format="%.2f")

In [8]:
# Distributional spread for Yes/No question variant
question_variant_mask = combined_df["prompt_type"] == "yes_or_no"
yes_or_no_df = combined_df[question_variant_mask]

# Util function
yes_or_no_judgement_range = yes_or_no_df[['model_name', 'Covered_prob', 'NotCovered_prob']].melt(id_vars='model_name', value_vars=['Covered_prob', 'NotCovered_prob']).groupby('model_name', as_index=False).agg(
    Min = pd.NamedAgg('value', lambda x: np.round(np.min(x), 2)),
    Max = pd.NamedAgg('value', lambda x: np.round(np.max(x), 2)),
    ).rename({"model_name": "Model"}, axis=1)

yes_or_no_judgement_range

Unnamed: 0,Model,Min,Max
0,Llama-3.1-8B,0.12,0.34
1,Llama-3.1-8B-Instruct,0.09,0.7
2,Llama-3.2-1B,0.06,0.27
3,Llama-3.2-1B-Instruct,0.13,0.5
4,Llama-3.2-3B,0.08,0.5
5,Llama-3.2-3B-Instruct,0.15,0.65
6,Ministral-8B-Instruct-2410,0.2,0.58
7,OLMo-2-1124-7B,0.18,0.56
8,OLMo-2-1124-7B-Instruct,0.0,0.99
9,gemma-7b,0.17,0.43


In [9]:
# Separate latex table for yes_or_no judgement range
yes_or_no_judgement_range.to_latex("reports/yes_or_no_distributional_spread.tex", index=True, float_format="%.2f")

In [10]:
# Combined table
# Merge the two tables
yes_or_no_table = pd.concat([count_labels.set_index('Model'), yes_or_no_judgement_range.set_index('Model')], axis=1, join='inner')
yes_or_no_table

Unnamed: 0_level_0,Covered,NotCovered,Min,Max
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Llama-3.2-1B,138,0,0.06,0.27
Llama-3.2-1B-Instruct,138,0,0.13,0.5
Llama-3.2-3B,129,9,0.08,0.5
Llama-3.2-3B-Instruct,51,87,0.15,0.65
Llama-3.1-8B,75,63,0.12,0.34
Llama-3.1-8B-Instruct,0,138,0.09,0.7
gpt2-medium,5,133,0.12,0.3
OLMo-2-1124-7B,73,65,0.18,0.56
OLMo-2-1124-7B-Instruct,53,85,0.0,0.99
Ministral-8B-Instruct-2410,87,51,0.2,0.58


In [11]:
yes_or_no_table.to_latex("reports/yes_or_no_table.tex", index=True, float_format="%.2f")

## Prepare the majority votes frequency table

In [12]:
# Calculate the majority votes
majority_vote_by_model = combined_df.groupby(['title', 'version', 'model_name'], as_index=False, sort=False).aggregate(
     {
        'Covered': 'sum',
        'NotCovered': 'sum',
    }
)
majority_vote_by_model.loc[:, "majority_count"] = majority_vote_by_model[['Covered', 'NotCovered']].max(axis=1)
majority_vote_by_model.shape
majority_vote_table_df = majority_vote_by_model[["model_name", "majority_count"]].value_counts().reset_index(name="Frequency")\
.pivot_table(columns =['majority_count'], index="model_name", aggfunc="sum", margins=True)\

majority_vote_table_df.replace(np.nan, 0, inplace=True)
# Remove the "All" from columns and rows
majority_vote_table_df = majority_vote_table_df.drop(["All"], axis=0)
majority_vote_table_df = majority_vote_table_df.drop([("Frequency", "All")], axis=1)

majority_vote_table_df = majority_vote_table_df.rename({'model_name': 'Model', 'majority_count': 'Majority Count'})
majority_vote_table_df.index = majority_vote_table_df.index.rename('Model')
majority_vote_table_df.columns.names = [None, 'Majority Count']
majority_vote_table_df

Unnamed: 0_level_0,Frequency,Frequency,Frequency,Frequency,Frequency
Majority Count,5,6,7,8,9
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Llama-3.1-8B,40.0,50.0,48.0,0.0,0.0
Llama-3.1-8B-Instruct,8.0,41.0,56.0,29.0,4.0
Llama-3.2-1B,10.0,59.0,69.0,0.0,0.0
Llama-3.2-1B-Instruct,129.0,9.0,0.0,0.0,0.0
Llama-3.2-3B,79.0,55.0,4.0,0.0,0.0
Llama-3.2-3B-Instruct,77.0,46.0,15.0,0.0,0.0
Ministral-8B-Instruct-2410,20.0,67.0,26.0,24.0,1.0
OLMo-2-1124-7B,47.0,64.0,20.0,7.0,0.0
OLMo-2-1124-7B-Instruct,51.0,57.0,30.0,0.0,0.0
gemma-7b,44.0,60.0,29.0,5.0,0.0


In [13]:
majority_vote_table_df.to_latex("reports/majority-votes-freq-table.tex", float_format="%.0f")

## Make the table for Jensen-Shannon Distance for non Yes/No question variant

In [14]:
divergences = op.calculate_relative_measures(combined_df)
divergences.sample()

Unnamed: 0,title,version,model_name,prompt_type,js_dist,kl_div
4669,Garden Plants I,unambiguous_covered,Llama-3.1-8B,disagreement_negation,0.062089,0.015705


In [15]:
max_distance_variant_for_model = divergences.groupby(['model_name', 'prompt_type'], as_index=False, sort=False).aggregate(
    Mean = pd.NamedAgg('js_dist', "mean"),
    Std = pd.NamedAgg('js_dist', "std")
).groupby(['model_name'], as_index=False, sort=False).apply(lambda x: x.loc[x["Mean"].idxmax(), :])
max_distance_variant_for_model = max_distance_variant_for_model.rename({'model_name' : 'Model', 'prompt_type': 'Variant'}, axis=1)
max_distance_variant_for_model = max_distance_variant_for_model.replace(
    {
        "agreement_negation": "Agr. w/ Neg.", 
        "no_or_yes": "N/Y", 
        "disagreement_negation": "Disagr. w/ Neg.", 
        "options_flipped": "Options F.", 
        "options": "Options", 
        "negation": "Negation",
    }
)
max_distance_variant_for_model

  ).groupby(['model_name'], as_index=False, sort=False).apply(lambda x: x.loc[x["Mean"].idxmax(), :])


Unnamed: 0,Model,Variant,Mean,Std
0,Llama-3.2-1B,Agr. w/ Neg.,0.234301,0.033502
1,Llama-3.2-1B-Instruct,N/Y,0.244847,0.031106
2,Llama-3.2-3B,Agr. w/ Neg.,0.185209,0.055686
3,Llama-3.2-3B-Instruct,Options F.,0.260064,0.050909
4,Llama-3.1-8B,Options F.,0.095668,0.051488
5,Llama-3.1-8B-Instruct,Options F.,0.277773,0.027844
6,gpt2-medium,Disagr. w/ Neg.,0.166143,0.019285
7,OLMo-2-1124-7B,Disagr. w/ Neg.,0.38294,0.052281
8,OLMo-2-1124-7B-Instruct,Options F.,0.532485,0.11583
9,Ministral-8B-Instruct-2410,Options,0.250857,0.026262


In [16]:
max_distance_variant_for_model.to_latex("reports/robustness-prompt-type-distance.tex", float_format="%0.2f", index=False)