# Analysis of Model judgments for robustness to question variation

In [18]:
import numpy as np
import pandas as pd

import output_processing as op

In [19]:
model_list = [
    # Main set of models with instruct divide and size variety
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    # Small reference model - would allow for pretraining variation
    "gpt2-medium",
    # Other open models
    "allenai/OLMo-2-1124-7B",
    "allenai/OLMo-2-1124-7B-Instruct",
    "mistralai/Ministral-8B-Instruct-2410",
    "google/gemma-7b",
    "google/gemma-7b-it",
    # Large platform model
    "openai-gpt-4",
]

def read_and_organize_model_results(model_name):
    model_results = pd.read_csv(f"runs/results_09_29/{model_name}-results.csv")
    model_results.replace([0.0], -65504, inplace=True)
    model_results = op.organize_distribution(model_results)
    model_results["model_name"] = model_name.split("/")[-1]
    model_results.loc[model_results["Covered"] == True, "Judgment"] = "Covered"
    model_results.loc[model_results["Covered"] == True, "Judgment_prob"] = model_results["Covered_prob"]
    model_results.loc[model_results["NotCovered"] == True, "Judgment"] = "NotCovered"
    model_results.loc[model_results["NotCovered"] == True, "Judgment_prob"] = model_results["NotCovered_prob"]
    return model_results

In [20]:
# TODO organize the output for better clarity
combined_results = [read_and_organize_model_results(model_name) for model_name in model_list]
combined_df = pd.concat(combined_results).reset_index()
combined_df.loc[:, "item"] = combined_df["title"] + combined_df["version"]
combined_df.shape

(16146, 28)

In [21]:
# Print summary of the experimental results
print(combined_df.model_name.unique())

['Llama-3.2-1B' 'Llama-3.2-1B-Instruct' 'Llama-3.2-3B'
 'Llama-3.2-3B-Instruct' 'Llama-3.1-8B' 'Llama-3.1-8B-Instruct'
 'gpt2-medium' 'OLMo-2-1124-7B' 'OLMo-2-1124-7B-Instruct'
 'Ministral-8B-Instruct-2410' 'gemma-7b' 'gemma-7b-it' 'openai-gpt-4']


In [22]:
# Looking at one of the result samples
combined_df.sample()

Unnamed: 0,index,title,prompt_type,prompt,version,output,output_text,cum_logprob,YES_probs,Yes_probs,...,UnAff_prob,Covered_prob,NotCovered_prob,Covered,NotCovered,entropy,model_name,Judgment,Judgment_prob,item
235,235,Trace and Access II,no_or_yes,"Jett has insurance that covers ""Trace and Acce...",unambiguous_covered,"""","""No."" The insurance company's definition of ""...",,-5.315608,0.130517,...,0.100291,0.130517,0.100291,True,False,0.698251,Llama-3.2-1B,Covered,0.130517,Trace and Access IIunambiguous_covered


In [25]:
# Categorical Counts for Yes/No question variant
question_variant_mask = combined_df["prompt_type"] == "yes_or_no"
yes_or_no_df = combined_df[question_variant_mask]
count_labels = yes_or_no_df.groupby('model_name', as_index=False, sort=False).aggregate(
    {
        'Covered': 'sum',
        'NotCovered': 'sum',
    }
)
count_labels = count_labels.rename({'model_name' : 'Model'}, axis=1)
count_labels

Unnamed: 0,Model,Covered,NotCovered
0,Llama-3.2-1B,138,0
1,Llama-3.2-1B-Instruct,138,0
2,Llama-3.2-3B,129,9
3,Llama-3.2-3B-Instruct,51,87
4,Llama-3.1-8B,75,63
5,Llama-3.1-8B-Instruct,0,138
6,gpt2-medium,5,133
7,OLMo-2-1124-7B,73,65
8,OLMo-2-1124-7B-Instruct,53,85
9,Ministral-8B-Instruct-2410,87,51


In [28]:
# Latex table just for categorical counts
count_labels.to_latex("reports/yes_or_no_categorical_counts.tex", index=True, float_format="%.2f")

In [24]:
# Distributional spread for Yes/No question variant
question_variant_mask = combined_df["prompt_type"] == "yes_or_no"
yes_or_no_df = combined_df[question_variant_mask]

# Util function
yes_or_no_judgement_range = yes_or_no_df[['model_name', 'Covered_prob', 'NotCovered_prob']].melt(id_vars='model_name', value_vars=['Covered_prob', 'NotCovered_prob']).groupby('model_name', as_index=False).agg(
    Min = pd.NamedAgg('value', lambda x: np.round(np.min(x), 2)),
    Max = pd.NamedAgg('value', lambda x: np.round(np.max(x), 2)),
    ).rename({"model_name": "Model"}, axis=1)

yes_or_no_judgement_range

Unnamed: 0,Model,Min,Max
0,Llama-3.1-8B,0.12,0.34
1,Llama-3.1-8B-Instruct,0.09,0.7
2,Llama-3.2-1B,0.06,0.27
3,Llama-3.2-1B-Instruct,0.13,0.5
4,Llama-3.2-3B,0.08,0.5
5,Llama-3.2-3B-Instruct,0.15,0.65
6,Ministral-8B-Instruct-2410,0.2,0.58
7,OLMo-2-1124-7B,0.18,0.56
8,OLMo-2-1124-7B-Instruct,0.0,0.99
9,gemma-7b,0.17,0.43


In [29]:
# Separate latex table for yes_or_no judgement range
yes_or_no_judgement_range.to_latex("reports/yes_or_no_distributional_spread.tex", index=True, float_format="%.2f")

In [27]:
# Combined table
# Merge the two tables
yes_or_no_table = pd.concat([count_labels.set_index('Model'), yes_or_no_judgement_range.set_index('Model')], axis=1, join='inner')
yes_or_no_table

Unnamed: 0_level_0,Covered,NotCovered,Min,Max
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Llama-3.2-1B,138,0,0.06,0.27
Llama-3.2-1B-Instruct,138,0,0.13,0.5
Llama-3.2-3B,129,9,0.08,0.5
Llama-3.2-3B-Instruct,51,87,0.15,0.65
Llama-3.1-8B,75,63,0.12,0.34
Llama-3.1-8B-Instruct,0,138,0.09,0.7
gpt2-medium,5,133,0.12,0.3
OLMo-2-1124-7B,73,65,0.18,0.56
OLMo-2-1124-7B-Instruct,53,85,0.0,0.99
Ministral-8B-Instruct-2410,87,51,0.2,0.58


In [30]:
yes_or_no_table.to_latex("reports/yes_or_no_table.tex", index=True, float_format="%.2f")

In [None]:
# 