In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import output_processing as op

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_list = [
    # Main set of models with instruct divide and size variety
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    # Small reference model - would allow for pretraining variation
    "gpt2-medium",
    # Other open models
    "allenai/OLMo-2-1124-7B",
    "allenai/OLMo-2-1124-7B-Instruct",
    "mistralai/Ministral-8B-Instruct-2410",
    "google/gemma-7b",
    "google/gemma-7b-it"
]


def read_and_organize_model_results(model_name):
    model_results = pd.read_csv(f"runs/runs-42_07_16/{model_name}-results.csv")
    model_results.replace([0.0], -65504, inplace=True)
    candidates = ['Yes', 'yes', 'No', 'no', 'A', 'B']
    yes_candidates = ['Yes', 'yes']
    # Raw logprob
    # Sum(p(yi...yb))
    no_candidates = ['No', 'no']
    # Raw logprob
    # Sum(p(yi...yb))
    
    model_results["yes_probs"] = np.sum([model_results[candidate + "_probs"] for candidate in yes_candidates], axis=0)
    model_results["no_probs"] = np.sum([model_results[candidate + "_probs"] for candidate in no_candidates], axis=0)
    
    model_results = op.organize_distribution(model_results)
    model_results["model_name"] = model_name.split("/")[-1]
    model_results.loc[model_results["Covered"] == True, "Judgment"] = "Covered"
    model_results.loc[model_results["Covered"] == True, "Judgment_prob"] = model_results["Covered_prob"]
    model_results.loc[model_results["NotCovered"] == True, "Judgment"] = "NotCovered"
    model_results.loc[model_results["NotCovered"] == True, "Judgment_prob"] = model_results["NotCovered_prob"]
    print(model_name, model_results.output.value_counts())
    return model_results
    
combined_results = [read_and_organize_model_results(model_name) for model_name in model_list]
combined_df = pd.concat(combined_results).reset_index()
combined_df.loc[:, "item"] = combined_df["title"] + ["version"]
combined_df.shape

meta-llama/Llama-3.2-1B output
yes       565
"         256
B         164
A         112
Yes        85
it         47
Harper      7
No          6
Name: count, dtype: int64
meta-llama/Llama-3.2-1B-Instruct output
yes    766
B      267
No     162
        17
\n      11
Yes     10
A        9
Name: count, dtype: int64
meta-llama/Llama-3.2-3B output
yes    844
B      260
no     114
A       14
No       7
C        2
\n       1
Name: count, dtype: int64
meta-llama/Llama-3.2-3B-Instruct output
yes    820
B      259
No      92
no      48
A       17
\n       6
Name: count, dtype: int64
meta-llama/Llama-3.1-8B output
yes    789
B      184
no     177
A       92
Name: count, dtype: int64
meta-llama/Llama-3.1-8B-Instruct output
no     617
yes    272
B      226
No      67
A       50
Yes     10
Name: count, dtype: int64
gpt2-medium output
yes      578
no       250
A        202
"        138
B         69
Genie      4
No         1
Name: count, dtype: int64
allenai/OLMo-2-1124-7B output
yes        556
no      

(14904, 28)

In [3]:
negation_prompt_types = combined_df[combined_df.prompt_type.apply(lambda x: x.strip() in ["negation", "agreement_negation", "disagreement_negation"])]

In [4]:
negation_prompt_types.shape, combined_df.shape

((4968, 28), (14904, 28))

In [5]:
4140/138/3, len(model_list)

(10.0, 12)

In [6]:
negation_prompt_types.to_csv("reports/negation_prompt_types_07_17.csv", index=False)

In [23]:
disagreement_negation_df = combined_df[combined_df.prompt_type == "disagreement_negation"]
disagreement_negation_df.prompt_type.value_counts()

prompt_type
disagreement_negation    1656
Name: count, dtype: int64

In [32]:
for i, row in disagreement_negation_df.sample(10)[["model_name", "output_text", "Covered_prob", "NotCovered_prob", "Yes_probs", "yes_probs", "No_probs", "no_probs"]].iterrows():
    print(row.model_name)
    print(row.output_text)
    print(row.Covered_prob, row.NotCovered_prob, row.Yes_probs, row.yes_probs, row.No_probs, row.no_probs)

Llama-3.2-3B-Instruct
 yes. (Note: This is a tricky question, and the answer may depend on the specific circumstances of the case. However, the general principle is that the insurance policy is not intended to cover accidental damage.)
This question is a great example of a linguistic phenomenon called "semantic ambiguity" or "semantic vagueness."
0.5214711624606773 0.4785288375393226 0.01232002615765332 0.0073706765720277494 0.011305491501882339 0.002863936287326353
OLMo-2-1124-7B
 no.
0.7287483034031065 0.2712516965968936 0.001810532784500259 0.0007772392632380514 0.0006739090674333083 0.0003604254359696236
gemma-7b-it
 no

The text states that "Escape of Water" damage is defined as "damage to buildings and contents caused by a sudden, unforeseen flow of water from an internal source." The text also states that "Belinda lives in a two
0.3775406687981454 0.6224593312018545 0.019082771730287305 0.007314192430949869 0.03146217165563977 0.015484145497829173
gpt2-medium
 yes.
Blake's insur

In [26]:
agreement_negation_df = combined_df[combined_df.prompt_type == "agreement_negation"]
agreement_negation_df.prompt_type.value_counts()

prompt_type
agreement_negation    1656
Name: count, dtype: int64

In [25]:
agreement_negation_df.output.value_counts()

output
yes    884
no     415
No     207
"       89
Yes     58
         2
\n       1
Name: count, dtype: int64

In [29]:
for i, row in agreement_negation_df.sample(10)[["model_name", "output_text", "Covered_prob", "NotCovered_prob"]].iterrows():
    print(row.model_name)
    print(row.output_text)
    print(row.Covered_prob, row.NotCovered_prob)

Llama-3.2-1B
 yes.
0.19930799790404563 0.8006920020959544
Llama-3.1-8B
 yes.
0.4746311777449677 0.5253688222550323
Llama-3.1-8B-Instruct
 yes.  I agree with the statement.  The phrase "under the custody or control of anyone with the policyholder's permission" is broad enough to include a friend who has been given permission
0.4148988457967688 0.5851011542032312
gpt2-medium
 yes. 
The insurance company is not required to provide coverage for "accessory" items, but it is required to provide coverage for "damage." 
The insurance company is required to provide coverage for "damage" to the policy holder's 1) car; or 2) car accessories (while in or on
0.42250463481418826 0.5774953651858117
gemma-7b-it
 no

The text describes a scenario where a gas-powered grill malfunction causes a fire in Fatema's garage. While the text defines "Fire" damage as "damage to the policyholder's property caused by naturally-occurring fire," the cause of the fire in this scenario is not a naturally-occurring fire