In [1]:
import pandas as pd

In [2]:
# Notebook-wide configuration lists.

# Dataset identifiers; the per-dataset breakdown iterates over these and
# matches them against the `dataset` column of the results frame.
ALL_DATASETS = [
    "arc",
    "hellaswag",
    "mmlu",
    "truthfulqa",
    "winogrande",
]

# Model identifiers (presumably the same spellings as the `model` column of
# the results CSV -- confirm against the script that writes it).
ALL_MODELS = [
    "Falcon-7b",
    "Falcon-40b",
    "gpt-3.5-turbo",
    "gpt-4-turbo",
    "Llama-7b",
    "Llama-13b",
    "Llama-70b",
    "Llama3-8b",
    "Llama3-70b",
    "Mistral",
    "Mixtral",
    "Solar",
    "Yi-6b",
    "Yi-34b",
]

# Prompt variants (presumably two phrasings per dataset -- confirm).
ALL_PROMPTS = [
    "first_prompt",
    "second_prompt",
]

# Raw names of the score types as they appear in the `value` column before
# it is relabelled for display.
ALL_VALUES = [
    "raw_logits",
    "norm_logits",
]

## Mann-Whitney

In [3]:
def _display_label(value_name):
    """Translate a raw `value` entry into the label shown in the tables.

    "raw_logits" becomes "max logit"; every other entry (including anything
    unexpected) becomes "MSP".
    """
    if value_name == "raw_logits":
        return "max logit"
    return "MSP"


# Load the Mann-Whitney results and swap the raw score names for readable
# labels, so every table below is keyed by "max logit" / "MSP".
mw_results = pd.read_csv("./stat_tests_output/mann_whitney.csv")
mw_results["value"] = mw_results["value"].map(_display_label)

In [4]:
# Worst-case (largest) p-value per model and score type. Taking the max over
# all rows of a (model, value) pair means a small entry here implies every
# individual test for that pair was at least that significant.
print("Across all datasets and prompts")
max_p_by_model = mw_results.pivot_table(
    values="p_value",
    index=["model"],
    columns=["value"],
    aggfunc="max",
)
max_p_by_model

Across all datasets and prompts


value,MSP,max logit
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Falcon-40b,0.3031019,0.9657384
Falcon-7b,0.9380454,0.9908665
Llama-13b,0.04574468,0.05745981
Llama-70b,3.717172e-13,0.04643834
Llama-7b,0.8049005,0.4232695
Llama3-70b,3.3898970000000002e-40,1.7358620000000002e-18
Llama3-8b,3.035213e-16,4.083141e-10
Mistral,4.045435e-06,5.243064e-06
Mixtral,1.046939e-05,5.536093e-08
Solar,0.004236637,1.947827e-07


In [5]:
# Flag each test result against two significance thresholds.
mw_results["small_p"] = mw_results["p_value"] < 1e-4   # strict threshold
mw_results["medium_p"] = mw_results["p_value"] < 1e-2  # looser threshold

# Count, per (model, value) pair, how many tests pass the strict threshold.
# Select `small_p` *before* aggregating: summing the whole frame first would
# also try to "sum" the text columns (string concatenation, or a
# warning/TypeError depending on the pandas version) only to throw those
# results away.
mw_results.groupby(["model", "value"])["small_p"].sum()

model          value    
Falcon-40b     MSP           7
               max logit     4
Falcon-7b      MSP           1
               max logit     0
Llama-13b      MSP           6
               max logit     6
Llama-70b      MSP          10
               max logit     8
Llama-7b       MSP           6
               max logit     5
Llama3-70b     MSP          10
               max logit    10
Llama3-8b      MSP          10
               max logit    10
Mistral        MSP          10
               max logit    10
Mixtral        MSP          10
               max logit    10
Solar          MSP           9
               max logit    10
Yi-34b         MSP          10
               max logit    10
Yi-6b          MSP          10
               max logit    10
gpt-3.5-turbo  MSP          10
gpt-4-turbo    MSP          10
Name: small_p, dtype: int64

In [7]:
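# NOTE(review): disabled and stale. The `value` column is relabelled to
# "MSP" / "max logit" when the CSV is loaded, so the pivot below would have no
# "norm_logits" / "raw_logits" columns to round. Update the column names
# before re-enabling, or remove this cell.
# for dataset in ALL_DATASETS: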
#     print(dataset)
#     pivot = pd.pivot_table(mw_results[mw_results["dataset"] == dataset],
#                    index = ["model"],
#                    columns = ["value"],
#                    values = "p_value",
#                    aggfunc = "max")
#     pivot["norm_logits"] = round(pivot["norm_logits"], 3)
#     pivot["raw_logits"] = round(pivot["raw_logits"], 3)
#     display(pivot)
#     print()

In [6]:
# Per-dataset breakdown: for each dataset, count how many tests per
# (model, value) pair fall below the strict p-value threshold (`small_p`).
for dataset in ALL_DATASETS:
    print(dataset)
    section = mw_results[mw_results["dataset"] == dataset]
    # Pick the flag column before summing so the text columns are never
    # aggregated (avoids string concatenation or errors on non-numeric data).
    display(section.groupby(["model", "value"])["small_p"].sum())
    print()

arc


model          value    
Falcon-40b     MSP          2
               max logit    1
Falcon-7b      MSP          0
               max logit    0
Llama-13b      MSP          2
               max logit    2
Llama-70b      MSP          2
               max logit    2
Llama-7b       MSP          2
               max logit    2
Llama3-70b     MSP          2
               max logit    2
Llama3-8b      MSP          2
               max logit    2
Mistral        MSP          2
               max logit    2
Mixtral        MSP          2
               max logit    2
Solar          MSP          2
               max logit    2
Yi-34b         MSP          2
               max logit    2
Yi-6b          MSP          2
               max logit    2
gpt-3.5-turbo  MSP          2
gpt-4-turbo    MSP          2
Name: small_p, dtype: int64


hellaswag


model          value    
Falcon-40b     MSP          2
               max logit    2
Falcon-7b      MSP          0
               max logit    0
Llama-13b      MSP          2
               max logit    2
Llama-70b      MSP          2
               max logit    2
Llama-7b       MSP          2
               max logit    1
Llama3-70b     MSP          2
               max logit    2
Llama3-8b      MSP          2
               max logit    2
Mistral        MSP          2
               max logit    2
Mixtral        MSP          2
               max logit    2
Solar          MSP          2
               max logit    2
Yi-34b         MSP          2
               max logit    2
Yi-6b          MSP          2
               max logit    2
gpt-3.5-turbo  MSP          2
gpt-4-turbo    MSP          2
Name: small_p, dtype: int64


mmlu


model          value    
Falcon-40b     MSP          2
               max logit    1
Falcon-7b      MSP          0
               max logit    0
Llama-13b      MSP          2
               max logit    2
Llama-70b      MSP          2
               max logit    2
Llama-7b       MSP          2
               max logit    2
Llama3-70b     MSP          2
               max logit    2
Llama3-8b      MSP          2
               max logit    2
Mistral        MSP          2
               max logit    2
Mixtral        MSP          2
               max logit    2
Solar          MSP          2
               max logit    2
Yi-34b         MSP          2
               max logit    2
Yi-6b          MSP          2
               max logit    2
gpt-3.5-turbo  MSP          2
gpt-4-turbo    MSP          2
Name: small_p, dtype: int64


truthfulqa


model          value    
Falcon-40b     MSP          1
               max logit    0
Falcon-7b      MSP          1
               max logit    0
Llama-13b      MSP          0
               max logit    0
Llama-70b      MSP          2
               max logit    2
Llama-7b       MSP          0
               max logit    0
Llama3-70b     MSP          2
               max logit    2
Llama3-8b      MSP          2
               max logit    2
Mistral        MSP          2
               max logit    2
Mixtral        MSP          2
               max logit    2
Solar          MSP          1
               max logit    2
Yi-34b         MSP          2
               max logit    2
Yi-6b          MSP          2
               max logit    2
gpt-3.5-turbo  MSP          2
gpt-4-turbo    MSP          2
Name: small_p, dtype: int64


winogrande


model          value    
Falcon-40b     MSP          0
               max logit    0
Falcon-7b      MSP          0
               max logit    0
Llama-13b      MSP          0
               max logit    0
Llama-70b      MSP          2
               max logit    0
Llama-7b       MSP          0
               max logit    0
Llama3-70b     MSP          2
               max logit    2
Llama3-8b      MSP          2
               max logit    2
Mistral        MSP          2
               max logit    2
Mixtral        MSP          2
               max logit    2
Solar          MSP          2
               max logit    2
Yi-34b         MSP          2
               max logit    2
Yi-6b          MSP          2
               max logit    2
gpt-3.5-turbo  MSP          2
gpt-4-turbo    MSP          2
Name: small_p, dtype: int64


