In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import output_processing as op
from functools import reduce

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifiers of the prompt formulations referenced by this notebook,
# listed one per line so additions/removals diff cleanly.
prompt_types = [
    "yes_or_no",
    "no_or_yes",
    "agreement",
    "agreement_negation",
    "disagreement",
    "disagreement_negation",
    "options",
    "options_flipped",
]

In [8]:
# Models whose saved results are analysed. Each name is interpolated into the
# CSV path below, so the "org/model" slash becomes a sub-directory.
models = ["meta-llama/Llama-3.1-8B-Instruct", "google/gemma-7b-it"]

# Directory holding the per-model "<model_name>-results.csv" files.
RESULTS_DIR = "runs_05_15"

# Per-model prompt-type divergences. Deliberately left as a list here: a later
# cell concatenates it into a single DataFrame.
prompt_divergences_across_models = []

# Per-model missing-probability summaries, concatenated once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
missing_probs_per_model = []

for model_name in models:
    model_results = pd.read_csv(f"{RESULTS_DIR}/{model_name}-results.csv")
    model_results = op.organize_distribution(model_results)

    missing_probs = op.summarize_missing_probs(model_results)
    missing_probs["model"] = model_name
    # NOTE(review): the rendered summary has a single column labelled 0, which
    # suggests summarize_missing_probs returns a Series — TODO confirm.
    # Series parts are turned into one-column frames so the final summary is a
    # DataFrame with column 0, as it was when seeded from an empty DataFrame.
    if isinstance(missing_probs, pd.Series):
        missing_probs = missing_probs.to_frame(name=0)
    missing_probs_per_model.append(missing_probs)

    divergences = op.get_divergences(model_results)

    prompt_divergences = op.get_divergences_for_prompt_type(divergences)
    prompt_divergences["model"] = model_name
    print(prompt_divergences)

    prompt_divergences_across_models.append(prompt_divergences)

# pd.concat raises ValueError on an empty list; fall back to an empty frame so
# an empty `models` list still yields the same (empty) summary as before.
missing_probs_summary = (
    pd.concat(missing_probs_per_model) if missing_probs_per_model else pd.DataFrame()
)

                       mean_divergence                             model
prompt_type                                                             
no_or_yes                     0.000673  meta-llama/Llama-3.1-8B-Instruct
agreement                     0.005827  meta-llama/Llama-3.1-8B-Instruct
agreement_negation            0.007292  meta-llama/Llama-3.1-8B-Instruct
disagreement                  0.008161  meta-llama/Llama-3.1-8B-Instruct
disagreement_negation         0.009153  meta-llama/Llama-3.1-8B-Instruct
options                       0.648216  meta-llama/Llama-3.1-8B-Instruct
options_flipped               0.648216  meta-llama/Llama-3.1-8B-Instruct
                       mean_divergence               model
prompt_type                                               
no_or_yes                          inf  google/gemma-7b-it
agreement                          inf  google/gemma-7b-it
agreement_negation            0.000041  google/gemma-7b-it
disagreement                  0.000142  google/g

In [9]:
# Display the missing-probability summary assembled in the per-model loop
# (each model's summary is tagged with a "model" entry before being stacked).
missing_probs_summary

Unnamed: 0,0
Yes_prob,5
No_prob,2
A_prob,0
B_prob,0
model,meta-llama/Llama-3.1-8B-Instruct
Yes_prob,318
No_prob,409
A_prob,0
B_prob,0
model,google/gemma-7b-it


In [10]:
# Stack the per-model divergence frames into one DataFrame.
# This cell rebinds the name from a list to a DataFrame, so without the guard a
# second run would call pd.concat on a DataFrame and raise TypeError. With the
# guard, re-running the cell is a no-op.
if not isinstance(prompt_divergences_across_models, pd.DataFrame):
    prompt_divergences_across_models = pd.concat(prompt_divergences_across_models)

In [6]:
# Wide view for side-by-side comparison: one row per model, one column per
# prompt type. reset_index() turns the prompt_type index into a regular column
# so it can be used as the pivot's column key; sort=False is passed through to
# pivot_table unchanged.
(
    prompt_divergences_across_models
    .reset_index()
    .pivot_table(
        index="model",
        columns=["prompt_type"],
        sort=False,
    )
)

Unnamed: 0_level_0,mean_divergence,mean_divergence,mean_divergence,mean_divergence,mean_divergence,mean_divergence,mean_divergence
prompt_type,no_or_yes,agreement,agreement_negation,disagreement,disagreement_negation,options,options_flipped
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
meta-llama/Llama-3.1-8B-Instruct,0.000673,0.005827,0.007292,0.008161,0.009153,0.648216,0.648216
google/gemma-7b-it,inf,inf,4.1e-05,0.000142,inf,0.666163,0.666163
