# Import pickle file

In [1]:
import matplotlib.pyplot as plt
import pickle
import torch

import statistics
from matplotlib import pyplot as plt
import numpy as np
from functools import reduce
import io


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that loads embedded torch storages onto the CPU.

    Every lookup is delegated to ``pickle.Unpickler`` except
    ``torch.storage._load_from_bytes``, which is replaced by a loader that
    calls ``torch.load`` with ``map_location='cpu'``.
    """

    def find_class(self, module, name):
        """Resolve ``module.name``, substituting the CPU-mapped torch loader."""
        wants_torch_loader = (
            module == 'torch.storage' and name == '_load_from_bytes'
        )
        if not wants_torch_loader:
            return super().find_class(module, name)

        def load_on_cpu(raw_bytes):
            # Wrap the raw bytes in a file-like object and remap to the CPU.
            return torch.load(io.BytesIO(raw_bytes), map_location='cpu')

        return load_on_cpu

def open_data_pickle(filename):
    """Read the pickle file at ``filename`` through ``CPU_Unpickler``.

    Returns whatever object the file contains.
    """
    # NOTE: unpickling can execute arbitrary code; only open trusted files.
    with open(filename, 'rb') as handle:
        return CPU_Unpickler(handle).load()


def _merge_pair(base, override):
    """Return a new dict with ``override`` deep-merged on top of ``base``."""
    merged = base.copy()
    for key, value in override.items():
        current = merged.get(key)
        if isinstance(current, dict) and isinstance(value, dict):
            # Both sides are dicts: merge their contents recursively.
            merged[key] = _merge_pair(current, value)
        else:
            # Otherwise the overriding value wins outright.
            merged[key] = value
    return merged


def merge_dicts(dict1, dict2, dict3):
    """Deep-merge three dicts, with later arguments taking precedence.

    Nested dicts that appear under the same key are merged recursively; any
    other value is replaced by the value from the later dict. The inputs are
    not modified (the top level and every merged level are copied; values
    that are not merged are shared by reference).

    Unlike the previous three-way recursion, a non-dict value in ``dict3``
    under a key where ``dict1`` and ``dict2`` both hold dicts no longer
    raises ``AttributeError``; it simply overrides the merged dict.

    Args:
        dict1: lowest-priority mapping.
        dict2: mapping merged on top of ``dict1``.
        dict3: highest-priority mapping.

    Returns:
        A new merged dict.
    """
    return _merge_pair(_merge_pair(dict1, dict2), dict3)

def print_dict_keys(dictionary, keys=None):
    """Collect the keys of ``dictionary``, descending into one branch per level.

    All keys of the current level are appended in iteration order. Only the
    first dict-valued entry of each level is descended into, so the result is
    a sample of the structure rather than a full traversal. Despite the name,
    nothing is printed.

    Args:
        dictionary: the (possibly nested) dict to inspect.
        keys: optional list to append to; a fresh list is created when
            omitted (a shared mutable default previously leaked keys between
            calls).

    Returns:
        The list of collected keys.
    """
    if keys is None:
        keys = []
    descended = False
    for key, value in dictionary.items():
        keys.append(key)
        if isinstance(value, dict) and not descended:
            descended = True
            # Pass the accumulator explicitly so nested keys land in it.
            print_dict_keys(value, keys)
    return keys

In [2]:
import plotly.graph_objects as go
import statistics
def visualize_accuracies(data, models, languages, prompts, task):
    """Show grouped bars of mean accuracy per model, overlaid with box plots.

    Only ``languages[0]`` is plotted. For every prompt, ``data[(language,
    prompt)]`` is indexed by model position and each entry is a list of
    accuracies: its rounded mean becomes the bar and the raw values feed a
    box plot. Red lines mark hard-coded null-prompt baselines per model
    position and a dashed black line marks chance (0.5 for ``task == 'SA'``,
    0.33 otherwise). The figure is displayed with ``fig.show()``; nothing is
    returned.
    """
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']

    # Hard-coded baseline accuracies: one single-element list per model position.
    if task == 'SA':
        baselines = {
            ('English', 'null'): [[0.5], [0.535], [0.675], [0.51], [0.765], [0.5]],
            ('German', 'null'): [[0.5], [0.5], [0.54], [0.55], [0.47], [0.5]],
            ('French', 'null'): [[0.54], [0.625], [0.525], [0.585], [0.7], [0.5]],
        }
    else:
        baselines = {
            ('English', 'null'): [[0.37373737373737376], [0.3686868686868687], [0.3838383838383838], [0.3939393939393939], [0.3434343434343434], [0.33]],
            ('German', 'null'): [[0.3181818181818182], [0.3282828282828283], [0.3484848484848485], [0.37373737373737376], [0.3686868686868687], [0.33]],
            ('French', 'null'): [[0.30808080808080807], [0.36363636363636365], [0.3333333333333333], [0.3383838383838384], [0.3333333333333333], [0.33]],
        }

    figure = go.Figure()

    for color_pos, prompt in enumerate(prompts):
        model_labels = list(models)
        per_model_accs = data[(languages[0], prompt)]
        mean_accs = [round(statistics.mean(accs), 2) for accs in per_model_accs]

        # NOTE(review): x repeats the model list len(last entry) times while y
        # has one value per entry of the data list - confirm this is intended.
        bar_trace = go.Bar(
            x=model_labels * len(per_model_accs[-1]),
            y=mean_accs,
            text=mean_accs,
            name=f"{languages[0]} - {prompt}",
            marker={'color': palette[color_pos]},
        )
        figure.add_trace(bar_trace)

        # One box per model, placed on the same x category as its bar.
        for model_pos, _ in enumerate(models):
            box_trace = go.Box(
                x=[model_labels[model_pos]] * len(per_model_accs[-1]),
                y=per_model_accs[model_pos],
                marker={'color': 'rgb(26, 118, 255)'},
                showlegend=False,
            )
            figure.add_trace(box_trace)

        # Short red line per model at its baseline accuracy.
        # NOTE(review): this runs once per prompt, so identical shapes are
        # added repeatedly - confirm intended.
        for model_pos in range(len(models)):
            baseline_value = baselines[(languages[0], 'null')][model_pos][0]
            figure.add_shape(
                type="line",
                x0=model_pos - 0.4,
                x1=model_pos + 0.4,
                y0=baseline_value,
                y1=baseline_value,
                line={'color': "red", 'width': 2},
            )

    # Empty scatter traces exist only to create legend entries for the shapes.
    for legend_color, legend_name in (("red", "Baseline"), ("black", "Chance")):
        figure.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                marker={'color': legend_color},
                showlegend=True,
                name=legend_name,
            )
        )

    # Accuracies live in [0, 1].
    figure.update_yaxes(range=[0, 1])

    # Dashed chance line across all model categories.
    chance_level = 0.5 if task == 'SA' else 0.33
    figure.add_shape(
        type="line",
        x0=-0.5,
        x1=len(models) - 0.5,
        y0=chance_level,
        y1=chance_level,
        line={'color': "black", 'width': 2, 'dash': 'dash'},
    )

    figure.update_layout(
        title={
            'text': f'Accuracies of Models on Different Prompts for {task}, language {languages[0]}',
            'xanchor': 'center',
            'x': 0.5,
            'y': 0.95,
        },
        xaxis={'title': 'Models'},
        yaxis={'title': 'Accuracy'},
        boxmode='group',
        barmode='group',
        legend={'orientation': 'h', 'yanchor': 'top', 'x': 0, 'y': 1.3},
        showlegend=True,
        width=1200,
        height=400,
    )
    figure.show()
    # TODO: save to file=f'Accuracies_v{version}_{fn}.png'



import plotly.graph_objects as go
def visualize_accuracies_no_box_plot(data, models, languages, prompts, task):
    """Show grouped accuracy bars per model and prompt, without box plots.

    Only ``languages[0]`` is plotted. ``data[(language, prompt)]`` is expected
    to hold one number per model position (each value is passed to
    ``round``). A dashed red line trace connects the hard-coded null-prompt
    baselines and a dashed black line marks chance (0.5 for ``task == 'SA'``,
    0.33 otherwise). The figure is displayed with ``fig.show()``; nothing is
    returned.
    """
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']

    # Hard-coded baseline accuracies: one single-element list per model position.
    if task == 'SA':
        baselines = {
            ('English', 'null'): [[0.5], [0.535], [0.675], [0.51], [0.765]],
            ('German', 'null'): [[0.5], [0.5], [0.54], [0.55], [0.47]],
            ('French', 'null'): [[0.54], [0.625], [0.525], [0.585], [0.7]],
        }
    else:
        baselines = {
            ('English', 'null'): [[0.37373737373737376], [0.3686868686868687], [0.3838383838383838], [0.3939393939393939], [0.3434343434343434]],
            ('German', 'null'): [[0.3181818181818182], [0.3282828282828283], [0.3484848484848485], [0.37373737373737376], [0.3686868686868687]],
            ('French', 'null'): [[0.30808080808080807], [0.36363636363636365], [0.3333333333333333], [0.3383838383838384], [0.3333333333333333]],
        }

    figure = go.Figure()

    for color_pos, prompt in enumerate(prompts):
        model_labels = list(models)
        accuracies = data[(languages[0], prompt)]
        bar_labels = [round(acc, 2) for acc in accuracies]  # two-decimal text labels

        figure.add_trace(
            go.Bar(
                x=model_labels,
                y=accuracies,
                text=bar_labels,
                name=f"{languages[0]} - {prompt}",
                marker={'color': palette[color_pos]},
            )
        )

        # Baseline accuracy of every model, drawn as one dashed red line trace.
        baseline_values = []
        for model_pos in range(len(models)):
            baseline_values.append(baselines[(languages[0], 'null')][model_pos][0])
        figure.add_trace(
            go.Scatter(
                x=model_labels,
                y=baseline_values,
                mode='lines',
                line={'color': 'red', 'dash': 'dash'},
            )
        )

    # TODO: add range / box plot to bar plot

    # Accuracies live in [0, 1].
    figure.update_yaxes(range=[0, 1])

    # Dashed chance line across all model categories.
    chance_level = 0.5 if task == 'SA' else 0.33
    figure.add_shape(
        type="line",
        x0=-0.5,
        x1=len(models) - 0.5,
        y0=chance_level,
        y1=chance_level,
        line={'color': "black", 'width': 2, 'dash': 'dash'},
    )

    figure.update_layout(
        title=f'Accuracies of Models on Different Prompts for {task}, language {languages[0]}',
        xaxis={'title': 'Models'},
        yaxis={'title': 'Accuracy'},
        barmode='group',
        legend={'x': 0, 'y': 1.1, 'orientation': 'h'},
        showlegend=True,
    )
    figure.show()
    # TODO: save to file=f'Accuracies_v{version}_{fn}.png'

def get_acc_per_prompt_type(data, model='bloom', task='SA', lang='en', seed='42', prompt_type='active', nli=False):
    """Return the accuracy pooled over all prompt variations of a prompt type.

    The results are looked up at
    ``data[seed][lang][model][task][prompt_type]``, which must map
    ``'prompt_id_0'`` .. ``'prompt_id_{n-1}'`` to per-variation dicts. Each
    of those maps integer sentence ids (starting at 0) to dicts with
    ``'pred'`` and ``'true'`` entries, plus exactly one extra non-sentence
    entry (hence the ``len - 1`` below).

    Args:
        data: nested results dict as described above.
        model, task, lang, seed, prompt_type: keys selecting the results.
        nli: when true, a prediction equal to the literal ``'maybe'`` is
            counted as ``'no'``. This was previously a hard-coded local
            ``False``, which made that branch unreachable.

    Returns:
        Fraction of sentences whose prediction equals the true label.

    Raises:
        KeyError: if any key along the path is missing.
        ZeroDivisionError: if there are no sentences at all.
    """
    prompt_variations = reduce(
        lambda d, k: d[k], [seed, lang, model, task, prompt_type], data)

    correct = 0
    total = 0
    for i in range(len(prompt_variations)):
        sentences = prompt_variations[f'prompt_id_{i}']
        # One entry of the variation dict is not a sentence, so skip it.
        for sen_id in range(len(sentences) - 1):
            sentence = sentences[sen_id]
            pred = sentence['pred']
            if nli and pred == 'maybe':
                pred = 'no'
            correct += pred == sentence['true']
            total += 1

    return correct / total



def get_acc_per_prompt_variation(data, model='bloom', task='SA', lang='en', seed='42', prompt_type='active'):
    """Collect the stored accuracy of each prompt variation of a prompt type.

    Walks ``data[seed][lang][model][task][prompt_type]`` and reads the
    ``'acc'`` entry of ``'prompt_id_0'`` .. ``'prompt_id_{n-1}'``, where
    ``n`` is the number of keys found at that level.

    input:
        data: nested results dict
        model, task, lang, seed, prompt_type: keys selecting the results
    output:
        list with one stored accuracy per prompt variation, in id order
    """
    variations = data
    for step in (seed, lang, model, task, prompt_type):
        variations = variations[step]

    variation_count = len(variations.keys())
    return [variations[f'prompt_id_{idx}']['acc'] for idx in range(variation_count)]



def get_acc_plot(data, languages, models, prompt_types, task, seed, version, fn=get_acc_per_prompt_type, **fn_kwargs):
    """Gather accuracies with ``fn`` and display the matching accuracy plot.

    For every language / model / prompt type combination ``fn`` is called and
    its result appended under the key ``(language name, prompt_type)``, so
    each list is ordered like ``models``. When ``fn`` is
    ``get_acc_per_prompt_type`` the plain bar plot is shown, otherwise the
    bar-plus-box-plot variant.

    Args:
        data: nested results dict passed through to ``fn``.
        languages: language codes; each must be one of 'en', 'de', 'fr'.
        models: model names, in plotting order.
        prompt_types: prompt type names.
        task: task name, forwarded to ``fn`` and to the plot.
        seed: seed key forwarded to ``fn``.
        version: currently unused.
        fn: accuracy function, called with keyword arguments ``model``,
            ``task``, ``lang``, ``seed`` and ``prompt_type``.
        **fn_kwargs: extra keyword arguments forwarded unchanged to ``fn``
            (for example ``nli=True``); ``fn`` must accept them.

    Raises:
        KeyError: if a language code is not in the language map.
    """
    lang_map = {'en': 'English', 'de': 'German', 'fr': 'French'}

    plot_data = {}
    for lang in languages:
        for LM_model in models:
            for prompt_type in prompt_types:
                key = (lang_map[lang], prompt_type)
                plot_data.setdefault(key, []).append(fn(
                    data, model=LM_model, task=task, lang=lang, seed=seed,
                    prompt_type=prompt_type, **fn_kwargs))

    languages_ = [lang_map[lang] for lang in languages]
    if fn == get_acc_per_prompt_type:
        visualize_accuracies_no_box_plot(plot_data, models, languages_, prompt_types, task)
    else:
        visualize_accuracies(plot_data, models, languages_, prompt_types, task)


In [3]:
def yes_no_split(data, key, condition, lang, model, task, prompt_type='active', seed='42'):
    """Count and print how many sentences have ``sentence[key]`` equal to an answer.

    ``condition`` is the English answer name; it is translated to the answer
    word of ``lang`` ('de' and 'fr' have their own words, every other
    language uses the English ones) before comparing. Sentences are read from
    ``data[seed][lang][model][task][prompt_type]['prompt_id_<i>'][<sen_id>]``;
    each variation dict is assumed to contain exactly one non-sentence entry.

    Args:
        data: nested results dict.
        key: sentence field to inspect, e.g. 'true' or 'pred'.
        condition: one of 'yes', 'no', 'maybe'.
        lang, model, task, prompt_type: keys selecting the results.
        seed: seed key (previously hard-coded to '42').

    Returns:
        The number of matching sentences (also printed next to ``condition``).

    Raises:
        KeyError: if ``condition`` is unknown or a key along the path is missing.
    """
    translations = {
        'de': {'yes': 'ja', 'no': 'nein', 'maybe': 'vielleicht'},
        'fr': {'yes': 'oui', 'no': 'non', 'maybe': 'peut-être'},
    }
    answer_map = translations.get(lang, {'yes': 'yes', 'no': 'no', 'maybe': 'maybe'})
    target = answer_map[condition]

    prompt_variations = reduce(
        lambda d, k: d[k], [seed, lang, model, task, prompt_type], data)

    count = 0
    for i in range(len(prompt_variations)):
        sentences = prompt_variations[f'prompt_id_{i}']
        # One entry of the variation dict is not a sentence, so skip it.
        for sen_id in range(len(sentences) - 1):
            if sentences[sen_id][key] == target:
                count += 1

    print(condition, count)
    return count

In [4]:
def get_distribution(data,models, lang, task, prompt_type='active', nli=False):
    """Print label counts once, then prediction counts for every model.

    The true-label counts are taken from the first model's results. The
    answers counted are 'yes' and 'no', plus 'maybe' when ``nli`` is true.
    All counting and printing is delegated to ``yes_no_split``.
    """
    answers = ['yes', 'no', 'maybe'] if nli else ['yes', 'no']

    print(f'-------------Label distribution for model {models[0]} {prompt_type} for {lang}------------')
    for answer in answers:
        yes_no_split(data, 'true', answer, lang, models[0], task, prompt_type=prompt_type)

    for model in models:
        print(f'-------------Prediction distribution for model {model} {prompt_type} for {lang}------------')
        for answer in answers:
            yes_no_split(data, 'pred', answer, lang, model, task, prompt_type=prompt_type)

In [5]:
import plotly.graph_objects as go

def visualize_boxplot(data, box_plot_names, num_plots, ylabel='Difference', title='Comparison of Differences between yes and no probability', file='Box_plots_v1.html', task=None):
    """Show one box plot per entry of ``box_plot_names`` with a chance line.

    Args:
        data: sequence of value lists; ``data[i]`` is plotted (and printed)
            under ``box_plot_names[i]``.
        box_plot_names: labels of the boxes.
        num_plots: tick positions used for the x axis.
        ylabel: y-axis title.
        title: figure title.
        file: output file name; currently unused because writing is disabled.
        task: 'SA' draws the chance line at 0.5, anything else at 0.33. When
            omitted, a module-level ``task`` variable is used if present,
            which is what this function relied on implicitly before.
    """
    if task is None:
        # Backward compatibility with callers that relied on the notebook global.
        task = globals().get('task')

    fig = go.Figure()

    for i, box_name in enumerate(box_plot_names):
        print(data[i])
        fig.add_trace(go.Box(y=data[i], name=box_name))

    fig.update_layout(
        title=title,
        xaxis=dict(
            tickvals=num_plots,
            ticktext=box_plot_names,
            tickangle=45,
            title='Tasks and Sentence Types'
        ),
        yaxis=dict(
            title=ylabel
        ),
        showlegend=False
    )

    # Set y-axis range from 0 to 1
    fig.update_yaxes(range=[0, 1])

    # Dashed chance line spanning every box. The extent was previously taken
    # from an unrelated global `models` list instead of the plotted boxes.
    chance_level = 0.5 if task == 'SA' else 0.33
    fig.add_shape(type="line",
                x0=-0.5,
                x1=len(box_plot_names) - 0.5,
                y0=chance_level,
                y1=chance_level,
                line=dict(color="black", width=2, dash='dash'))

    # fig.write_html(file)
    fig.show()

def get_acc_box_plot(data, lang, model, prompt_types,task, seed, version=None):
    """Box-plot the per-variation accuracies of one model for each prompt type.

    Args:
        data: nested results dict read by ``get_acc_per_prompt_variation``.
        lang, model, task, seed: keys selecting the results.
        prompt_types: prompt types to compare; one box each.
        version: number used in the output file name. When omitted, a
            module-level ``version`` variable is used if present (the
            previous implicit behaviour), otherwise 1.
    """
    if version is None:
        version = globals().get('version', 1)

    plot_data = [
        get_acc_per_prompt_variation(data, model, task, lang, seed, prompt_type)
        for prompt_type in prompt_types
    ]
    print(plot_data)
    num_plots = list(range(len(prompt_types)))
    # Forward the task explicitly so the chance line matches this call.
    visualize_boxplot(plot_data, prompt_types, num_plots, ylabel='Accuracy',title=f'Comparison of accuracy within prompt type on task {task} and lang {lang}',
                         file=f'Box_plots_v{version}.png', task=task)

In [6]:
import numpy as np

def get_acc_table(data, languages, models, prompt_types, task, seed, version, fn=get_acc_per_prompt_type):
    """Gather accuracies with ``fn`` and compute summary statistics for them.

    For every language / model / prompt type combination ``fn`` is called and
    its result appended under the key ``(language name, prompt_type)``, so
    each list is ordered like ``models``. The collected dict is printed and
    then passed to ``calculate_accuracy_stats``.

    Args:
        data: nested results dict passed through to ``fn``.
        languages: language codes; each must be one of 'en', 'de', 'fr'.
        models: model names.
        prompt_types: prompt type names.
        task: task name forwarded to ``fn``.
        seed: seed key forwarded to ``fn``.
        version: currently unused.
        fn: accuracy function, called with keyword arguments ``model``,
            ``task``, ``lang``, ``seed`` and ``prompt_type``.

    Returns:
        The list produced by ``calculate_accuracy_stats`` (previously the
        statistics were computed and then discarded).
    """
    lang_map = {'en': 'English', 'de': 'German', 'fr': 'French'}

    plot_data = {}
    for lang in languages:
        for model in models:
            for prompt_type in prompt_types:
                key = (lang_map[lang], prompt_type)
                plot_data.setdefault(key, []).append(fn(
                    data, model=model, task=task, lang=lang, seed=seed, prompt_type=prompt_type))
    print('plot_data', plot_data)

    return calculate_accuracy_stats(plot_data)

def calculate_accuracy_stats(data):
    """Compute row-wise mean, variance and argmax of each accuracy collection.

    Args:
        data: dict mapping a key to either a list of equal-length accuracy
            lists (one row per list) or a flat list of single accuracies. A
            flat list is treated as one single-value row per entry instead of
            raising an axis error.

    Returns:
        A list of ``(key, mean, variance, best_index)`` tuples in the
        iteration order of ``data``; the last three items are NumPy arrays
        with one element per row.
    """
    model_stats = []

    for key, accuracies in data.items():
        acc_array = np.asarray(accuracies)
        if acc_array.ndim == 1:
            # One accuracy per row: make it a column so axis=1 is valid.
            acc_array = acc_array.reshape(-1, 1)
        mean_acc = np.mean(acc_array, axis=1)
        var_acc = np.var(acc_array, axis=1)
        best_acc_idx = np.argmax(acc_array, axis=1)
        model_stats.append((key, mean_acc, var_acc, best_acc_idx))

    return model_stats



In [7]:
# friedmanchisquare is for testing significance between prompts in same prompt type
from scipy.stats import friedmanchisquare
def get_friedman_test(data, model,task, lang, prompt_type):
    """Run scipy's Friedman chi-square test on per-variation accuracies and print it.

    The accuracies come from ``get_acc_per_prompt_variation`` with seed
    '42'; each accuracy is passed to ``friedmanchisquare`` as its own
    one-element sample. The samples, the score and the p-value are printed.
    """
    # NOTE(review): every sample holds a single observation - confirm this
    # is the intended input layout for the Friedman test.
    accuracies = get_acc_per_prompt_variation(
        data, model=model, task=task, lang=lang, seed='42', prompt_type=prompt_type)

    samples = []
    for accuracy in accuracies:
        samples.append([accuracy])
    print(samples)

    result = friedmanchisquare(*samples)
    score, p_value = result

    print("Friedman chi-square score:", score)
    print("p-value:", p_value)


In [8]:
from scipy import stats

def within_1_class_significance_test(data, model,task, lang,prompt_type1, seed='42', chance=0.5):
    """One-sample t-test of a prompt type's per-variation accuracies against ``chance``.

    Prints the p-value and whether it is below the 0.05 significance level.

    Args:
        data: nested results dict read by ``get_acc_per_prompt_variation``.
        model, task, lang: keys selecting the results.
        prompt_type1: prompt type whose variations are tested.
        seed: seed key; this used to be read from an undeclared global.
        chance: population mean to test against (previously fixed at 0.5).
    """
    accuracies = get_acc_per_prompt_variation(data, model=model, task=task, lang=lang, seed=seed, prompt_type=prompt_type1)

    # One-sample t-test: does the mean accuracy differ from `chance`?
    t_statistic, p_value = stats.ttest_1samp(accuracies, chance)
    # The original printed an undefined name `p_value` here (NameError).
    print("p-value:", p_value)
    alpha = 0.05  # Significance level

    if p_value < alpha:
        print(f"There is a significant difference in the accuracies within the {prompt_type1} class.")
    else:
        print(f"There is no significant difference in the accuracies within the {prompt_type1} class.")


In [9]:
from scipy import stats

def between_2_class_significance_test(data, model,task, lang,prompt_type1, prompt_type2, seed='42'):
    """Mann-Whitney U test between the per-variation accuracies of two prompt types.

    Prints both accuracy lists, the two-sided p-value and whether it is below
    the 0.05 significance level.

    Args:
        data: nested results dict read by ``get_acc_per_prompt_variation``.
        model, task, lang: keys selecting the results.
        prompt_type1, prompt_type2: the two prompt types to compare.
        seed: seed key; this used to be read from an undeclared global.
    """
    active_accuracies = get_acc_per_prompt_variation(data, model=model, task=task, lang=lang, seed=seed, prompt_type=prompt_type1)
    passive_accuracies = get_acc_per_prompt_variation(data, model=model, task=task, lang=lang, seed=seed, prompt_type=prompt_type2)
    print(active_accuracies)
    print(passive_accuracies)

    # The two samples are treated as independent (alternative considered:
    # stats.ttest_ind).
    u_statistic, p_value = stats.mannwhitneyu(active_accuracies, passive_accuracies, alternative='two-sided')

    alpha = 0.05  # Significance level
    print("p-value:", p_value)

    if p_value < alpha:
        print("There is a significant difference between the accuracies of the active and passive classes.")
    else:
        print("There is no significant difference between the accuracies of the active and passive classes.")


In [10]:
from scipy import stats

# active_accuracies = [0.5, 0.475, 0.5, 0.5, 0.5, 0.49, 0.5, 0.5, 0.5, 0.505]
# passive_accuracies = [0.5, 0.49, 0.5, 0.5, 0.5, 0.48, 0.5, 0.5, 0.5, 0.5]

def get_wilcoxon_test(data, model,task, lang,prompt_type1, prompt_type2, seed='42'):
    """Wilcoxon signed-rank test between the accuracies of two prompt types.

    The per-variation accuracy lists are passed to ``stats.wilcoxon`` as
    paired samples, so both prompt types need the same number of variations.
    Prints both lists, the p-value and whether it is below the 0.05
    significance level.

    Args:
        data: nested results dict read by ``get_acc_per_prompt_variation``.
        model, task, lang: keys selecting the results.
        prompt_type1, prompt_type2: the two prompt types to compare.
        seed: seed key; this used to be read from an undeclared global.
    """
    active_accuracies = get_acc_per_prompt_variation(data, model=model, task=task, lang=lang, seed=seed, prompt_type=prompt_type1)
    passive_accuracies = get_acc_per_prompt_variation(data, model=model, task=task, lang=lang, seed=seed, prompt_type=prompt_type2)
    print(active_accuracies)
    print(passive_accuracies)

    # Paired, non-parametric comparison (alternative considered:
    # stats.ttest_rel).
    w_statistic, p_value = stats.wilcoxon(active_accuracies, passive_accuracies)

    alpha = 0.05  # Significance level
    print("p-value:", p_value)

    if p_value < alpha:
        print("There is a significant difference between the accuracies in the same class.")
    else:
        print("There is no significant difference between the accuracies in the same class.")

# Experiments with other versions

In [None]:
# evaluation settings
# Notebook-level configuration. Later cells pass these names explicitly, and
# some helper functions above read `task`, `models`, `seed` and `version`
# as globals, so this cell must run before they are called.
models = ['bloom', 'llama'] 
prompt_types = ['active', 'passive', 'auxiliary',
                    'modal', 'common', 'rare_synonyms', 'identical_modal']
# languages = ['de', 'en', 'fr']
seeds = ['42']

version=15
task = 'SA'
seed = seeds[0]
# lang='de'

### avg of ids for logit

In [None]:
# avg
# Merge the per-language result pickles into one nested dict.
# Note that the file versions differ per language here (en=v15, de=v16, fr=v17).
data = merge_dicts(open_data_pickle('logits_dict_seed_42_lang_en_v15.pickle'),  open_data_pickle('logits_dict_seed_42_lang_de_v16.pickle'),  open_data_pickle('logits_dict_seed_42_lang_fr_v17.pickle'))

In [None]:
# Print label and prediction counts for the SA task, one language at a time.
# This rebinds the notebook globals `task` and `lang`.
task='SA'
lang='en'
get_distribution(data,models, lang, task)
print()
lang='de'
get_distribution(data,models, lang, task)
print()
lang='fr'
get_distribution(data,models, lang, task)

-------------Label distribution for model bloom for en------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for en------------
yes 1528
no 472
-------------Prediction distribution for model llama for en------------
yes 1884
no 116

-------------Label distribution for model bloom for de------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for de------------
yes 1841
no 159
-------------Prediction distribution for model llama for de------------
yes 1994
no 6

-------------Label distribution for model bloom for fr------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for fr------------
yes 1828
no 172
-------------Prediction distribution for model llama for fr------------
yes 1551
no 449


In [None]:
# Accuracy bar plot for the SA task; only the German plot is enabled,
# the English and French calls are commented out.
task='SA'
# print('------------- English Acc plots------------')
# get_acc_plot(data, ['en'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], models, prompt_types,
            task, seed, version, fn=get_acc_per_prompt_type)

# print('------------- French Acc plots------------')
# get_acc_plot(data, ['fr'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

------------- German Acc plots------------


In [None]:
# Print label and prediction counts for the NLI task (including 'maybe'),
# one language at a time. This rebinds the notebook globals `task` and `lang`.
task='NLI'
lang='en'
get_distribution(data,models, lang, task, nli=True)
print()
lang='de'
get_distribution(data,models, lang, task, nli=True)
print()
lang='fr'
get_distribution(data,models, lang, task, nli=True)

-------------Label distribution for model bloom for en------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for en------------
yes 541
no 779
maybe 0
-------------Prediction distribution for model llama for en------------
yes 1268
no 4
maybe 48

-------------Label distribution for model bloom for de------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for de------------
yes 392
no 56
maybe 872
-------------Prediction distribution for model llama for de------------
yes 1320
no 0
maybe 0

-------------Label distribution for model bloom for fr------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for fr------------
yes 837
no 483
maybe 0
-------------Prediction distribution for model llama for fr------------
yes 498
no 620
maybe 202


In [None]:
# Accuracy bar plot for the NLI task; only the German plot is enabled,
# the English and French calls are commented out.
task = 'NLI'
# print('------------- English Acc plots------------')
# get_acc_plot(data, ['en'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], models, prompt_types,
            task, seed, version, fn=get_acc_per_prompt_type)

# print('------------- French Acc plots------------')
# get_acc_plot(data, ['fr'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

------------- German Acc plots------------


In [None]:
# German accuracy plot using the current global `task`, with an extra `nli` flag.
# NOTE(review): confirm that get_acc_plot accepts an `nli` keyword;
# otherwise this call raises TypeError.
print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], models, prompt_types,
            task, seed, version, fn=get_acc_per_prompt_type, nli=True)


------------- German Acc plots------------


### sum of ids for logit

In [None]:
# sum
# Replace `data` with the merged v20 result pickles for en, de and fr.
data = merge_dicts(open_data_pickle('logits_dict_seed_42_lang_en_v20.pickle'),  open_data_pickle('logits_dict_seed_42_lang_de_v20.pickle'),  open_data_pickle('logits_dict_seed_42_lang_fr_v20.pickle'))


In [None]:
# Print label and prediction counts for the SA task, one language at a time.
# This rebinds the notebook globals `task` and `lang`.
task='SA'
lang='en'
get_distribution(data,models, lang, task)
print()
lang='de'
get_distribution(data,models, lang, task)
print()
lang='fr'
get_distribution(data,models, lang, task)

-------------Label distribution for model bloom for en------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for en------------
yes 1528
no 472
-------------Prediction distribution for model llama for en------------
yes 1885
no 115

-------------Label distribution for model bloom for de------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for de------------
yes 887
no 1113
-------------Prediction distribution for model llama for de------------
yes 1965
no 35

-------------Label distribution for model bloom for fr------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for fr------------
yes 1828
no 172
-------------Prediction distribution for model llama for fr------------
yes 1997
no 3


In [None]:
# Accuracy bar plot for the SA task; only the German plot is enabled,
# the English and French calls are commented out.
task='SA'

# print('------------- English Acc plots------------')
# get_acc_plot(data, ['en'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], models, prompt_types,
            task, seed, version, fn=get_acc_per_prompt_type)

# print('------------- French Acc plots------------')
# get_acc_plot(data, ['fr'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

------------- German Acc plots------------


In [None]:
# Print label and prediction counts for the NLI task (including 'maybe'),
# one language at a time. This rebinds the notebook globals `task` and `lang`.
task='NLI'
lang='en'
get_distribution(data,models, lang, task, nli=True)
print()
lang='de'
get_distribution(data,models, lang, task, nli=True)
print()
lang='fr'
get_distribution(data,models, lang, task, nli=True)

-------------Label distribution for model bloom for en------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for en------------
yes 541
no 779
maybe 0
-------------Prediction distribution for model llama for en------------
yes 1269
no 4
maybe 47

-------------Label distribution for model bloom for de------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for de------------
yes 53
no 7
maybe 1260
-------------Prediction distribution for model llama for de------------
yes 952
no 223
maybe 145

-------------Label distribution for model bloom for fr------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for fr------------
yes 837
no 482
maybe 1
-------------Prediction distribution for model llama for fr------------
yes 828
no 14
maybe 478


In [None]:
# Accuracy bar plot for the NLI task; only the German plot is enabled,
# the English and French calls are commented out.
task = 'NLI'
# print('------------- English Acc plots------------')
# get_acc_plot(data, ['en'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], models, prompt_types,
            task, seed, version, fn=get_acc_per_prompt_type)

# print('------------- French Acc plots------------')
# get_acc_plot(data, ['fr'], models, prompt_types,
#             task, seed, version, fn=get_acc_per_prompt_type)

------------- German Acc plots------------


In [None]:
# German accuracy plot using the current global `task`, with an extra `nli` flag.
# NOTE(review): confirm that get_acc_plot accepts an `nli` keyword;
# otherwise this call raises TypeError.
print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], models, prompt_types,
            task, seed, version, fn=get_acc_per_prompt_type, nli=True)

------------- German Acc plots------------


### fixed neutral, sum of ids for logit

In [None]:
# evaluation settings
# Rebinds the notebook-level configuration, now with four models.
# Some helper functions above read `task`, `models`, `seed` and `version`
# as globals, so this cell affects their later calls.
models = ['bloom', 'bloomz', 'flan', 'llama']
prompt_types = ['active', 'passive', 'auxiliary',
                    'modal', 'common', 'rare_synonyms', 'identical_modal']
# languages = ['de', 'en', 'fr']
seeds = ['42']

version=15
task = 'SA'
seed = seeds[0]
# lang='de'

In [None]:
# Override the global `version` and load the matching en/de/fr result
# pickles for that version into `data`.
version=51
data = merge_dicts(open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),  open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),  open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'))


In [None]:
# Inspect the structure of `data` by printing the keys that
# print_dict_keys collects from it.
print(print_dict_keys(data))

['42', 'en', 't0', 'NLI', 'active', 'prompt_id_0', 0, 'yes', 'no', 'maybe', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 'acc', 'prompt_id_1', '

In [None]:
# Print label and prediction counts for the NLI task (including 'maybe'),
# one language at a time. This rebinds the notebook globals `task` and `lang`.
task='NLI'
lang='en'
get_distribution(data,models, lang, task, nli=True)
print()
lang='de'
get_distribution(data,models, lang, task, nli=True)
print()
lang='fr'
get_distribution(data,models, lang, task, nli=True)

-------------Label distribution for model bloom for en------------
yes 660
no 660
maybe 660
-------------Prediction distribution for model bloom for en------------
yes 825
no 1155
maybe 0
-------------Prediction distribution for model bloomz for en------------
yes 971
no 1009
maybe 0
-------------Prediction distribution for model flan for en------------
yes 949
no 1031
maybe 0
-------------Prediction distribution for model llama for en------------
yes 1891
no 11
maybe 78

-------------Label distribution for model bloom for de------------
yes 660
no 660
maybe 660
-------------Prediction distribution for model bloom for de------------
yes 62
no 18
maybe 1900
-------------Prediction distribution for model bloomz for de------------
yes 56
no 132
maybe 1792
-------------Prediction distribution for model flan for de------------
yes 645
no 1246
maybe 89
-------------Prediction distribution for model llama for de------------
yes 1405
no 386
maybe 189

-------------Label distribution for model 

In [None]:
# NLI accuracy plots, one call per language, aggregated per prompt type.
task = 'NLI'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [None]:
# German-only plot with nli=True; `task` is whatever the previous cell set.
banner = '------------- German Acc plots------------'
plot_options = dict(fn=get_acc_per_prompt_type, nli=True)
print(banner)
get_acc_plot(data, ['de'], models, prompt_types, task, seed, version,
             **plot_options)

------------- German Acc plots------------


### max of ids for logit

In [None]:
# max -- load the v21 logits for en/de/fr (in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v21.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [None]:
# SA: print the label / prediction counts for every language in turn.
task = 'SA'
for position, lang in enumerate(('en', 'de', 'fr')):
    if position:
        print()  # blank separator between languages, none after the last
    get_distribution(data, models, lang, task)

-------------Label distribution for model bloom for en------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for en------------
yes 1528
no 472
-------------Prediction distribution for model llama for en------------
yes 1885
no 115

-------------Label distribution for model bloom for de------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for de------------
yes 1654
no 346
-------------Prediction distribution for model llama for de------------
yes 1965
no 35

-------------Label distribution for model bloom for fr------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for fr------------
yes 1828
no 172
-------------Prediction distribution for model llama for fr------------
yes 1997
no 3


In [None]:
# SA accuracy plot per prompt type.  Only German is plotted in this cell;
# add ('... English ...', 'en') / ('... French ...', 'fr') entries to
# re-enable the other languages.
task = 'SA'
for banner, lang_code in (
    ('------------- German Acc plots------------', 'de'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- German Acc plots------------


In [None]:
# NLI: print the label / prediction counts for every language in turn.
task = 'NLI'
for position, lang in enumerate(('en', 'de', 'fr')):
    if position:
        print()  # blank separator between languages, none after the last
    get_distribution(data, models, lang, task, nli=True)

-------------Label distribution for model bloom for en------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for en------------
yes 541
no 779
maybe 0
-------------Prediction distribution for model llama for en------------
yes 1268
no 4
maybe 48

-------------Label distribution for model bloom for de------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for de------------
yes 56
no 4
maybe 1260
-------------Prediction distribution for model llama for de------------
yes 956
no 222
maybe 142

-------------Label distribution for model bloom for fr------------
yes 660
no 660
maybe 0
-------------Prediction distribution for model bloom for fr------------
yes 837
no 482
maybe 1
-------------Prediction distribution for model llama for fr------------
yes 831
no 13
maybe 476


In [None]:
# NLI accuracy plot per prompt type.  Only German is plotted in this cell;
# add ('... English ...', 'en') / ('... French ...', 'fr') entries to
# re-enable the other languages.
task = 'NLI'
for banner, lang_code in (
    ('------------- German Acc plots------------', 'de'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- German Acc plots------------


In [None]:
# German-only plot with nli=True; `task` is whatever the previous cell set.
banner = '------------- German Acc plots------------'
plot_options = dict(fn=get_acc_per_prompt_type, nli=True)
print(banner)
get_acc_plot(data, ['de'], models, prompt_types, task, seed, version,
             **plot_options)

------------- German Acc plots------------


### baseline, no prompt

In [None]:
# evaluation settings for the baseline run (single 'null' prompt type)
version = 56
seeds = ['42']
seed = seeds[0]
task = 'SA'
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = ['null']

# Load the per-language pickles (en, de, fr -- in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v{version}.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [None]:
# Dump the whole merged dict to inspect its nesting
# (seed -> language -> model -> task -> prompt type -> prompt id -> sample).
print(data)

{'42': {'en': {'bloom': {'SA': {'null': {'prompt_id_0': {0: {'yes': 5.4115631087370275e-08, 'no': 9.586956366547383e-07, 'diff': 9.045800197782228e-07, 'pred': 'no', 'true': 'yes'}, 1: {'yes': 5.960129669801972e-07, 'no': 4.808413905266207e-06, 'diff': 4.212401108816266e-06, 'pred': 'no', 'true': 'no'}, 2: {'yes': 9.051509763935428e-09, 'no': 1.74269615627054e-07, 'diff': 1.652181111921891e-07, 'pred': 'no', 'true': 'no'}, 3: {'yes': 1.367636883742307e-07, 'no': 3.3705941859807353e-06, 'diff': 3.2338305118173594e-06, 'pred': 'no', 'true': 'no'}, 4: {'yes': 2.7939481128669286e-07, 'no': 3.0320516088977456e-06, 'diff': 2.752656882876181e-06, 'pred': 'no', 'true': 'yes'}, 5: {'yes': 1.8443738269979804e-07, 'no': 3.7229974623187445e-06, 'diff': 3.5385601222515106e-06, 'pred': 'no', 'true': 'no'}, 6: {'yes': 2.6273227149431477e-07, 'no': 2.196610466853599e-06, 'diff': 1.9338781385158654e-06, 'pred': 'no', 'true': 'yes'}, 7: {'yes': 6.087514492492119e-08, 'no': 1.2213300806251937e-06, 'diff'

In [None]:
# SA distribution for English only, restricted to the 'null' prompt type.
task, lang = 'SA', 'en'
get_distribution(data, models, lang, task, prompt_type='null')

-------------Label distribution for model bloom null for en------------
yes 100
no 100
-------------Prediction distribution for model bloom null for en------------
yes 4
no 196
-------------Prediction distribution for model bloomz null for en------------
yes 11
no 189
-------------Prediction distribution for model flan null for en------------
yes 39
no 161
-------------Prediction distribution for model llama null for en------------
yes 2
no 198
-------------Prediction distribution for model t0 null for en------------
yes 55
no 145


In [None]:
# SA accuracy plots, one call per language, per prompt variation.
task = 'SA'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'null'): [[0.5], [0.535], [0.675], [0.51], [0.765]]}



------------- German Acc plots------------
{('German', 'null'): [[0.5], [0.5], [0.54], [0.55], [0.47]]}



------------- French Acc plots------------
{('French', 'null'): [[0.54], [0.625], [0.525], [0.585], [0.7]]}



In [None]:
# NLI accuracy plots, one call per language, per prompt variation.
task = 'NLI'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'null'): [[0.37373737373737376], [0.3686868686868687], [0.3838383838383838], [0.3939393939393939], [0.3434343434343434]]}
['bloom', 'bloomz', 'flan', 'llama', 't0']



------------- German Acc plots------------
{('German', 'null'): [[0.3181818181818182], [0.3282828282828283], [0.3484848484848485], [0.37373737373737376], [0.3686868686868687]]}
['bloom', 'bloomz', 'flan', 'llama', 't0']



------------- French Acc plots------------
{('French', 'null'): [[0.30808080808080807], [0.36363636363636365], [0.3333333333333333], [0.3383838383838384], [0.3333333333333333]]}
['bloom', 'bloomz', 'flan', 'llama', 't0']



### yes no

In [None]:
# evaluation settings for the v57 run
version = 57
seeds = ['42']
seed = seeds[0]
task = 'SA'
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary',
    'modal', 'common', 'rare_synonyms', 'identical_modal',
]

# Load the per-language pickles (en, de, fr -- in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v{version}.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [None]:
# SA accuracy plots, one call per language, per prompt variation.
task = 'SA'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.565, 0.615, 0.64, 0.555, 0.58, 0.525, 0.615, 0.56, 0.54, 0.535], [0.965, 0.95, 0.965, 0.955, 0.955, 0.965, 0.965, 0.96, 0.95, 0.965], [0.98, 0.975, 0.965, 0.97, 0.975, 0.975, 0.97, 0.975, 0.965, 0.965], [0.67, 0.52, 0.5, 0.515, 0.665, 0.5, 0.5, 0.505, 0.5, 0.515], [0.86, 0.87, 0.87, 0.88, 0.86, 0.87, 0.89, 0.88, 0.895, 0.875]], ('English', 'passive'): [[0.565, 0.575, 0.57, 0.53, 0.525, 0.52, 0.545, 0.555, 0.535, 0.52], [0.955, 0.955, 0.94, 0.955, 0.955, 0.965, 0.96, 0.955, 0.94, 0.965], [0.975, 0.97, 0.975, 0.975, 0.965, 0.98, 0.97, 0.965, 0.97, 0.975], [0.605, 0.51, 0.505, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.885, 0.88, 0.875, 0.895, 0.875, 0.87, 0.87, 0.895, 0.9, 0.885]], ('English', 'auxiliary'): [[0.565, 0.575, 0.64, 0.535, 0.525, 0.615, 0.56, 0.535, 0.55, 0.515], [0.965, 0.955, 0.965, 0.97, 0.965, 0.965, 0.96, 0.965, 0.94, 0.96], [0.975, 0.97, 0.965, 0.975, 0.975, 0.97, 0.98, 0.975, 0.98, 0.97], [0.67, 0.62,

------------- German Acc plots------------
{('German', 'active'): [[0.555, 0.545, 0.55, 0.605, 0.59, 0.585, 0.605, 0.55, 0.55, 0.615], [0.52, 0.5, 0.55, 0.53, 0.585, 0.58, 0.5, 0.55, 0.52, 0.515], [0.5, 0.5, 0.5, 0.5, 0.515, 0.51, 0.5, 0.5, 0.5, 0.52], [0.505, 0.515, 0.5, 0.5, 0.54, 0.505, 0.5, 0.5, 0.505, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'passive'): [[0.53, 0.59, 0.615, 0.58, 0.575, 0.61, 0.58, 0.585, 0.58, 0.6], [0.47, 0.515, 0.52, 0.54, 0.525, 0.65, 0.495, 0.545, 0.59, 0.555], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.505, 0.51, 0.5, 0.505, 0.505, 0.505, 0.5, 0.5, 0.505, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'auxiliary'): [[0.575, 0.58, 0.55, 0.555, 0.585, 0.605, 0.585, 0.615, 0.595, 0.585], [0.515, 0.52, 0.55, 0.515, 0.58, 0.5, 0.495, 0.515, 0.555, 0.585], [0.5, 0.5, 0.505, 0.5, 0.505, 0.5, 0.5, 0.515, 0.515, 0.5], [0.5, 0.5, 0.5, 0.51, 0.505, 0.5, 0.5, 0.5, 0.515, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5,

------------- French Acc plots------------
{('French', 'active'): [[0.49, 0.485, 0.53, 0.51, 0.5, 0.505, 0.52, 0.515, 0.505, 0.52], [0.98, 0.98, 0.965, 0.98, 0.975, 0.975, 0.975, 0.975, 0.975, 0.975], [0.585, 0.515, 0.545, 0.505, 0.535, 0.505, 0.51, 0.5, 0.495, 0.51], [0.51, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.51, 0.5], [0.495, 0.475, 0.51, 0.445, 0.495, 0.46, 0.48, 0.48, 0.475, 0.47]], ('French', 'passive'): [[0.5, 0.505, 0.515, 0.58, 0.555, 0.5, 0.53, 0.515, 0.505, 0.47], [0.98, 0.985, 0.975, 0.98, 0.975, 0.98, 0.98, 0.96, 0.98, 0.98], [0.52, 0.495, 0.5, 0.565, 0.51, 0.505, 0.44, 0.5, 0.495, 0.515], [0.505, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.485, 0.5, 0.495, 0.465, 0.485, 0.44, 0.52, 0.47, 0.495, 0.49]], ('French', 'auxiliary'): [[0.49, 0.47, 0.53, 0.515, 0.505, 0.52, 0.51, 0.52, 0.505, 0.52], [0.98, 0.985, 0.965, 0.985, 0.975, 0.975, 0.97, 0.975, 0.985, 0.98], [0.585, 0.535, 0.535, 0.505, 0.51, 0.505, 0.5, 0.515, 0.52, 0.535], [0.51, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,

In [None]:
# NLI accuracy plots, one call per language, per prompt variation.
task = 'NLI'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.3838383838383838, 0.3383838383838384, 0.3686868686868687, 0.3383838383838384, 0.37373737373737376, 0.3434343434343434, 0.3686868686868687, 0.35353535353535354, 0.3888888888888889, 0.3838383838383838], [0.45454545454545453, 0.4595959595959596, 0.4696969696969697, 0.4494949494949495, 0.5050505050505051, 0.494949494949495, 0.5050505050505051, 0.47474747474747475, 0.4696969696969697, 0.4393939393939394], [0.601010101010101, 0.5959595959595959, 0.6060606060606061, 0.601010101010101, 0.6111111111111112, 0.6060606060606061, 0.601010101010101, 0.601010101010101, 0.5959595959595959, 0.5909090909090909], [0.3333333333333333, 0.3333333333333333, 0.3383838383838384, 0.3484848484848485, 0.35353535353535354, 0.3434343434343434, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333], [0.3333333333333333, 0.3383838383838384, 0.3484848484848485, 0.3484848484848485, 0.3282828282828283, 0.3333333333333333, 0.

------------- German Acc plots------------
{('German', 'active'): [[0.3434343434343434, 0.3383838383838384, 0.3333333333333333, 0.3333333333333333, 0.32323232323232326, 0.29797979797979796, 0.3282828282828283, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333], [0.3686868686868687, 0.3787878787878788, 0.3686868686868687, 0.35353535353535354, 0.3838383838383838, 0.3888888888888889, 0.35353535353535354, 0.3686868686868687, 0.4090909090909091, 0.3888888888888889], [0.3484848484848485, 0.41414141414141414, 0.3888888888888889, 0.3838383838383838, 0.3434343434343434, 0.3434343434343434, 0.3383838383838384, 0.40404040404040403, 0.42424242424242425, 0.41414141414141414], [0.3333333333333333, 0.36363636363636365, 0.32323232323232326, 0.3434343434343434, 0.3383838383838384, 0.3383838383838384, 0.3333333333333333, 0.3333333333333333, 0.3282828282828283, 0.3383838383838384], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.333333333333

------------- French Acc plots------------
{('French', 'active'): [[0.3787878787878788, 0.3434343434343434, 0.35353535353535354, 0.3838383838383838, 0.37373737373737376, 0.37373737373737376, 0.3888888888888889, 0.3383838383838384, 0.35858585858585856, 0.3282828282828283], [0.4797979797979798, 0.45454545454545453, 0.46464646464646464, 0.4595959595959596, 0.51010101010101, 0.4797979797979798, 0.5, 0.4797979797979798, 0.4797979797979798, 0.48484848484848486], [0.3333333333333333, 0.31313131313131315, 0.3383838383838384, 0.2676767676767677, 0.3282828282828283, 0.3333333333333333, 0.3787878787878788, 0.3333333333333333, 0.30808080808080807, 0.3383838383838384], [0.3939393939393939, 0.3282828282828283, 0.35858585858585856, 0.32323232323232326, 0.35858585858585856, 0.36363636363636365, 0.398989898989899, 0.35858585858585856, 0.35353535353535354, 0.3484848484848485], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.33333

### Yes no again

In [None]:
# evaluation settings for the v52 run
version = 52
seeds = ['42']
seed = seeds[0]
task = 'SA'
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary',
    'modal', 'common', 'rare_synonyms', 'identical_modal',
]

# Load the per-language pickles (en, de, fr -- in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v{version}.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [None]:
# task='SA'
# lang='en'
# get_distribution(data,models, lang, task)
# print()
# lang='de'
# get_distribution(data,models, lang, task)
# print()
# lang='fr'
# get_distribution(data,models, lang, task)

In [None]:
# SA accuracy plots for English, German and French (no banner lines are
# printed in this cell).
task = 'SA'
for lang_code in ('en', 'de', 'fr'):
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

In [None]:
# NLI accuracy plots, one call per language, per prompt variation.
task = 'NLI'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


#### Variance of yes no again (box plots)

In [None]:
# evaluation settings for the v52 box plots
version = 52
seeds = ['42']
seed = seeds[0]
task = 'SA'
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary',
    'modal', 'common', 'rare_synonyms', 'identical_modal',
]

# Load the per-language pickles (en, de, fr -- in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v{version}.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [None]:
# One group of box plots per model: English, German, then French.
for model in models:
    print(f'------------- Acc box plots for model {model} ------------')
    for lang_code in ('en', 'de', 'fr'):
        get_acc_box_plot(data, lang_code, model, prompt_types, task, seed)
    print('=========================================================')
    print()

------------- Acc box plots for model bloom ------------



------------- Acc box plots for model bloomz ------------



------------- Acc box plots for model flan ------------



------------- Acc box plots for model llama ------------



------------- Acc box plots for model t0 ------------





### Yes no slightly different prompts SA en and de

In [None]:
# evaluation settings for the v54 run (English and German only)
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = ['active', 'passive']
# languages = ['de', 'en']
seeds = ['42']
task = 'SA'
seed = seeds[0]
# lang='de'

version = 54
# merge_dicts takes three dicts but this run only has two languages.  An
# empty dict fills the third slot; passing the German pickle a second time
# would unpickle the same file twice and produce the same merged result.
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    {},
)


In [None]:
# SA accuracy plots per prompt type, English and German only.
task = 'SA'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)


------------- English Acc plots------------
{('English', 'active'): [0.548, 0.9535, 0.702, 0.5095, 0.7755], ('English', 'passive'): [0.5255, 0.937, 0.6895, 0.5045, 0.7745]}


------------- German Acc plots------------
{('German', 'active'): [0.5505, 0.504, 0.5715, 0.497, 0.4995], ('German', 'passive'): [0.549, 0.498, 0.5435, 0.497, 0.5]}


In [None]:
# SA accuracy plots per prompt variation, English and German only.
task = 'SA'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)


------------- English Acc plots------------
{('English', 'active'): [[0.5, 0.515, 0.55, 0.525, 0.55, 0.58, 0.62, 0.555, 0.535, 0.55], [0.97, 0.96, 0.945, 0.975, 0.94, 0.945, 0.95, 0.955, 0.935, 0.96], [0.96, 0.12, 0.06, 0.965, 0.97, 0.095, 0.96, 0.96, 0.955, 0.975], [0.5, 0.5, 0.495, 0.5, 0.505, 0.5, 0.505, 0.575, 0.51, 0.505], [0.835, 0.625, 0.695, 0.81, 0.85, 0.575, 0.86, 0.855, 0.845, 0.805]], ('English', 'passive'): [[0.505, 0.54, 0.54, 0.5, 0.51, 0.535, 0.535, 0.555, 0.51, 0.525], [0.945, 0.925, 0.915, 0.955, 0.94, 0.94, 0.95, 0.93, 0.93, 0.94], [0.96, 0.055, 0.035, 0.975, 0.97, 0.035, 0.97, 0.965, 0.96, 0.97], [0.5, 0.51, 0.5, 0.5, 0.505, 0.5, 0.505, 0.515, 0.5, 0.51], [0.8, 0.565, 0.63, 0.81, 0.85, 0.69, 0.86, 0.86, 0.845, 0.835]]}
['bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0'

------------- German Acc plots------------
{('German', 'active'): [[0.57, 0.535, 0.545, 0.5, 0.58, 0.535, 0.59, 0.56, 0.55, 0.54], [0.47, 0.49, 0.5, 0.53, 0.48, 0.495, 0.5, 0.49, 0.52, 0.565], [0.615, 0.5, 0.505, 0.655, 0.625, 0.5, 0.605, 0.5, 0.54, 0.67], [0.5, 0.475, 0.5, 0.5, 0.5, 0.49, 0.5, 0.5, 0.5, 0.505], [0.5, 0.495, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'passive'): [[0.56, 0.54, 0.57, 0.515, 0.525, 0.55, 0.57, 0.575, 0.575, 0.51], [0.51, 0.495, 0.5, 0.495, 0.5, 0.49, 0.495, 0.485, 0.5, 0.51], [0.53, 0.5, 0.5, 0.535, 0.65, 0.5, 0.56, 0.505, 0.5, 0.655], [0.5, 0.49, 0.5, 0.5, 0.5, 0.48, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]]}
['bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloomz', 'flan', 'llama', 't0', 'bloom', 'bloom

#### Variance of yes no slightly different (box plots)

In [None]:
# evaluation settings for the v54 box plots (English and German only)
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = ['active', 'passive']
# languages = ['de', 'en']
seeds = ['42']
task = 'SA'
seed = seeds[0]
# lang='de'

version = 54
# merge_dicts takes three dicts but this run only has two languages.  An
# empty dict fills the third slot; passing the German pickle a second time
# would unpickle the same file twice and produce the same merged result.
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    {},
)


In [None]:
# One group of box plots per model: English, then German.
for model in models:
    print(f'------------- Acc box plots for model {model} ------------')
    for lang_code in ('en', 'de'):
        get_acc_box_plot(data, lang_code, model, prompt_types, task, seed)
    print('=========================================================')
    print()

------------- Acc box plots for model bloom ------------
[[0.5, 0.515, 0.55, 0.525, 0.55, 0.58, 0.62, 0.555, 0.535, 0.55], [0.505, 0.54, 0.54, 0.5, 0.51, 0.535, 0.535, 0.555, 0.51, 0.525]]
[0.5, 0.515, 0.55, 0.525, 0.55, 0.58, 0.62, 0.555, 0.535, 0.55]
[0.505, 0.54, 0.54, 0.5, 0.51, 0.535, 0.535, 0.555, 0.51, 0.525]


[[0.57, 0.535, 0.545, 0.5, 0.58, 0.535, 0.59, 0.56, 0.55, 0.54], [0.56, 0.54, 0.57, 0.515, 0.525, 0.55, 0.57, 0.575, 0.575, 0.51]]
[0.57, 0.535, 0.545, 0.5, 0.58, 0.535, 0.59, 0.56, 0.55, 0.54]
[0.56, 0.54, 0.57, 0.515, 0.525, 0.55, 0.57, 0.575, 0.575, 0.51]



------------- Acc box plots for model bloomz ------------
[[0.97, 0.96, 0.945, 0.975, 0.94, 0.945, 0.95, 0.955, 0.935, 0.96], [0.945, 0.925, 0.915, 0.955, 0.94, 0.94, 0.95, 0.93, 0.93, 0.94]]
[0.97, 0.96, 0.945, 0.975, 0.94, 0.945, 0.95, 0.955, 0.935, 0.96]
[0.945, 0.925, 0.915, 0.955, 0.94, 0.94, 0.95, 0.93, 0.93, 0.94]


[[0.47, 0.49, 0.5, 0.53, 0.48, 0.495, 0.5, 0.49, 0.52, 0.565], [0.51, 0.495, 0.5, 0.495, 0.5, 0.49, 0.495, 0.485, 0.5, 0.51]]
[0.47, 0.49, 0.5, 0.53, 0.48, 0.495, 0.5, 0.49, 0.52, 0.565]
[0.51, 0.495, 0.5, 0.495, 0.5, 0.49, 0.495, 0.485, 0.5, 0.51]



------------- Acc box plots for model flan ------------
[[0.96, 0.12, 0.06, 0.965, 0.97, 0.095, 0.96, 0.96, 0.955, 0.975], [0.96, 0.055, 0.035, 0.975, 0.97, 0.035, 0.97, 0.965, 0.96, 0.97]]
[0.96, 0.12, 0.06, 0.965, 0.97, 0.095, 0.96, 0.96, 0.955, 0.975]
[0.96, 0.055, 0.035, 0.975, 0.97, 0.035, 0.97, 0.965, 0.96, 0.97]


[[0.615, 0.5, 0.505, 0.655, 0.625, 0.5, 0.605, 0.5, 0.54, 0.67], [0.53, 0.5, 0.5, 0.535, 0.65, 0.5, 0.56, 0.505, 0.5, 0.655]]
[0.615, 0.5, 0.505, 0.655, 0.625, 0.5, 0.605, 0.5, 0.54, 0.67]
[0.53, 0.5, 0.5, 0.535, 0.65, 0.5, 0.56, 0.505, 0.5, 0.655]



------------- Acc box plots for model llama ------------
[[0.5, 0.5, 0.495, 0.5, 0.505, 0.5, 0.505, 0.575, 0.51, 0.505], [0.5, 0.51, 0.5, 0.5, 0.505, 0.5, 0.505, 0.515, 0.5, 0.51]]
[0.5, 0.5, 0.495, 0.5, 0.505, 0.5, 0.505, 0.575, 0.51, 0.505]
[0.5, 0.51, 0.5, 0.5, 0.505, 0.5, 0.505, 0.515, 0.5, 0.51]


[[0.5, 0.475, 0.5, 0.5, 0.5, 0.49, 0.5, 0.5, 0.5, 0.505], [0.5, 0.49, 0.5, 0.5, 0.5, 0.48, 0.5, 0.5, 0.5, 0.5]]
[0.5, 0.475, 0.5, 0.5, 0.5, 0.49, 0.5, 0.5, 0.5, 0.505]
[0.5, 0.49, 0.5, 0.5, 0.5, 0.48, 0.5, 0.5, 0.5, 0.5]



------------- Acc box plots for model t0 ------------
[[0.835, 0.625, 0.695, 0.81, 0.85, 0.575, 0.86, 0.855, 0.845, 0.805], [0.8, 0.565, 0.63, 0.81, 0.85, 0.69, 0.86, 0.86, 0.845, 0.835]]
[0.835, 0.625, 0.695, 0.81, 0.85, 0.575, 0.86, 0.855, 0.845, 0.805]
[0.8, 0.565, 0.63, 0.81, 0.85, 0.69, 0.86, 0.86, 0.845, 0.835]


[[0.5, 0.495, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]]
[0.5, 0.495, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]





### no yes version (only in possible answers)

In [None]:
# evaluation settings for the v53 run
version = 53
seeds = ['42']
seed = seeds[0]
task = 'SA'
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary',
    'modal', 'common', 'rare_synonyms', 'identical_modal',
]

# Load the per-language pickles (en, de, fr -- in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v{version}.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [None]:
# SA: print the label / prediction counts for every language in turn.
task = 'SA'
for position, lang in enumerate(('en', 'de', 'fr')):
    if position:
        print()  # blank separator between languages, none after the last
    get_distribution(data, models, lang, task)

-------------Label distribution for model bloom for en------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for en------------
yes 1528
no 472
-------------Prediction distribution for model bloomz for en------------
yes 1077
no 923
-------------Prediction distribution for model flan for en------------
yes 1039
no 961
-------------Prediction distribution for model llama for en------------
yes 1881
no 119
-------------Prediction distribution for model t0 for en------------
yes 786
no 1214

-------------Label distribution for model bloom for de------------
yes 1000
no 1000
-------------Prediction distribution for model bloom for de------------
yes 887
no 1113
-------------Prediction distribution for model bloomz for de------------
yes 344
no 1656
-------------Prediction distribution for model flan for de------------
yes 176
no 1824
-------------Prediction distribution for model llama for de------------
yes 1965
no 35
-------------Prediction distribution for mo

In [None]:
# SA accuracy plots, one call per language, aggregated per prompt type.
task = 'SA'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [None]:
# NLI: print the label / prediction counts for every language in turn.
task = 'NLI'
for position, lang in enumerate(('en', 'de', 'fr')):
    if position:
        print()  # blank separator between languages, none after the last
    get_distribution(data, models, lang, task, nli=True)

-------------Label distribution for model bloom for en------------
yes 660
no 660
maybe 660
-------------Prediction distribution for model bloom for en------------
yes 825
no 1155
maybe 0
-------------Prediction distribution for model bloomz for en------------
yes 971
no 1009
maybe 0
-------------Prediction distribution for model flan for en------------
yes 944
no 1036
maybe 0
-------------Prediction distribution for model llama for en------------
yes 1890
no 12
maybe 78
-------------Prediction distribution for model t0 for en------------
yes 580
no 130
maybe 1270

-------------Label distribution for model bloom for de------------
yes 660
no 660
maybe 660
-------------Prediction distribution for model bloom for de------------
yes 62
no 18
maybe 1900
-------------Prediction distribution for model bloomz for de------------
yes 56
no 132
maybe 1792
-------------Prediction distribution for model flan for de------------
yes 666
no 1242
maybe 72
-------------Prediction distribution for model

In [None]:
# NLI accuracy plots, one call per language, aggregated per prompt type.
task = 'NLI'
for banner, lang_code in (
    ('------------- English Acc plots------------', 'en'),
    ('------------- German Acc plots------------', 'de'),
    ('------------- French Acc plots------------', 'fr'),
):
    print(banner)
    get_acc_plot(data, [lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [None]:

# English-only plot with nli=True; `task` is whatever the previous cell set.
plot_options = dict(fn=get_acc_per_prompt_type, nli=True)
get_acc_plot(data, ['en'], models, prompt_types, task, seed, version,
             **plot_options)

### ABC

In [11]:
# evaluation settings for the v85 run
version = 85
seeds = ['42']
seed = seeds[0]
task = 'SA'
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary',
    'modal', 'common', 'rare_synonyms', 'identical_modal',
]

# Load the per-language pickles (en, de, fr -- in that order) and merge them.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_42_lang_{lang_code}_v{version}.pickle')
    for lang_code in ('en', 'de', 'fr')
))


In [12]:
# List the keys found while walking into the merged dict.
collected_keys = print_dict_keys(data)
print(collected_keys)

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [13]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [14]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


### ABC answer probs normalized dim =0

In [None]:
# Evaluation settings for this section's run.
version = 63
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [None]:
# Print the keys that print_dict_keys collects from the merged `data` dict.
# NOTE(review): print_dict_keys (defined near the top of the file) appends to a
# default list argument, so re-running it in the same session likely returns
# the keys gathered by earlier calls as well — confirm.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [None]:
# SA accuracy tables per prompt variation: one get_acc_table call per language.
# Positional order: data, languages, models, prompt_types, task, seed, version.
task = 'SA'
table_args = (models, prompt_types, task, seed, version)

# English
get_acc_table(data, ['en'], *table_args, fn=get_acc_per_prompt_variation)

# German
get_acc_table(data, ['de'], *table_args, fn=get_acc_per_prompt_variation)

# French
get_acc_table(data, ['fr'], *table_args, fn=get_acc_per_prompt_variation)

plot_data {('English', 'active'): [[0.565, 0.585, 0.65, 0.58, 0.605, 0.555, 0.55, 0.68, 0.63, 0.645], [0.97, 0.96, 0.97, 0.97, 0.97, 0.96, 0.97, 0.96, 0.95, 0.955], [0.955, 0.96, 0.955, 0.97, 0.945, 0.955, 0.965, 0.97, 0.965, 0.96], [0.77, 0.78, 0.775, 0.72, 0.72, 0.75, 0.79, 0.7, 0.795, 0.915], [0.815, 0.79, 0.815, 0.755, 0.78, 0.805, 0.805, 0.8, 0.785, 0.815]], ('English', 'passive'): [[0.6, 0.59, 0.585, 0.605, 0.565, 0.67, 0.605, 0.645, 0.67, 0.655], [0.955, 0.96, 0.965, 0.955, 0.96, 0.955, 0.96, 0.96, 0.96, 0.955], [0.965, 0.965, 0.965, 0.965, 0.97, 0.975, 0.96, 0.945, 0.97, 0.975], [0.715, 0.64, 0.805, 0.65, 0.75, 0.78, 0.745, 0.645, 0.805, 0.855], [0.815, 0.8, 0.795, 0.82, 0.83, 0.825, 0.82, 0.8, 0.825, 0.815]], ('English', 'auxiliary'): [[0.555, 0.57, 0.615, 0.565, 0.57, 0.555, 0.65, 0.64, 0.595, 0.63], [0.97, 0.955, 0.96, 0.965, 0.96, 0.955, 0.955, 0.96, 0.95, 0.95], [0.97, 0.975, 0.955, 0.97, 0.97, 0.965, 0.96, 0.97, 0.97, 0.955], [0.78, 0.785, 0.765, 0.895, 0.75, 0.765, 0.73,

In [None]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.565, 0.585, 0.65, 0.58, 0.605, 0.555, 0.55, 0.68, 0.63, 0.645], [0.97, 0.96, 0.97, 0.97, 0.97, 0.96, 0.97, 0.96, 0.95, 0.955], [0.955, 0.96, 0.955, 0.97, 0.945, 0.955, 0.965, 0.97, 0.965, 0.96], [0.77, 0.78, 0.775, 0.72, 0.72, 0.75, 0.79, 0.7, 0.795, 0.915], [0.815, 0.79, 0.815, 0.755, 0.78, 0.805, 0.805, 0.8, 0.785, 0.815]], ('English', 'passive'): [[0.6, 0.59, 0.585, 0.605, 0.565, 0.67, 0.605, 0.645, 0.67, 0.655], [0.955, 0.96, 0.965, 0.955, 0.96, 0.955, 0.96, 0.96, 0.96, 0.955], [0.965, 0.965, 0.965, 0.965, 0.97, 0.975, 0.96, 0.945, 0.97, 0.975], [0.715, 0.64, 0.805, 0.65, 0.75, 0.78, 0.745, 0.645, 0.805, 0.855], [0.815, 0.8, 0.795, 0.82, 0.83, 0.825, 0.82, 0.8, 0.825, 0.815]], ('English', 'auxiliary'): [[0.555, 0.57, 0.615, 0.565, 0.57, 0.555, 0.65, 0.64, 0.595, 0.63], [0.97, 0.955, 0.96, 0.965, 0.96, 0.955, 0.955, 0.96, 0.95, 0.95], [0.97, 0.975, 0.955, 0.97, 0.97, 0.965, 0.96, 0.97, 0.97, 0.955], [0.78, 0.785

------------- German Acc plots------------
{('German', 'active'): [[0.515, 0.495, 0.53, 0.48, 0.505, 0.495, 0.53, 0.52, 0.545, 0.53], [0.63, 0.58, 0.66, 0.635, 0.64, 0.645, 0.655, 0.61, 0.585, 0.62], [0.83, 0.845, 0.805, 0.785, 0.745, 0.79, 0.795, 0.745, 0.85, 0.79], [0.74, 0.695, 0.68, 0.68, 0.79, 0.7, 0.65, 0.72, 0.825, 0.76], [0.675, 0.7, 0.66, 0.695, 0.675, 0.695, 0.585, 0.71, 0.675, 0.705]], ('German', 'passive'): [[0.525, 0.465, 0.485, 0.51, 0.505, 0.515, 0.51, 0.49, 0.5, 0.55], [0.625, 0.63, 0.595, 0.67, 0.64, 0.695, 0.645, 0.655, 0.62, 0.65], [0.84, 0.84, 0.79, 0.82, 0.815, 0.845, 0.82, 0.79, 0.835, 0.79], [0.755, 0.78, 0.665, 0.72, 0.8, 0.705, 0.69, 0.72, 0.82, 0.775], [0.655, 0.62, 0.655, 0.64, 0.725, 0.69, 0.675, 0.675, 0.69, 0.7]], ('German', 'auxiliary'): [[0.495, 0.495, 0.51, 0.495, 0.47, 0.545, 0.49, 0.52, 0.51, 0.5], [0.63, 0.565, 0.655, 0.635, 0.675, 0.64, 0.62, 0.61, 0.625, 0.615], [0.835, 0.815, 0.805, 0.775, 0.815, 0.81, 0.82, 0.785, 0.8, 0.87], [0.665, 0.67, 0.715,

------------- French Acc plots------------
{('French', 'active'): [[0.435, 0.43, 0.51, 0.465, 0.495, 0.49, 0.5, 0.46, 0.5, 0.525], [0.91, 0.94, 0.915, 0.93, 0.955, 0.93, 0.925, 0.925, 0.935, 0.945], [0.645, 0.625, 0.59, 0.575, 0.595, 0.545, 0.595, 0.58, 0.525, 0.595], [0.71, 0.765, 0.525, 0.655, 0.6, 0.775, 0.68, 0.655, 0.66, 0.74], [0.555, 0.6, 0.54, 0.53, 0.57, 0.58, 0.62, 0.66, 0.61, 0.585]], ('French', 'passive'): [[0.43, 0.43, 0.455, 0.485, 0.5, 0.485, 0.505, 0.45, 0.555, 0.48], [0.905, 0.915, 0.925, 0.905, 0.94, 0.955, 0.92, 0.925, 0.945, 0.925], [0.65, 0.5, 0.485, 0.625, 0.565, 0.51, 0.505, 0.615, 0.505, 0.63], [0.625, 0.61, 0.545, 0.73, 0.55, 0.73, 0.59, 0.61, 0.685, 0.745], [0.525, 0.53, 0.58, 0.55, 0.585, 0.595, 0.56, 0.58, 0.615, 0.585]], ('French', 'auxiliary'): [[0.475, 0.39, 0.515, 0.44, 0.465, 0.49, 0.44, 0.525, 0.42, 0.495], [0.905, 0.935, 0.89, 0.945, 0.925, 0.92, 0.935, 0.945, 0.94, 0.925], [0.665, 0.65, 0.59, 0.63, 0.565, 0.6, 0.585, 0.625, 0.64, 0.605], [0.675, 0.63

In [None]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.3333333333333333, 0.3434343434343434, 0.3888888888888889, 0.3383838383838384, 0.3484848484848485, 0.3838383838383838, 0.3686868686868687, 0.3434343434343434, 0.3333333333333333, 0.3484848484848485], [0.51010101010101, 0.4797979797979798, 0.4797979797979798, 0.5252525252525253, 0.5151515151515151, 0.5151515151515151, 0.5050505050505051, 0.47474747474747475, 0.5252525252525253, 0.5], [0.6565656565656566, 0.6363636363636364, 0.6666666666666666, 0.6363636363636364, 0.6363636363636364, 0.6616161616161617, 0.6565656565656566, 0.6515151515151515, 0.6666666666666666, 0.6414141414141414], [0.4494949494949495, 0.3686868686868687, 0.3787878787878788, 0.40404040404040403, 0.3838383838383838, 0.43434343434343436, 0.3888888888888889, 0.3888888888888889, 0.35858585858585856, 0.3838383838383838], [0.3282828282828283, 0.3333333333333333, 0.35353535353535354, 0.3888888888888889, 0.3484848484848485, 0.37373737373737376, 0.38383838383

------------- German Acc plots------------
{('German', 'active'): [[0.3434343434343434, 0.29797979797979796, 0.3434343434343434, 0.3686868686868687, 0.3181818181818182, 0.3383838383838384, 0.3181818181818182, 0.35353535353535354, 0.3838383838383838, 0.3838383838383838], [0.3787878787878788, 0.3888888888888889, 0.3787878787878788, 0.41414141414141414, 0.4444444444444444, 0.41919191919191917, 0.4393939393939394, 0.40404040404040403, 0.3888888888888889, 0.3787878787878788], [0.4898989898989899, 0.4494949494949495, 0.4696969696969697, 0.5050505050505051, 0.4696969696969697, 0.48484848484848486, 0.4292929292929293, 0.48484848484848486, 0.4696969696969697, 0.5], [0.3484848484848485, 0.3181818181818182, 0.36363636363636365, 0.3181818181818182, 0.30808080808080807, 0.3333333333333333, 0.3282828282828283, 0.2878787878787879, 0.2676767676767677, 0.32323232323232326], [0.35858585858585856, 0.32323232323232326, 0.3333333333333333, 0.32323232323232326, 0.35858585858585856, 0.29292929292929293, 0.30

------------- French Acc plots------------
{('French', 'active'): [[0.3383838383838384, 0.3282828282828283, 0.3383838383838384, 0.32323232323232326, 0.3383838383838384, 0.36363636363636365, 0.3383838383838384, 0.3181818181818182, 0.3888888888888889, 0.3484848484848485], [0.5, 0.4898989898989899, 0.47474747474747475, 0.5, 0.4494949494949495, 0.4898989898989899, 0.48484848484848486, 0.5050505050505051, 0.494949494949495, 0.4494949494949495], [0.4292929292929293, 0.398989898989899, 0.40404040404040403, 0.45454545454545453, 0.43434343434343436, 0.4393939393939394, 0.45454545454545453, 0.40404040404040403, 0.398989898989899, 0.3787878787878788], [0.3787878787878788, 0.398989898989899, 0.35858585858585856, 0.36363636363636365, 0.36363636363636365, 0.43434343434343436, 0.3484848484848485, 0.398989898989899, 0.3787878787878788, 0.36363636363636365], [0.3888888888888889, 0.3484848484848485, 0.35353535353535354, 0.3939393939393939, 0.41919191919191917, 0.41414141414141414, 0.3939393939393939, 0.

### ABC answer probs normalized dim =1

In [None]:
# Evaluation settings for this section's run.
version = 64
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [None]:
# Print the keys that print_dict_keys collects from the merged `data` dict.
# NOTE(review): print_dict_keys (defined near the top of the file) appends to a
# default list argument, so re-running it in the same session likely returns
# the keys gathered by earlier calls as well — confirm.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [None]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.52, 0.555, 0.665, 0.545, 0.635, 0.545, 0.57, 0.525, 0.52, 0.52], [0.96, 0.96, 0.96, 0.955, 0.96, 0.955, 0.96, 0.965, 0.965, 0.95], [0.96, 0.97, 0.955, 0.975, 0.955, 0.965, 0.965, 0.95, 0.955, 0.96], [0.765, 0.78, 0.56, 0.745, 0.53, 0.725, 0.71, 0.505, 0.545, 0.765], [0.755, 0.77, 0.825, 0.77, 0.83, 0.745, 0.74, 0.7, 0.845, 0.77]], ('English', 'passive'): [[0.515, 0.53, 0.555, 0.515, 0.565, 0.495, 0.52, 0.52, 0.505, 0.5], [0.955, 0.955, 0.955, 0.945, 0.965, 0.945, 0.96, 0.955, 0.965, 0.945], [0.965, 0.965, 0.97, 0.965, 0.97, 0.965, 0.965, 0.95, 0.96, 0.97], [0.695, 0.71, 0.785, 0.655, 0.775, 0.8, 0.595, 0.55, 0.755, 0.735], [0.705, 0.775, 0.765, 0.805, 0.815, 0.835, 0.78, 0.65, 0.835, 0.825]], ('English', 'auxiliary'): [[0.52, 0.52, 0.665, 0.545, 0.545, 0.57, 0.525, 0.52, 0.58, 0.5], [0.96, 0.96, 0.96, 0.965, 0.955, 0.96, 0.965, 0.95, 0.96, 0.955], [0.965, 0.965, 0.95, 0.96, 0.965, 0.965, 0.935, 0.96, 0.965, 0.965],

------------- German Acc plots------------
{('German', 'active'): [[0.505, 0.52, 0.5, 0.505, 0.495, 0.5, 0.5, 0.505, 0.52, 0.515], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.545, 0.67, 0.615, 0.55, 0.63, 0.545, 0.545, 0.56, 0.695, 0.7], [0.5, 0.525, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.54, 0.5], [0.5, 0.5, 0.5, 0.5, 0.495, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'passive'): [[0.5, 0.5, 0.5, 0.505, 0.51, 0.5, 0.5, 0.505, 0.505, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.505, 0.5], [0.64, 0.665, 0.55, 0.62, 0.655, 0.54, 0.52, 0.57, 0.585, 0.65], [0.52, 0.525, 0.5, 0.505, 0.515, 0.5, 0.51, 0.5, 0.535, 0.52], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'auxiliary'): [[0.51, 0.5, 0.5, 0.5, 0.5, 0.5, 0.505, 0.515, 0.495, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.66, 0.735, 0.6, 0.575, 0.555, 0.55, 0.635, 0.68, 0.72, 0.615], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.545], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'm

------------- French Acc plots------------
{('French', 'active'): [[0.465, 0.44, 0.515, 0.495, 0.54, 0.485, 0.485, 0.485, 0.525, 0.52], [0.89, 0.925, 0.865, 0.905, 0.955, 0.91, 0.895, 0.915, 0.94, 0.93], [0.505, 0.53, 0.505, 0.52, 0.53, 0.51, 0.5, 0.5, 0.5, 0.56], [0.69, 0.74, 0.525, 0.625, 0.605, 0.775, 0.69, 0.62, 0.69, 0.61], [0.495, 0.525, 0.525, 0.51, 0.5, 0.505, 0.54, 0.525, 0.515, 0.495]], ('French', 'passive'): [[0.415, 0.45, 0.445, 0.485, 0.49, 0.485, 0.54, 0.45, 0.515, 0.465], [0.88, 0.9, 0.9, 0.855, 0.93, 0.95, 0.895, 0.89, 0.93, 0.905], [0.53, 0.495, 0.495, 0.55, 0.51, 0.505, 0.5, 0.495, 0.5, 0.53], [0.585, 0.615, 0.54, 0.665, 0.605, 0.765, 0.63, 0.61, 0.7, 0.595], [0.475, 0.55, 0.535, 0.525, 0.5, 0.5, 0.57, 0.51, 0.54, 0.545]], ('French', 'auxiliary'): [[0.465, 0.4, 0.515, 0.45, 0.485, 0.485, 0.445, 0.52, 0.44, 0.475], [0.89, 0.92, 0.865, 0.94, 0.91, 0.895, 0.905, 0.93, 0.94, 0.92], [0.5, 0.5, 0.525, 0.53, 0.515, 0.505, 0.5, 0.54, 0.525, 0.54], [0.68, 0.645, 0.525, 0.755, 

In [None]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.3434343434343434, 0.31313131313131315, 0.32323232323232326, 0.3181818181818182, 0.30808080808080807, 0.35858585858585856, 0.3181818181818182, 0.35858585858585856, 0.3282828282828283, 0.3181818181818182], [0.494949494949495, 0.4595959595959596, 0.4797979797979798, 0.5404040404040404, 0.4696969696969697, 0.4898989898989899, 0.5151515151515151, 0.4898989898989899, 0.51010101010101, 0.48484848484848486], [0.5959595959595959, 0.601010101010101, 0.6060606060606061, 0.5959595959595959, 0.601010101010101, 0.6060606060606061, 0.601010101010101, 0.5959595959595959, 0.601010101010101, 0.5959595959595959], [0.3838383838383838, 0.3434343434343434, 0.3686868686868687, 0.37373737373737376, 0.36363636363636365, 0.3787878787878788, 0.37373737373737376, 0.3686868686868687, 0.3686868686868687, 0.3939393939393939], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 

------------- German Acc plots------------
{('German', 'active'): [[0.3383838383838384, 0.3383838383838384, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3383838383838384, 0.32323232323232326, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333], [0.3686868686868687, 0.36363636363636365, 0.37373737373737376, 0.35858585858585856, 0.36363636363636365, 0.36363636363636365, 0.3383838383838384, 0.3787878787878788, 0.36363636363636365, 0.37373737373737376], [0.41919191919191917, 0.43434343434343436, 0.41414141414141414, 0.4444444444444444, 0.4595959595959596, 0.41414141414141414, 0.40404040404040403, 0.37373737373737376, 0.41414141414141414, 0.4494949494949495], [0.32323232323232326, 0.3282828282828283, 0.3282828282828283, 0.3333333333333333, 0.3282828282828283, 0.32323232323232326, 0.3383838383838384, 0.32323232323232326, 0.3333333333333333, 0.3333333333333333], [0.3787878787878788, 0.3434343434343434, 0.3434343434343434, 0.3434343434343434, 0.35858585858585856, 0.343

------------- French Acc plots------------
{('French', 'active'): [[0.3333333333333333, 0.3282828282828283, 0.3333333333333333, 0.3282828282828283, 0.3383838383838384, 0.35353535353535354, 0.3333333333333333, 0.3333333333333333, 0.3383838383838384, 0.3383838383838384], [0.3484848484848485, 0.3787878787878788, 0.3434343434343434, 0.3939393939393939, 0.3333333333333333, 0.35858585858585856, 0.35353535353535354, 0.3888888888888889, 0.4090909090909091, 0.3383838383838384], [0.3333333333333333, 0.3333333333333333, 0.3383838383838384, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333], [0.37373737373737376, 0.3939393939393939, 0.398989898989899, 0.3434343434343434, 0.35353535353535354, 0.3888888888888889, 0.3181818181818182, 0.398989898989899, 0.36363636363636365, 0.36363636363636365], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,

### logit scaling Yes no

In [11]:
# Evaluation settings for this section's run.
version = 93
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [12]:
# Print the keys that print_dict_keys collects from the merged `data` dict.
# NOTE(review): print_dict_keys (defined near the top of the file) appends to a
# default list argument, so re-running it in the same session likely returns
# the keys gathered by earlier calls as well — confirm.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [13]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [14]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


### logit scaling ABC

In [15]:
# Evaluation settings for this section's run.
version = 90
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [16]:
# Print the keys that print_dict_keys collects from the merged `data` dict.
# NOTE(review): print_dict_keys (defined near the top of the file) appends to a
# default list argument, so re-running it in the same session likely returns
# the keys gathered by earlier calls as well — confirm.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [17]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [18]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


### logit scaling without

In [19]:
# Evaluation settings for this section's run.
version = 94
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [20]:
# Print the keys that print_dict_keys collects from the merged `data` dict.
# NOTE(review): print_dict_keys (defined near the top of the file) appends to a
# default list argument, so re-running it in the same session likely returns
# the keys gathered by earlier calls as well — confirm.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [21]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [22]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


### logit scaling No yes

In [23]:
# Evaluation settings for this section's run.
version = 91
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [24]:
# Print the keys that print_dict_keys collects from the merged `data` dict.
# NOTE(review): print_dict_keys (defined near the top of the file) appends to a
# default list argument, so re-running it in the same session likely returns
# the keys gathered by earlier calls as well — confirm.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [25]:
# SA accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'SA'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [26]:
# NLI accuracy per prompt variation: one get_acc_plot call per language, each
# preceded by a banner so the outputs can be told apart.
task = 'NLI'
plot_args = (models, prompt_types, task, seed, version)

print('------------- English Acc plots------------')
get_acc_plot(data, ['en'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- German Acc plots------------')
get_acc_plot(data, ['de'], *plot_args, fn=get_acc_per_prompt_variation)

print('------------- French Acc plots------------')
get_acc_plot(data, ['fr'], *plot_args, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


### Ano norm NO yes

In [15]:
# Evaluation settings for this section's run.
# NOTE(review): version 85 is also the version loaded by the earlier "ABC"
# section of this notebook — confirm it is the intended run for this section.
version = 85
task = 'SA'
seeds = ['42']
seed = seeds[0]
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]

# Load the English, German and French logits pickles for this version and
# merge them into one dict (the file names hard-code seed 42).
data = merge_dicts(
    open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),
    open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'),
)


In [28]:
# Print the keys of the loaded results dict. print_dict_keys (defined at the
# top of the notebook) descends only into the first dict-valued entry at each
# level, so the list shows every key along that first branch.
# NOTE(review): print_dict_keys collects into a mutable default list, so
# re-running this cell in the same kernel session appends to the keys gathered
# by earlier calls instead of starting from an empty list.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [16]:
task = 'SA'

# One banner and one get_acc_plot call per language, in en / de / fr order.
# get_acc_plot and get_acc_per_prompt_variation are defined elsewhere in the
# notebook.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [17]:
task = 'NLI'

# Switch the task to NLI and repeat the per-language banner + plot sequence.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


### sample size 2000

In [None]:
# Evaluation settings read by the accuracy-plot cells of this section:
# model names, prompt categories, seed, task and the pickle version to load.
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active',
    'passive',
    'auxiliary',
    'modal',
    'common',
    'rare_synonyms',
    'identical_modal',
]
seeds = ['42']
task = 'SA'
seed = seeds[0]
# lang='de'

version = 60
# Only the English pickle is loaded for this version; the three-language merge
# is kept commented out below.
# data = merge_dicts(open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle'),  open_data_pickle(f'logits_dict_seed_42_lang_de_v{version}.pickle'),  open_data_pickle(f'logits_dict_seed_42_lang_fr_v{version}.pickle'))
data = open_data_pickle(f'logits_dict_seed_42_lang_en_v{version}.pickle')

In [None]:
# Print the keys of the loaded results dict. print_dict_keys (defined at the
# top of the notebook) descends only into the first dict-valued entry at each
# level, so the list shows every key along that first branch.
# NOTE(review): print_dict_keys collects into a mutable default list, so
# re-running this cell in the same kernel session appends to the keys gathered
# by earlier calls instead of starting from an empty list.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 

In [None]:
# task='SA'
# lang='en'
# prompt_type='active'
# get_distribution(data, models, lang, task, prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, prompt_type)

In [None]:
task = 'SA'

# Only English is plotted here; the German and French entries stay disabled
# (the data cell of this section loads just the English pickle).
for _lang_code, _lang_label in (
    ('en', 'English'),
    # ('de', 'German'),
    # ('fr', 'French'),
):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.509, 0.5295, 0.648, 0.5155, 0.601, 0.5165, 0.5325, 0.514, 0.513, 0.5185], [0.9605, 0.9595, 0.9715, 0.959, 0.971, 0.956, 0.9605, 0.963, 0.9605, 0.9555], [0.966, 0.9635, 0.9635, 0.9665, 0.959, 0.9665, 0.966, 0.958, 0.9595, 0.967], [0.7645, 0.756, 0.563, 0.7295, 0.5285, 0.708, 0.7055, 0.5025, 0.5625, 0.7605], [0.791, 0.777, 0.8625, 0.7925, 0.882, 0.795, 0.79, 0.755, 0.85, 0.8015]], ('English', 'passive'): [[0.5085, 0.5095, 0.524, 0.503, 0.5435, 0.5045, 0.5095, 0.5115, 0.5055, 0.5055], [0.9525, 0.953, 0.9525, 0.95, 0.963, 0.954, 0.9555, 0.955, 0.965, 0.9515], [0.9635, 0.965, 0.968, 0.97, 0.968, 0.9635, 0.964, 0.9615, 0.961, 0.964], [0.6665, 0.6775, 0.784, 0.643, 0.734, 0.8095, 0.605, 0.5515, 0.7085, 0.7355], [0.765, 0.8085, 0.8185, 0.822, 0.865, 0.8565, 0.8065, 0.717, 0.864, 0.869]], ('English', 'auxiliary'): [[0.509, 0.511, 0.648, 0.5695, 0.5165, 0.5325, 0.5125, 0.5185, 0.5675, 0.508], [0.9605, 0.967, 0.9715, 0.9705, 

In [None]:
task = 'SA'

# Same English-only run as the per-variation cell, but passing
# get_acc_per_prompt_type as the accuracy function. German and French remain
# disabled.
for _lang_code, _lang_label in (
    ('en', 'English'),
    # ('de', 'German'),
    # ('fr', 'French'),
):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- English Acc plots------------
{('English', 'active'): [0.53975, 0.9617, 0.96355, 0.65805, 0.80965], ('English', 'passive'): [0.5125, 0.9552, 0.96485, 0.6915, 0.8192], ('English', 'auxiliary'): [0.5393, 0.96235, 0.9619, 0.71, 0.8177], ('English', 'modal'): [0.55175, 0.96295, 0.96115, 0.6402, 0.82975], ('English', 'common'): [0.5496428571428571, 0.9625714285714285, 0.9605, 0.6647142857142857, 0.8132857142857143], ('English', 'rare_synonyms'): [0.5827142857142857, 0.9655714285714285, 0.964, 0.6348571428571429, 0.8175714285714286], ('English', 'identical_modal'): [0.54255, 0.95965, 0.96255, 0.64175, 0.8175]}


In [None]:
# task='NLI'
# lang='en'
# prompt_type='active'

# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)

In [None]:
task = 'NLI'

# NLI accuracy plots for English only; the German and French entries stay
# disabled.
for _lang_code, _lang_label in (
    ('en', 'English'),
    # ('de', 'German'),
    # ('fr', 'French'),
):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.3533533533533533, 0.3353353353353353, 0.34384384384384387, 0.34384384384384387, 0.3383383383383383, 0.3613613613613614, 0.34134134134134136, 0.35135135135135137, 0.34684684684684686, 0.33683683683683685], [0.46296296296296297, 0.47047047047047047, 0.45495495495495497, 0.4994994994994995, 0.47347347347347346, 0.46846846846846846, 0.508008008008008, 0.474974974974975, 0.5005005005005005, 0.47097097097097096], [0.6196196196196196, 0.6161161161161162, 0.6176176176176176, 0.6121121121121121, 0.6186186186186187, 0.6156156156156156, 0.6176176176176176, 0.6131131131131131, 0.6146146146146146, 0.6166166166166166], [0.3918918918918919, 0.3533533533533533, 0.35785785785785784, 0.3768768768768769, 0.38538538538538536, 0.42292292292292294, 0.3893893893893894, 0.3923923923923924, 0.36536536536536535, 0.3858858858858859], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.333333

### without possible answers

In [None]:
# Evaluation settings read by the accuracy-plot cells of this section:
# model names, prompt categories, seed, task and the pickle version to load.
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active',
    'passive',
    'auxiliary',
    'modal',
    'common',
    'rare_synonyms',
    'identical_modal',
]
seeds = ['42']
task = 'SA'
seed = seeds[0]
# lang='de'

version = 61
# Load the English, German and French pickles (in that order) and merge them
# into a single nested dict with merge_dicts.
data = merge_dicts(*[
    open_data_pickle(f'logits_dict_seed_42_lang_{_lang_code}_v{version}.pickle')
    for _lang_code in ('en', 'de', 'fr')
])


In [None]:
# Print the keys of the loaded results dict. print_dict_keys (defined at the
# top of the notebook) descends only into the first dict-valued entry at each
# level, so the list shows every key along that first branch.
# NOTE(review): print_dict_keys collects into a mutable default list, so
# re-running this cell in the same kernel session appends to the keys gathered
# by earlier calls instead of starting from an empty list.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [None]:
# task='SA'
# lang='en'
# prompt_type='active'
# get_distribution(data, models, lang, task, prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, prompt_type)

In [None]:
task = 'SA'

# Run get_acc_plot (defined elsewhere in this notebook) once per language,
# printing a banner before each call so the outputs can be told apart.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.59, 0.625, 0.585, 0.615, 0.545, 0.63, 0.58, 0.585, 0.555, 0.57], [0.945, 0.935, 0.965, 0.945, 0.955, 0.955, 0.945, 0.945, 0.915, 0.965], [0.97, 0.97, 0.955, 0.97, 0.96, 0.97, 0.97, 0.965, 0.965, 0.965], [0.71, 0.585, 0.5, 0.54, 0.76, 0.51, 0.52, 0.65, 0.575, 0.51], [0.91, 0.91, 0.9, 0.89, 0.91, 0.895, 0.9, 0.905, 0.905, 0.885]], ('English', 'passive'): [[0.595, 0.605, 0.6, 0.545, 0.595, 0.515, 0.565, 0.605, 0.545, 0.51], [0.925, 0.92, 0.94, 0.91, 0.945, 0.96, 0.94, 0.905, 0.905, 0.965], [0.97, 0.975, 0.975, 0.965, 0.97, 0.97, 0.97, 0.965, 0.97, 0.97], [0.755, 0.585, 0.595, 0.55, 0.57, 0.5, 0.545, 0.83, 0.525, 0.5], [0.9, 0.9, 0.92, 0.91, 0.905, 0.885, 0.905, 0.915, 0.89, 0.865]], ('English', 'auxiliary'): [[0.59, 0.565, 0.585, 0.52, 0.63, 0.58, 0.615, 0.57, 0.655, 0.515], [0.945, 0.93, 0.965, 0.965, 0.955, 0.945, 0.93, 0.965, 0.94, 0.965], [0.97, 0.965, 0.955, 0.975, 0.965, 0.97, 0.96, 0.965, 0.97, 0.96], [0.705, 0

------------- German Acc plots------------
{('German', 'active'): [[0.56, 0.54, 0.495, 0.535, 0.5, 0.515, 0.52, 0.515, 0.545, 0.515], [0.605, 0.585, 0.53, 0.535, 0.51, 0.715, 0.505, 0.625, 0.545, 0.61], [0.81, 0.77, 0.725, 0.62, 0.575, 0.605, 0.775, 0.83, 0.715, 0.585], [0.565, 0.715, 0.505, 0.525, 0.77, 0.555, 0.525, 0.5, 0.785, 0.505], [0.5, 0.5, 0.5, 0.505, 0.505, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'passive'): [[0.525, 0.53, 0.5, 0.535, 0.515, 0.51, 0.515, 0.515, 0.525, 0.505], [0.66, 0.655, 0.63, 0.595, 0.535, 0.64, 0.51, 0.66, 0.55, 0.695], [0.82, 0.82, 0.77, 0.765, 0.715, 0.785, 0.785, 0.75, 0.705, 0.75], [0.625, 0.725, 0.5, 0.575, 0.605, 0.53, 0.735, 0.505, 0.58, 0.515], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], ('German', 'auxiliary'): [[0.52, 0.5, 0.495, 0.52, 0.515, 0.52, 0.52, 0.515, 0.51, 0.5], [0.7, 0.665, 0.53, 0.55, 0.715, 0.505, 0.61, 0.61, 0.545, 0.615], [0.815, 0.82, 0.715, 0.63, 0.565, 0.79, 0.89, 0.605, 0.565, 0.805], [0.575, 0.52, 0.505, 0.78, 0.555, 

------------- French Acc plots------------
{('French', 'active'): [[0.43, 0.45, 0.555, 0.46, 0.55, 0.505, 0.49, 0.525, 0.515, 0.535], [0.985, 0.985, 0.965, 0.98, 0.985, 0.985, 0.985, 0.985, 0.975, 0.975], [0.52, 0.505, 0.5, 0.5, 0.515, 0.52, 0.5, 0.5, 0.5, 0.515], [0.52, 0.525, 0.505, 0.505, 0.51, 0.52, 0.51, 0.52, 0.59, 0.53], [0.575, 0.54, 0.715, 0.53, 0.76, 0.565, 0.53, 0.535, 0.59, 0.555]], ('French', 'passive'): [[0.385, 0.445, 0.445, 0.58, 0.545, 0.51, 0.535, 0.485, 0.52, 0.48], [0.975, 0.98, 0.98, 0.985, 0.985, 0.985, 0.98, 0.975, 0.97, 0.975], [0.57, 0.51, 0.5, 0.515, 0.505, 0.515, 0.5, 0.51, 0.5, 0.525], [0.505, 0.5, 0.505, 0.52, 0.505, 0.545, 0.505, 0.5, 0.55, 0.5], [0.555, 0.535, 0.58, 0.58, 0.75, 0.535, 0.525, 0.565, 0.56, 0.57]], ('French', 'auxiliary'): [[0.43, 0.475, 0.555, 0.485, 0.505, 0.49, 0.52, 0.535, 0.515, 0.52], [0.985, 0.98, 0.965, 0.985, 0.985, 0.985, 0.975, 0.975, 0.98, 0.975], [0.51, 0.5, 0.505, 0.535, 0.515, 0.5, 0.5, 0.51, 0.53, 0.525], [0.52, 0.495, 0.505,

In [None]:
task = 'SA'

# Per-language banner + get_acc_plot call, this time passing
# get_acc_per_prompt_type as the accuracy function.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- English Acc plots------------
{('English', 'active'): [0.588, 0.947, 0.966, 0.586, 0.901], ('English', 'passive'): [0.568, 0.9315, 0.97, 0.5955, 0.8995], ('English', 'auxiliary'): [0.5825, 0.9505, 0.9655, 0.613, 0.904], ('English', 'modal'): [0.6075, 0.9395, 0.9675, 0.6105, 0.893], ('English', 'common'): [0.5878571428571429, 0.9528571428571428, 0.9664285714285714, 0.6421428571428571, 0.9028571428571428], ('English', 'rare_synonyms'): [0.6078571428571429, 0.9435714285714286, 0.9692857142857143, 0.5635714285714286, 0.9057142857142857], ('English', 'identical_modal'): [0.6335, 0.928, 0.969, 0.705, 0.9065]}


------------- German Acc plots------------
{('German', 'active'): [0.524, 0.5765, 0.701, 0.595, 0.501], ('German', 'passive'): [0.5175, 0.613, 0.7665, 0.5895, 0.5], ('German', 'auxiliary'): [0.5115, 0.6045, 0.72, 0.5515, 0.4995], ('German', 'modal'): [0.519, 0.572, 0.708, 0.5455, 0.5], ('German', 'common'): [0.5185714285714286, 0.5878571428571429, 0.7421428571428571, 0.5357142857142857, 0.49857142857142855], ('German', 'rare_synonyms'): [0.5235714285714286, 0.5842857142857143, 0.7378571428571429, 0.5214285714285715, 0.4928571428571429], ('German', 'identical_modal'): [0.5441666666666667, 0.6325, 0.8083333333333333, 0.7008333333333333, 0.49916666666666665]}


------------- French Acc plots------------
{('French', 'active'): [0.5015, 0.9805, 0.5075, 0.5235, 0.5895], ('French', 'passive'): [0.493, 0.979, 0.515, 0.5135, 0.5755], ('French', 'auxiliary'): [0.503, 0.979, 0.513, 0.5115, 0.5965], ('French', 'modal'): [0.4855, 0.9795, 0.501, 0.5145, 0.6005], ('French', 'common'): [0.49357142857142855, 0.975, 0.5028571428571429, 0.5171428571428571, 0.5992857142857143], ('French', 'rare_synonyms'): [0.5028571428571429, 0.97, 0.5092857142857142, 0.5135714285714286, 0.6057142857142858], ('French', 'identical_modal'): [0.44333333333333336, 0.9825, 0.5, 0.5216666666666666, 0.6483333333333333]}


In [None]:
# task='NLI'
# lang='en'
# prompt_type='active'

# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)

In [None]:
task = 'NLI'

# Switch the task to NLI and repeat the per-language banner + plot sequence.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.3434343434343434, 0.3383838383838384, 0.36363636363636365, 0.3434343434343434, 0.37373737373737376, 0.3434343434343434, 0.36363636363636365, 0.3686868686868687, 0.398989898989899, 0.3434343434343434], [0.4898989898989899, 0.4595959595959596, 0.4595959595959596, 0.494949494949495, 0.4595959595959596, 0.46464646464646464, 0.5202020202020202, 0.48484848484848486, 0.4898989898989899, 0.4595959595959596], [0.6060606060606061, 0.6060606060606061, 0.6111111111111112, 0.6060606060606061, 0.6060606060606061, 0.6111111111111112, 0.601010101010101, 0.5909090909090909, 0.5909090909090909, 0.601010101010101], [0.4393939393939394, 0.40404040404040403, 0.42424242424242425, 0.46464646464646464, 0.3939393939393939, 0.41414141414141414, 0.4595959595959596, 0.4797979797979798, 0.45454545454545453, 0.4797979797979798], [0.3333333333333333, 0.3686868686868687, 0.36363636363636365, 0.3383838383838384, 0.40404040404040403, 0.353535353535

------------- German Acc plots------------
{('German', 'active'): [[0.3434343434343434, 0.3282828282828283, 0.3333333333333333, 0.3333333333333333, 0.32323232323232326, 0.30808080808080807, 0.3383838383838384, 0.3282828282828283, 0.3282828282828283, 0.3181818181818182], [0.36363636363636365, 0.41414141414141414, 0.40404040404040403, 0.3787878787878788, 0.3686868686868687, 0.398989898989899, 0.41414141414141414, 0.3939393939393939, 0.41919191919191917, 0.37373737373737376], [0.3838383838383838, 0.36363636363636365, 0.35353535353535354, 0.4797979797979798, 0.32323232323232326, 0.41414141414141414, 0.3838383838383838, 0.35353535353535354, 0.3686868686868687, 0.40404040404040403], [0.43434343434343436, 0.42424242424242425, 0.45454545454545453, 0.41414141414141414, 0.37373737373737376, 0.41919191919191917, 0.3939393939393939, 0.42424242424242425, 0.3939393939393939, 0.398989898989899], [0.35353535353535354, 0.35353535353535354, 0.3484848484848485, 0.3434343434343434, 0.3383838383838384, 0.3

------------- French Acc plots------------
{('French', 'active'): [[0.3434343434343434, 0.3282828282828283, 0.3333333333333333, 0.3434343434343434, 0.35353535353535354, 0.3484848484848485, 0.3383838383838384, 0.36363636363636365, 0.35858585858585856, 0.36363636363636365], [0.47474747474747475, 0.41414141414141414, 0.4494949494949495, 0.46464646464646464, 0.5202020202020202, 0.4898989898989899, 0.48484848484848486, 0.4444444444444444, 0.4797979797979798, 0.47474747474747475], [0.30808080808080807, 0.29292929292929293, 0.3333333333333333, 0.2727272727272727, 0.3383838383838384, 0.30303030303030304, 0.3282828282828283, 0.29797979797979796, 0.29797979797979796, 0.31313131313131315], [0.5, 0.40404040404040403, 0.3838383838383838, 0.398989898989899, 0.41919191919191917, 0.47474747474747475, 0.4444444444444444, 0.4090909090909091, 0.40404040404040403, 0.4090909090909091], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0

### one-shot

In [None]:
# Evaluation settings read by the accuracy-plot cells of this section:
# model names, prompt categories, seed, task and the pickle version to load.
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active',
    'passive',
    'auxiliary',
    'modal',
    'common',
    'rare_synonyms',
    'identical_modal',
]
seeds = ['42']
task = 'SA'
seed = seeds[0]
# lang='de'

version = 59
# Load the English, German and French pickles (in that order) and merge them
# into a single nested dict with merge_dicts.
data = merge_dicts(*[
    open_data_pickle(f'logits_dict_seed_42_lang_{_lang_code}_v{version}.pickle')
    for _lang_code in ('en', 'de', 'fr')
])


In [None]:
# Print the keys of the loaded results dict. print_dict_keys (defined at the
# top of the notebook) descends only into the first dict-valued entry at each
# level, so the list shows every key along that first branch.
# NOTE(review): print_dict_keys collects into a mutable default list, so
# re-running this cell in the same kernel session appends to the keys gathered
# by earlier calls instead of starting from an empty list.
print(print_dict_keys(data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 

In [None]:
# task='SA'
# lang='en'
# prompt_type='active'
# get_distribution(data, models, lang, task, prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, prompt_type)

In [None]:
task = 'SA'

# One banner and one get_acc_plot call per language, in en / de / fr order.
# get_acc_plot and get_acc_per_prompt_variation are defined elsewhere in the
# notebook.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.63, 0.62, 0.61, 0.645, 0.645, 0.665, 0.605, 0.64, 0.625, 0.64], [0.945, 0.94, 0.975, 0.955, 0.965, 0.97, 0.96, 0.95, 0.925, 0.965], [0.975, 0.97, 0.955, 0.975, 0.98, 0.975, 0.97, 0.975, 0.97, 0.965], [0.665, 0.615, 0.5, 0.535, 0.74, 0.5, 0.51, 0.655, 0.605, 0.505], [0.9, 0.92, 0.91, 0.915, 0.915, 0.91, 0.91, 0.915, 0.905, 0.895]], ('English', 'passive'): [[0.605, 0.63, 0.675, 0.58, 0.64, 0.55, 0.64, 0.66, 0.57, 0.545], [0.935, 0.935, 0.95, 0.925, 0.95, 0.97, 0.95, 0.91, 0.91, 0.965], [0.965, 0.97, 0.975, 0.965, 0.98, 0.965, 0.965, 0.97, 0.975, 0.98], [0.71, 0.605, 0.57, 0.53, 0.595, 0.5, 0.535, 0.85, 0.545, 0.5], [0.92, 0.905, 0.915, 0.9, 0.915, 0.92, 0.91, 0.91, 0.905, 0.9]], ('English', 'auxiliary'): [[0.63, 0.665, 0.61, 0.525, 0.665, 0.605, 0.675, 0.64, 0.665, 0.55], [0.945, 0.94, 0.975, 0.965, 0.97, 0.96, 0.94, 0.965, 0.96, 0.965], [0.975, 0.965, 0.96, 0.97, 0.975, 0.97, 0.975, 0.965, 0.97, 0.975], [0.665, 0.72

------------- German Acc plots------------
{('German', 'active'): [[0.54, 0.515, 0.49, 0.5, 0.51, 0.505, 0.5, 0.515, 0.515, 0.51], [0.61, 0.58, 0.54, 0.535, 0.515, 0.69, 0.515, 0.625, 0.55, 0.595], [0.83, 0.765, 0.69, 0.62, 0.57, 0.585, 0.76, 0.825, 0.675, 0.655], [0.59, 0.7, 0.5, 0.53, 0.735, 0.555, 0.52, 0.5, 0.765, 0.5], [0.5, 0.5, 0.5, 0.5, 0.46, 0.5, 0.5, 0.5, 0.5, 0.495]], ('German', 'passive'): [[0.51, 0.51, 0.495, 0.485, 0.5, 0.505, 0.51, 0.505, 0.52, 0.5], [0.63, 0.665, 0.61, 0.6, 0.55, 0.68, 0.525, 0.665, 0.56, 0.665], [0.8, 0.81, 0.71, 0.75, 0.705, 0.72, 0.83, 0.8, 0.715, 0.74], [0.65, 0.695, 0.5, 0.56, 0.6, 0.55, 0.69, 0.51, 0.56, 0.5], [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.495, 0.5, 0.5, 0.5]], ('German', 'auxiliary'): [[0.51, 0.49, 0.49, 0.5, 0.505, 0.5, 0.52, 0.51, 0.5, 0.5], [0.655, 0.71, 0.54, 0.535, 0.69, 0.515, 0.59, 0.595, 0.545, 0.635], [0.795, 0.78, 0.69, 0.64, 0.575, 0.76, 0.845, 0.625, 0.575, 0.805], [0.605, 0.51, 0.5, 0.76, 0.555, 0.52, 0.5, 0.5, 0.535, 0.5], [0.5, 

------------- French Acc plots------------
{('French', 'active'): [[0.45, 0.45, 0.535, 0.455, 0.545, 0.455, 0.515, 0.505, 0.51, 0.515], [0.975, 0.965, 0.94, 0.96, 0.94, 0.96, 0.96, 0.975, 0.92, 0.935], [0.51, 0.52, 0.5, 0.5, 0.505, 0.505, 0.5, 0.5, 0.5, 0.505], [0.515, 0.51, 0.505, 0.5, 0.5, 0.505, 0.5, 0.51, 0.59, 0.515], [0.535, 0.53, 0.65, 0.5, 0.72, 0.505, 0.525, 0.52, 0.55, 0.475]], ('French', 'passive'): [[0.415, 0.46, 0.49, 0.535, 0.57, 0.5, 0.515, 0.515, 0.51, 0.465], [0.975, 0.97, 0.98, 0.965, 0.965, 0.93, 0.965, 0.97, 0.945, 0.96], [0.555, 0.52, 0.5, 0.52, 0.515, 0.505, 0.5, 0.505, 0.5, 0.52], [0.5, 0.5, 0.53, 0.51, 0.5, 0.56, 0.5, 0.5, 0.57, 0.5], [0.585, 0.535, 0.565, 0.505, 0.7, 0.505, 0.515, 0.55, 0.525, 0.555]], ('French', 'auxiliary'): [[0.45, 0.455, 0.535, 0.475, 0.455, 0.515, 0.505, 0.515, 0.51, 0.525], [0.975, 0.975, 0.94, 0.96, 0.96, 0.96, 0.975, 0.935, 0.965, 0.945], [0.5, 0.5, 0.5, 0.52, 0.51, 0.5, 0.5, 0.505, 0.51, 0.51], [0.52, 0.505, 0.505, 0.51, 0.505, 0.5, 0.

In [None]:
task = 'SA'

# Per-language banner + get_acc_plot call, this time passing
# get_acc_per_prompt_type as the accuracy function.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_type)

------------- English Acc plots------------
{('English', 'active'): [0.6325, 0.955, 0.971, 0.583, 0.9095], ('English', 'passive'): [0.6095, 0.94, 0.971, 0.594, 0.91], ('English', 'auxiliary'): [0.623, 0.9585, 0.97, 0.6045, 0.9105], ('English', 'modal'): [0.6525, 0.95, 0.9635, 0.628, 0.912], ('English', 'common'): [0.6171428571428571, 0.9614285714285714, 0.965, 0.6407142857142857, 0.9128571428571428], ('English', 'rare_synonyms'): [0.64, 0.9557142857142857, 0.9685714285714285, 0.5585714285714286, 0.9114285714285715], ('English', 'identical_modal'): [0.681, 0.943, 0.9665, 0.7085, 0.9105]}


------------- German Acc plots------------
{('German', 'active'): [0.51, 0.5755, 0.6975, 0.5895, 0.4955], ('German', 'passive'): [0.504, 0.615, 0.758, 0.5815, 0.4995], ('German', 'auxiliary'): [0.5025, 0.601, 0.709, 0.5485, 0.4985], ('German', 'modal'): [0.5045, 0.5655, 0.725, 0.5455, 0.4995], ('German', 'common'): [0.5042857142857143, 0.59, 0.7357142857142858, 0.5278571428571428, 0.5], ('German', 'rare_synonyms'): [0.5035714285714286, 0.5807142857142857, 0.7307142857142858, 0.5242857142857142, 0.49142857142857144], ('German', 'identical_modal'): [0.5075, 0.62, 0.7891666666666667, 0.6925, 0.5008333333333334]}


------------- French Acc plots------------
{('French', 'active'): [0.4935, 0.953, 0.5045, 0.515, 0.551], ('French', 'passive'): [0.4975, 0.9625, 0.514, 0.517, 0.554], ('French', 'auxiliary'): [0.494, 0.959, 0.5055, 0.508, 0.5605], ('French', 'modal'): [0.4865, 0.9555, 0.5035, 0.5055, 0.56], ('French', 'common'): [0.48714285714285716, 0.9585714285714285, 0.5021428571428571, 0.5114285714285715, 0.5635714285714286], ('French', 'rare_synonyms'): [0.49857142857142855, 0.9564285714285714, 0.5092857142857142, 0.5085714285714286, 0.5957142857142858], ('French', 'identical_modal'): [0.43, 0.97, 0.5, 0.5108333333333334, 0.6075]}


In [None]:
# task='NLI'
# lang='en'
# prompt_type='active'

# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)

In [None]:
task = 'NLI'

# Switch the task to NLI and repeat the per-language banner + plot sequence.
for _lang_code, _lang_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {_lang_label} Acc plots------------')
    get_acc_plot(data, [_lang_code], models, prompt_types,
                 task, seed, version, fn=get_acc_per_prompt_variation)

------------- English Acc plots------------
{('English', 'active'): [[0.3383838383838384, 0.3787878787878788, 0.35858585858585856, 0.35858585858585856, 0.36363636363636365, 0.3484848484848485, 0.37373737373737376, 0.36363636363636365, 0.41414141414141414, 0.35858585858585856], [0.47474747474747475, 0.4797979797979798, 0.45454545454545453, 0.48484848484848486, 0.4797979797979798, 0.4595959595959596, 0.5151515151515151, 0.4797979797979798, 0.47474747474747475, 0.46464646464646464], [0.6212121212121212, 0.6161616161616161, 0.6212121212121212, 0.6212121212121212, 0.6111111111111112, 0.6262626262626263, 0.6262626262626263, 0.6060606060606061, 0.6111111111111112, 0.6313131313131313], [0.398989898989899, 0.35858585858585856, 0.3787878787878788, 0.398989898989899, 0.3838383838383838, 0.40404040404040403, 0.3888888888888889, 0.46464646464646464, 0.37373737373737376, 0.37373737373737376], [0.3333333333333333, 0.3939393939393939, 0.3686868686868687, 0.3484848484848485, 0.3838383838383838, 0.38383

------------- German Acc plots------------
{('German', 'active'): [[0.36363636363636365, 0.3484848484848485, 0.3484848484848485, 0.3434343434343434, 0.3333333333333333, 0.3434343434343434, 0.3434343434343434, 0.3282828282828283, 0.3333333333333333, 0.3434343434343434], [0.3181818181818182, 0.36363636363636365, 0.3383838383838384, 0.3939393939393939, 0.35353535353535354, 0.3181818181818182, 0.3484848484848485, 0.3181818181818182, 0.32323232323232326, 0.3686868686868687], [0.398989898989899, 0.3434343434343434, 0.35858585858585856, 0.3939393939393939, 0.3333333333333333, 0.4090909090909091, 0.3686868686868687, 0.35858585858585856, 0.37373737373737376, 0.3434343434343434], [0.398989898989899, 0.3434343434343434, 0.37373737373737376, 0.3838383838383838, 0.36363636363636365, 0.42424242424242425, 0.3333333333333333, 0.3686868686868687, 0.35858585858585856, 0.35353535353535354], [0.37373737373737376, 0.36363636363636365, 0.35858585858585856, 0.3484848484848485, 0.3484848484848485, 0.383838383

------------- French Acc plots------------
{('French', 'active'): [[0.3333333333333333, 0.3383838383838384, 0.3282828282828283, 0.3434343434343434, 0.36363636363636365, 0.36363636363636365, 0.3333333333333333, 0.3333333333333333, 0.3282828282828283, 0.31313131313131315], [0.5151515151515151, 0.398989898989899, 0.46464646464646464, 0.46464646464646464, 0.51010101010101, 0.51010101010101, 0.5303030303030303, 0.5, 0.47474747474747475, 0.47474747474747475], [0.3383838383838384, 0.3282828282828283, 0.3333333333333333, 0.3333333333333333, 0.35858585858585856, 0.31313131313131315, 0.3434343434343434, 0.3838383838383838, 0.30808080808080807, 0.3383838383838384], [0.4393939393939394, 0.35353535353535354, 0.3282828282828283, 0.32323232323232326, 0.3838383838383838, 0.40404040404040403, 0.35858585858585856, 0.4090909090909091, 0.40404040404040403, 0.36363636363636365], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.333333

### one-shot 2000

In [None]:
# Evaluation settings for pickle version 62.
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]
seeds = ['42']
seed = seeds[0]
task = 'SA'

version = 62

# Only the German pickle is loaded in this cell. The seed in the file name is
# taken from `seed` (instead of a literal 42) so the loaded file always matches
# the seed that is passed on to the evaluation helpers.
# To load all three languages instead, merge them:
# data = merge_dicts(*(open_data_pickle(f'logits_dict_seed_{seed}_lang_{code}_v{version}.pickle') for code in ('en', 'de', 'fr')))
data = open_data_pickle(f'logits_dict_seed_{seed}_lang_de_v{version}.pickle')

In [None]:
# Dump the key names found in the loaded dictionary; print_dict_keys only
# descends into the first dict-valued entry of each level.
# NOTE: print_dict_keys appends to a shared default list, so re-running this
# cell in the same kernel also returns the keys collected by earlier calls.
print(print_dict_keys(dictionary=data))

['42', 'de', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 

In [None]:
# task='SA'
# lang='en'
# prompt_type='active'
# get_distribution(data, models, lang, task, prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, prompt_type)

In [None]:
task = 'SA'

# English and French plots are disabled here: `data` holds only the German
# pickle in this section. To enable them, load the en/fr pickles as well and
# call get_acc_plot with ['en'] / ['fr'] in the same way.
print('------------- German Acc plots------------')
get_acc_plot(
    data, ['de'], models, prompt_types, task, seed, version,
    fn=get_acc_per_prompt_variation,
)

------------- German Acc plots------------
{('German', 'active'): [[0.512, 0.5075, 0.4985, 0.512, 0.5055, 0.4995, 0.502, 0.5075, 0.5155, 0.506], [0.6055, 0.581, 0.536, 0.5325, 0.506, 0.676, 0.51, 0.608, 0.544, 0.5895], [0.839, 0.7545, 0.7145, 0.61, 0.565, 0.5755, 0.7495, 0.8455, 0.678, 0.6], [0.5795, 0.7145, 0.5015, 0.512, 0.7415, 0.553, 0.5205, 0.5015, 0.794, 0.5015], [0.5005, 0.4995, 0.4995, 0.4985, 0.492, 0.5, 0.499, 0.5, 0.5, 0.499]], ('German', 'passive'): [[0.5065, 0.508, 0.4985, 0.501, 0.5065, 0.504, 0.507, 0.511, 0.5155, 0.5005], [0.618, 0.6375, 0.6365, 0.591, 0.547, 0.637, 0.5135, 0.6375, 0.555, 0.672], [0.8235, 0.808, 0.7425, 0.742, 0.7295, 0.7525, 0.8075, 0.784, 0.708, 0.712], [0.6295, 0.7155, 0.5005, 0.553, 0.5905, 0.539, 0.7185, 0.509, 0.553, 0.5015], [0.5, 0.5, 0.4995, 0.4985, 0.4995, 0.499, 0.4985, 0.5, 0.4995, 0.4995]], ('German', 'auxiliary'): [[0.5055, 0.5055, 0.4985, 0.505, 0.4995, 0.502, 0.505, 0.506, 0.5065, 0.5], [0.6485, 0.6195, 0.536, 0.5205, 0.676, 0.51, 0.581,

In [None]:
# task='NLI'
# lang='en'
# prompt_type='active'

# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='de'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)
# print()
# lang='fr'
# get_distribution(data,models, lang, task, nli=True, prompt_type=prompt_type)

In [None]:
task = 'NLI'

# English and French plots are disabled here: `data` holds only the German
# pickle in this section. To enable them, load the en/fr pickles as well and
# call get_acc_plot with ['en'] / ['fr'] in the same way.
print('------------- German Acc plots------------')
get_acc_plot(
    data, ['de'], models, prompt_types, task, seed, version,
    fn=get_acc_per_prompt_variation,
)

------------- German Acc plots------------
{('German', 'active'): [[0.3308308308308308, 0.3308308308308308, 0.3328328328328328, 0.3383383383383383, 0.3328328328328328, 0.32932932932932935, 0.34034034034034033, 0.33233233233233234, 0.33183183183183185, 0.33233233233233234], [0.34734734734734735, 0.3883883883883884, 0.36436436436436437, 0.38038038038038036, 0.35035035035035034, 0.36286286286286284, 0.3758758758758759, 0.3883883883883884, 0.3843843843843844, 0.3738738738738739], [0.3813813813813814, 0.33983983983983984, 0.34434434434434436, 0.3913913913913914, 0.34384384384384387, 0.3908908908908909, 0.37487487487487486, 0.3508508508508508, 0.33933933933933935, 0.3563563563563564], [0.41291291291291293, 0.38238238238238237, 0.3963963963963964, 0.4174174174174174, 0.36686686686686687, 0.4244244244244244, 0.37787787787787785, 0.4109109109109109, 0.38288288288288286, 0.3933933933933934], [0.35735735735735735, 0.34734734734734735, 0.34284284284284283, 0.33383383383383386, 0.3333333333333333, 

### ABC normalized dim= 0 neutral instead of maybe

In [None]:
# Evaluation settings for pickle version 65.
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]
seeds = ['42']
seed = seeds[0]
task = 'SA'

version = 65

# Load the English, German and French pickles (in that order) and merge them
# into one nested dictionary. The seed in the file name is taken from `seed`
# (instead of a literal 42) so the loaded files always match the seed that is
# passed on to the evaluation helpers. The generator keeps its loop variable
# local, so no extra notebook-level name is created.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_{seed}_lang_{code}_v{version}.pickle')
    for code in ('en', 'de', 'fr')
))


In [None]:
# Dump the key names found in the merged dictionary; print_dict_keys only
# descends into the first dict-valued entry of each level.
# NOTE: print_dict_keys appends to a shared default list, so re-running this
# cell in the same kernel also returns the keys collected by earlier calls.
print(print_dict_keys(dictionary=data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [None]:
task = 'SA'

# One header plus one accuracy plot per language, in the order en, de, fr.
# The loop variables are named so they do not overwrite the notebook-level
# `lang` that other cells set.
for plot_lang, plot_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {plot_label} Acc plots------------')
    get_acc_plot(
        data, [plot_lang], models, prompt_types, task, seed, version,
        fn=get_acc_per_prompt_variation,
    )

------------- English Acc plots------------
{('English', 'active'): [[0.59, 0.555, 0.615, 0.61, 0.635, 0.585, 0.605, 0.61, 0.6, 0.625], [0.93, 0.93, 0.895, 0.945, 0.905, 0.94, 0.94, 0.9, 0.935, 0.945], [0.92, 0.92, 0.96, 0.94, 0.91, 0.945, 0.95, 0.955, 0.97, 0.955], [0.55, 0.565, 0.44, 0.51, 0.53, 0.57, 0.51, 0.465, 0.575, 0.67], [0.81, 0.815, 0.81, 0.775, 0.8, 0.785, 0.795, 0.845, 0.86, 0.815]], ('English', 'passive'): [[0.625, 0.59, 0.625, 0.645, 0.59, 0.59, 0.615, 0.58, 0.575, 0.585], [0.92, 0.93, 0.92, 0.945, 0.94, 0.945, 0.935, 0.9, 0.91, 0.945], [0.91, 0.925, 0.915, 0.94, 0.93, 0.935, 0.94, 0.935, 0.955, 0.935], [0.555, 0.535, 0.465, 0.58, 0.475, 0.59, 0.49, 0.43, 0.575, 0.67], [0.81, 0.835, 0.83, 0.82, 0.835, 0.79, 0.81, 0.84, 0.845, 0.84]], ('English', 'auxiliary'): [[0.6, 0.575, 0.615, 0.605, 0.595, 0.62, 0.61, 0.605, 0.585, 0.575], [0.93, 0.89, 0.875, 0.92, 0.95, 0.945, 0.915, 0.94, 0.885, 0.925], [0.91, 0.94, 0.935, 0.905, 0.95, 0.955, 0.935, 0.95, 0.96, 0.92], [0.57, 0.57, 

------------- German Acc plots------------
{('German', 'active'): [[0.44, 0.485, 0.48, 0.455, 0.445, 0.455, 0.485, 0.475, 0.46, 0.455], [0.57, 0.54, 0.545, 0.5, 0.455, 0.65, 0.695, 0.54, 0.535, 0.52], [0.3, 0.35, 0.365, 0.23, 0.26, 0.25, 0.24, 0.375, 0.34, 0.26], [0.6, 0.64, 0.625, 0.57, 0.685, 0.59, 0.625, 0.61, 0.82, 0.695], [0.59, 0.655, 0.62, 0.58, 0.545, 0.6, 0.605, 0.555, 0.55, 0.6]], ('German', 'passive'): [[0.485, 0.465, 0.47, 0.455, 0.455, 0.48, 0.515, 0.46, 0.445, 0.47], [0.565, 0.585, 0.51, 0.635, 0.505, 0.6, 0.605, 0.55, 0.53, 0.605], [0.3, 0.365, 0.315, 0.29, 0.285, 0.245, 0.315, 0.34, 0.265, 0.26], [0.655, 0.66, 0.585, 0.63, 0.72, 0.655, 0.705, 0.635, 0.795, 0.635], [0.64, 0.59, 0.585, 0.615, 0.62, 0.57, 0.58, 0.605, 0.625, 0.64]], ('German', 'auxiliary'): [[0.475, 0.49, 0.48, 0.49, 0.445, 0.46, 0.485, 0.42, 0.47, 0.51], [0.56, 0.625, 0.56, 0.495, 0.62, 0.675, 0.625, 0.505, 0.47, 0.52], [0.405, 0.415, 0.375, 0.235, 0.25, 0.245, 0.515, 0.23, 0.265, 0.325], [0.66, 0.605, 0.

------------- French Acc plots------------
{('French', 'active'): [[0.43, 0.455, 0.465, 0.39, 0.495, 0.415, 0.455, 0.4, 0.34, 0.435], [0.92, 0.925, 0.855, 0.955, 0.95, 0.96, 0.94, 0.96, 0.955, 0.975], [0.8, 0.795, 0.755, 0.685, 0.82, 0.79, 0.81, 0.74, 0.725, 0.81], [0.525, 0.56, 0.49, 0.53, 0.45, 0.535, 0.545, 0.555, 0.58, 0.465], [0.605, 0.495, 0.495, 0.525, 0.49, 0.535, 0.535, 0.54, 0.56, 0.44]], ('French', 'passive'): [[0.42, 0.41, 0.375, 0.41, 0.475, 0.43, 0.465, 0.42, 0.385, 0.395], [0.955, 0.95, 0.96, 0.94, 0.96, 0.96, 0.925, 0.965, 0.945, 0.96], [0.81, 0.765, 0.725, 0.77, 0.755, 0.825, 0.77, 0.79, 0.725, 0.79], [0.53, 0.49, 0.465, 0.53, 0.48, 0.555, 0.555, 0.575, 0.59, 0.545], [0.545, 0.525, 0.505, 0.54, 0.525, 0.535, 0.53, 0.48, 0.555, 0.53]], ('French', 'auxiliary'): [[0.445, 0.395, 0.465, 0.465, 0.415, 0.445, 0.38, 0.455, 0.455, 0.415], [0.94, 0.97, 0.875, 0.945, 0.965, 0.955, 0.965, 0.945, 0.93, 0.945], [0.805, 0.82, 0.775, 0.85, 0.795, 0.785, 0.785, 0.825, 0.81, 0.845], [0.

In [None]:
task = 'NLI'

# One header plus one accuracy plot per language, in the order en, de, fr.
# The loop variables are named so they do not overwrite the notebook-level
# `lang` that other cells set.
for plot_lang, plot_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {plot_label} Acc plots------------')
    get_acc_plot(
        data, [plot_lang], models, prompt_types, task, seed, version,
        fn=get_acc_per_prompt_variation,
    )

------------- English Acc plots------------
{('English', 'active'): [[0.29797979797979796, 0.29292929292929293, 0.32323232323232326, 0.3484848484848485, 0.31313131313131315, 0.3838383838383838, 0.3434343434343434, 0.3383838383838384, 0.30808080808080807, 0.3181818181818182], [0.4696969696969697, 0.41919191919191917, 0.4696969696969697, 0.47474747474747475, 0.4797979797979798, 0.47474747474747475, 0.4393939393939394, 0.43434343434343436, 0.48484848484848486, 0.4292929292929293], [0.46464646464646464, 0.4393939393939394, 0.45454545454545453, 0.45454545454545453, 0.43434343434343436, 0.4595959595959596, 0.42424242424242425, 0.47474747474747475, 0.45454545454545453, 0.4090909090909091], [0.46464646464646464, 0.42424242424242425, 0.3939393939393939, 0.3787878787878788, 0.3888888888888889, 0.43434343434343436, 0.4494949494949495, 0.3383838383838384, 0.3787878787878788, 0.37373737373737376], [0.29797979797979796, 0.29797979797979796, 0.32323232323232326, 0.29292929292929293, 0.323232323232323

------------- German Acc plots------------
{('German', 'active'): [[0.3888888888888889, 0.36363636363636365, 0.3787878787878788, 0.36363636363636365, 0.37373737373737376, 0.398989898989899, 0.3939393939393939, 0.41919191919191917, 0.3888888888888889, 0.398989898989899], [0.36363636363636365, 0.3484848484848485, 0.3333333333333333, 0.3686868686868687, 0.3838383838383838, 0.3686868686868687, 0.35353535353535354, 0.36363636363636365, 0.3939393939393939, 0.2878787878787879], [0.3484848484848485, 0.29797979797979796, 0.30808080808080807, 0.35858585858585856, 0.3282828282828283, 0.3383838383838384, 0.32323232323232326, 0.3282828282828283, 0.3434343434343434, 0.30303030303030304], [0.43434343434343436, 0.42424242424242425, 0.40404040404040403, 0.36363636363636365, 0.398989898989899, 0.4595959595959596, 0.35858585858585856, 0.35858585858585856, 0.3787878787878788, 0.3888888888888889], [0.3686868686868687, 0.3686868686868687, 0.36363636363636365, 0.3333333333333333, 0.3434343434343434, 0.328282

------------- French Acc plots------------
{('French', 'active'): [[0.41919191919191917, 0.3888888888888889, 0.40404040404040403, 0.3838383838383838, 0.4393939393939394, 0.4444444444444444, 0.3787878787878788, 0.41414141414141414, 0.3787878787878788, 0.3939393939393939], [0.47474747474747475, 0.43434343434343436, 0.4595959595959596, 0.47474747474747475, 0.4797979797979798, 0.4696969696969697, 0.45454545454545453, 0.4797979797979798, 0.4494949494949495, 0.4444444444444444], [0.29797979797979796, 0.36363636363636365, 0.3484848484848485, 0.36363636363636365, 0.30303030303030304, 0.31313131313131315, 0.3333333333333333, 0.30808080808080807, 0.32323232323232326, 0.30808080808080807], [0.3787878787878788, 0.3333333333333333, 0.3484848484848485, 0.36363636363636365, 0.3383838383838384, 0.3686868686868687, 0.4090909090909091, 0.3434343434343434, 0.35353535353535354, 0.37373737373737376], [0.41414141414141414, 0.4090909090909091, 0.36363636363636365, 0.37373737373737376, 0.3939393939393939, 0.3

# Yes/no results for poster

In [11]:
# Evaluation settings for pickle version 82 (this section also includes 't5').
# For a smaller run, restrict the list, e.g. models = ['flan', 't0'].
models = ['bloom', 'bloomz', 'flan', 'llama', 't0', 't5']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]
seeds = ['42']
seed = seeds[0]
task = 'SA'

version = 82

# Load the English, German and French pickles (in that order) and merge them
# into one nested dictionary. The seed in the file name is taken from `seed`
# (instead of a literal 42) so the loaded files always match the seed that is
# passed on to the evaluation helpers. The generator keeps its loop variable
# local, so no extra notebook-level name is created.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_{seed}_lang_{code}_v{version}.pickle')
    for code in ('en', 'de', 'fr')
))


In [12]:
# Dump the key names found in the merged dictionary; print_dict_keys only
# descends into the first dict-valued entry of each level.
# NOTE: print_dict_keys appends to a shared default list, so re-running this
# cell in the same kernel also returns the keys collected by earlier calls.
print(print_dict_keys(dictionary=data))

['42', 'en', 'bloom', 'SA', 'active', 'prompt_id_0', 0, 'yes', 'no', 'diff', 'pred', 'true', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 'acc', 'prompt_id_1'

In [13]:
task = 'SA'

# Accuracy plots for English, German and French, in that order. No section
# headers are printed in this cell (the header prints were commented out).
# The loop variable is named so it does not overwrite the notebook-level
# `lang` that other cells set.
for plot_lang in ('en', 'de', 'fr'):
    get_acc_plot(
        data, [plot_lang], models, prompt_types, task, seed, version,
        fn=get_acc_per_prompt_variation,
    )

In [14]:
task = 'NLI'

# One header plus one accuracy plot per language, in the order en, de, fr.
# The loop variables are named so they do not overwrite the notebook-level
# `lang` that other cells set.
for plot_lang, plot_label in (('en', 'English'), ('de', 'German'), ('fr', 'French')):
    print(f'------------- {plot_label} Acc plots------------')
    get_acc_plot(
        data, [plot_lang], models, prompt_types, task, seed, version,
        fn=get_acc_per_prompt_variation,
    )

------------- English Acc plots------------


------------- German Acc plots------------


------------- French Acc plots------------


In [36]:
# Run get_friedman_test once per prompt type for a single model and language.
# `data` and `task` are not set here; they come from whichever cells were
# executed before this one.
lang = 'de'
model = 'llama'
# NOTE: this rebinds the notebook-level `prompt_types` to a two-element subset.
# The full list is ['active', 'passive', 'auxiliary', 'modal', 'common',
# 'rare_synonyms', 'identical_modal'].
prompt_types = ['active', 'passive']

for prompt_type in prompt_types:
    print(f'-------------Friedman test on prompt {prompt_type} for lang {lang}------------')
    get_friedman_test(data, model, task, lang, prompt_type)
    print()

-------------Friedman test on prompt active for lang de------------
[[0.3333333333333333], [0.3434343434343434], [0.3434343434343434], [0.3434343434343434], [0.3434343434343434], [0.3484848484848485], [0.3333333333333333], [0.3434343434343434], [0.3383838383838384], [0.3383838383838384]]
Friedman chi-square score: 8.999999999999996
p-value: 0.4372741889138674

-------------Friedman test on prompt passive for lang de------------
[[0.3333333333333333], [0.3484848484848485], [0.3434343434343434], [0.3484848484848485], [0.3383838383838384], [0.3333333333333333], [0.3434343434343434], [0.3434343434343434], [0.3383838383838384], [0.35858585858585856]]
Friedman chi-square score: 9.0
p-value: 0.43727418891386693



In [None]:
# Run within_1_class_significance_test once per prompt type for a single model
# and language. `data` and `task` are not set here; they come from whichever
# cells were executed before this one.
lang = 'de'
model = 'llama'
# NOTE: this rebinds the notebook-level `prompt_types` to a two-element subset.
# The full list is ['active', 'passive', 'auxiliary', 'modal', 'common',
# 'rare_synonyms', 'identical_modal'].
prompt_types = ['active', 'passive']

for prompt_type in prompt_types:
    print(f'-------------t-test on prompt {prompt_type} for lang {lang}------------')
    within_1_class_significance_test(data, model, task, lang, prompt_type)
    print()

In [None]:

# Compare two prompt types for one model and language with
# between_2_class_significance_test (labelled as a Mann-Whitney test).
# `data` and `task` are not set here; they come from earlier cells.
model = 'llama'
prompt_type1 = 'active'
prompt_type2 = 'passive'
lang = 'de'

# The header names the two prompt types that are actually compared. It uses
# prompt_type1/prompt_type2 rather than `prompt_type`, which is only a loop
# variable left behind by other cells and is undefined in a fresh kernel.
print(f'-------------Mann-Whitney test on prompts {prompt_type1} vs {prompt_type2} for lang {lang}------------')
between_2_class_significance_test(data, model, task, lang, prompt_type1, prompt_type2)
print()

In [None]:
# Compare two prompt types for one model and language with get_wilcoxon_test.
# `data` and `task` are not set here; they come from earlier cells.
model = 'llama'
lang = 'de'
prompt_type1 = 'active'
prompt_type2 = 'passive'

# The header states which prompt types are compared (the earlier wording,
# "test on for lang", left them out).
print(f'-------------Wilcoxon test on prompts {prompt_type1} vs {prompt_type2} for lang {lang}------------')
get_wilcoxon_test(data, model, task, lang, prompt_type1, prompt_type2)


#### var van yes no again

In [None]:
# Evaluation settings for pickle version 52.
models = ['bloom', 'bloomz', 'flan', 'llama', 't0']
prompt_types = [
    'active', 'passive', 'auxiliary', 'modal',
    'common', 'rare_synonyms', 'identical_modal',
]
seeds = ['42']
seed = seeds[0]
task = 'SA'

version = 52

# Load the English, German and French pickles (in that order) and merge them
# into one nested dictionary. The seed in the file name is taken from `seed`
# (instead of a literal 42) so the loaded files always match the seed that is
# passed on to the evaluation helpers. The generator keeps its loop variable
# local, so no extra notebook-level name is created.
data = merge_dicts(*(
    open_data_pickle(f'logits_dict_seed_{seed}_lang_{code}_v{version}.pickle')
    for code in ('en', 'de', 'fr')
))


In [None]:
# For every model, call get_acc_box_plot for English, German and French (in
# that order), framed by a header line and a separator line.
# The inner loop variable is named so it does not overwrite the notebook-level
# `lang` that other cells set.
for model in models:
    print(f'------------- Acc box plots for model {model} ------------')
    for box_lang in ('en', 'de', 'fr'):
        get_acc_box_plot(data, box_lang, model, prompt_types, task, seed)
    print('=========================================================')
    print()

------------- Acc box plots for model bloom ------------



------------- Acc box plots for model bloomz ------------



------------- Acc box plots for model flan ------------



------------- Acc box plots for model llama ------------



------------- Acc box plots for model t0 ------------



