In [2]:
import altair as alt
from altair import datum
import functools
import pandas as pd

import sys
sys.path.append('/private/home/victorialin/Projects/fairseq-py')
from scripts.visualization.multilingual_few_shot_eval_utils import *


# Maps a short label to the path of a tab-separated results file; each listed
# file is read with pd.read_csv further down. Commented-out entries are
# currently not loaded.
result_tables = {
    # "en-6.7B-dense-en": "/checkpoint/victorialin/few_shot/6.7B_gpt3_setting_en_tasks/results.tsv",
    # "multi-7.5B-dense-en": "/checkpoint/victorialin/few_shot/dense_7.5B_lang30_new_cc100_xl_unigram_en_tasks/results.tsv",
    "multi-7.5B-dense-multi": "/checkpoint/victorialin/few_shot/dense_7.5B_lang30_new_cc100_xl_unigram_mutli_tasks_v1/results.tsv",
    # "multi-200B-moe-en": "/checkpoint/victorialin/few_shot/moe_200B_lang30_new_cc100_xl_unigram_en_tasks//results.tsv",
    # "multi-200B-moe-multi": "/checkpoint/victorialin/few_shot/moe_200B_lang30_new_cc100_xl_unigram_mutli_tasks_v1/results.tsv"
}

# Task names selected by all_multi_eval_tasks().
_all_multi_eval_tasks = ['xnli', 'xcopa', 'pawsx']


# filtering conditions
def all_multi_eval_tasks(df):
    """Return a boolean mask marking rows whose 'task' is listed in
    _all_multi_eval_tasks."""
    return df['task'].isin(_all_multi_eval_tasks)

def xnli_template_selection(df):
    """Return a boolean mask marking rows whose 'template' is one of the
    three xnli_generativenli variants (__en, _mt, _ht)."""
    wanted_templates = [
        'xnli_generativenli__en',
        'xnli_generativenli_mt',
        'xnli_generativenli_ht',
    ]
    return df['template'].isin(wanted_templates)

def xcopa_template_selection(df):
    """Return a boolean mask marking rows whose 'template' is xcopa__en,
    xcopa_mt or xcopa_simple."""
    wanted_templates = ['xcopa__en', 'xcopa_mt', 'xcopa_simple']
    return df['template'].isin(wanted_templates)

def all_checkpoints(df):
    """Return a boolean mask marking rows for the
    dense_7.5B_lang30_new_cc100_xl_unigram model at step 238000."""
    is_target_model = df.model == 'dense_7.5B_lang30_new_cc100_xl_unigram'
    is_target_step = df.step == 238000
    return is_target_model & is_target_step

def num_few_shot_samples(df):
    """Return a boolean mask marking rows with nb_few_shot_samples equal to 0."""
    shot_counts = df['nb_few_shot_samples']
    return shot_counts.eq(0)

# (task, eval_set, train_set) combinations accepted by every *_eval_splits
# selector below. The selectors differ only in the storycloze / xcopa / xnli
# rows they add on top of these.
_COMMON_EVAL_SPLITS = (
    ('arcchallenge', 'dev', 'train'),
    ('arceasy', 'dev', 'train'),
    ('copa', 'val', 'train'),
    ('hellaswag', 'val', 'train'),
    ('openbookqa', 'test', 'train'),
    ('piqa', 'valid', 'train'),
    ('winogrande', 'dev', 'train_xl'),
    ('pawsx', 'test', 'dev'),
)


def _eval_split_mask(df, extra_splits):
    """Build the row mask shared by the *_eval_splits selectors.

    A row is selected when its (task, eval_set, train_set) matches one of
    _COMMON_EVAL_SPLITS or one of ``extra_splits``, or when it is an
    xwinograd test/test row whose language is neither 'fr' nor 'zh'.

    Args:
        df: DataFrame with 'task', 'eval_set', 'train_set' and 'language'
            columns.
        extra_splits: iterable of (task, eval_set, train_set) tuples.

    Returns:
        Boolean Series aligned with ``df``.
    """
    task = df['task']
    eval_set = df['eval_set']
    train_set = df['train_set']
    language = df['language']

    # xwinograd is the only task that is additionally filtered by language.
    mask = (
        (task == 'xwinograd')
        & ((language != 'fr') & (language != 'zh'))
        & (eval_set == 'test')
        & (train_set == 'test')
    )
    for task_name, eval_name, train_name in _COMMON_EVAL_SPLITS + tuple(extra_splits):
        mask = mask | (
            (task == task_name) & (eval_set == eval_name) & (train_set == train_name)
        )
    return mask


def final_eval_splits(df):
    """Select the common splits plus storycloze (test2016 and val2016 eval),
    xcopa test/val and xnli dev/dev."""
    return _eval_split_mask(df, [
        ('storycloze', 'test2016', 'val2016'),
        ('storycloze', 'val2016', 'val2016'),
        ('xcopa', 'test', 'val'),
        ('xnli', 'dev', 'dev'),
    ])


def en_final_eval_splits(df):
    """Select the common splits plus storycloze test2016/val2016,
    xcopa test/val and xnli dev/dev."""
    return _eval_split_mask(df, [
        ('storycloze', 'test2016', 'val2016'),
        ('xcopa', 'test', 'val'),
        ('xnli', 'dev', 'dev'),
    ])


def multi_final_eval_splits(df):
    """Select the common splits plus storycloze val2016/val2016,
    xcopa test/val and xnli test/dev."""
    return _eval_split_mask(df, [
        ('storycloze', 'val2016', 'val2016'),
        ('xcopa', 'test', 'val'),
        ('xnli', 'test', 'dev'),
    ])


def multi_dev_eval_splits(df):
    """Select the common splits plus storycloze val2016/val2016,
    xcopa val/val and xnli dev/dev."""
    return _eval_split_mask(df, [
        ('storycloze', 'val2016', 'val2016'),
        ('xcopa', 'val', 'val'),
        ('xnli', 'dev', 'dev'),
    ])

def en_only(df):
    """Return a boolean mask marking rows whose 'language' is 'en'."""
    languages = df['language']
    return languages.eq('en')

# Read every table listed in result_tables, derive the model/step columns
# from model_name, and keep only the rows that pass all four filters.
dfs = {}
for key, table_path in result_tables.items():
    # .iloc[:, 1:] discards the first column of the file.
    df = pd.read_csv(table_path, sep='\t', index_col=False).iloc[:, 1:]
    df = df.drop_duplicates()
    # model_name is split on '__step': the left part becomes the model name,
    # the right part is parsed as the integer training step.
    df['model'] = df.model_name.apply(lambda x: x.split('__step')[0])
    df['step'] = df.model_name.apply(lambda x: int(x.split('__step')[1]))
    selected_rows = (
        all_multi_eval_tasks(df)
        & all_checkpoints(df)
        & num_few_shot_samples(df)
        & multi_dev_eval_splits(df)
    )
    # Take an explicit copy: the column assignments below would otherwise be
    # made on a slice of the unfiltered frame (SettingWithCopyWarning).
    df = df[selected_rows].copy()

    df['meta_task'] = df.task.apply(lambda x: x.split('__', 1)[0])
    df['model_id'] = df.model_name.apply(lambda x: x.split('__step', 1)[0])
    dfs[key] = df

# Stack the per-table frames and tag every row with the resource level that
# get_resource_level reports for its language.
multi_result_df = pd.concat(dfs.values())
multi_result_df['resource_level'] = multi_result_df.language.apply(lambda x: get_resource_level(x))

In [3]:
# Restrict to the selected XNLI templates, pivot accuracy::mean into a
# template x language table, and print it as LaTeX with one decimal place.
xnli_rows = xnli_template_selection(multi_result_df)
xnli_result_df = multi_result_df.loc[xnli_rows]
xnli_result_table = xnli_result_df.pivot_table(
    values=['accuracy::mean'],
    index=['template'],
    columns=['language'],
)
xnli_latex = xnli_result_table.to_latex(float_format="{:0.1f}".format)
print(xnli_latex)

\begin{tabular}{lrrrrrrrrrrrrrrr}
\toprule
{} & \multicolumn{15}{l}{accuracy::mean} \\
language &             ar &   bg &   de &   el &   en &   es &   fr &   hi &   ru &   sw &   th &   tr &   ur &   vi &   zh \\
template               &                &      &      &      &      &      &      &      &      &      &      &      &      &      &      \\
\midrule
xnli\_generativenli\_\_en &           47.5 & 50.0 & 42.4 & 47.1 & 54.5 & 38.2 & 50.7 & 43.4 & 47.2 & 46.2 & 46.1 & 44.6 & 42.7 & 47.5 & 45.0 \\
xnli\_generativenli\_ht  &           46.0 & 49.3 & 49.5 & 47.1 & 54.5 & 50.0 & 50.4 & 37.5 & 47.4 & 44.4 & 34.9 & 46.3 & 33.7 & 45.2 & 34.1 \\
xnli\_generativenli\_mt  &           46.5 & 49.3 & 37.6 & 45.4 & 54.5 & 37.5 & 47.8 & 38.8 & 40.7 & 44.4 & 33.3 & 37.1 & 33.5 & 35.4 & 34.6 \\
\bottomrule
\end{tabular}



In [4]:
# Restrict to the selected XCOPA templates, pivot accuracy::mean into a
# template x language table, and print it as LaTeX with one decimal place.
xcopa_rows = xcopa_template_selection(multi_result_df)
xcopa_result_df = multi_result_df.loc[xcopa_rows]
xcopa_result_table = xcopa_result_df.pivot_table(
    values=['accuracy::mean'],
    index=['template'],
    columns=['language'],
)
xcopa_latex = xcopa_result_table.to_latex(float_format="{:0.1f}".format)
print(xcopa_latex)

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} & \multicolumn{12}{l}{accuracy::mean} \\
language &             et &   ht &   id &   it &   qu &   ru &   sw &   ta &   th &   tr &   vi &   zh \\
template     &                &      &      &      &      &      &      &      &      &      &      &      \\
\midrule
xcopa\_\_en    &           63.0 & 57.0 & 54.0 & 59.0 & 70.0 & 64.0 & 54.0 & 59.0 & 53.0 & 65.0 & 63.0 & 63.0 \\
xcopa\_mt     &           60.0 & 60.0 & 51.0 & 69.0 & 69.0 & 59.0 & 51.0 & 59.0 & 48.0 & 64.0 & 64.0 & 62.0 \\
xcopa\_simple &           61.0 & 55.0 & 55.0 & 56.0 & 69.0 & 59.0 & 52.0 & 54.0 & 51.0 & 63.0 & 55.0 & 62.0 \\
\bottomrule
\end{tabular}



In [6]:
# Display the XCOPA template x language pivot table at full precision.
xcopa_result_table

Unnamed: 0_level_0,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean
language,et,ht,id,it,qu,ru,sw,ta,th,tr,vi,zh
template,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
xcopa__en,63.0,57.0,54.0,59.0,70.0,64.0,54.0,59.0,53.0,65.0,63.0,63.0
xcopa_mt,60.0,60.0,51.0,69.0,69.0,59.0,51.0,59.0,48.0,64.0,64.0,62.0
xcopa_simple,61.0,55.0,55.0,56.0,69.0,59.0,52.0,54.0,51.0,63.0,55.0,62.0


In [19]:
# Display the filtered XNLI rows that feed the XNLI pivot table.
xnli_result_df

Unnamed: 0,task,eval_set,eval_examples_cnt::mean,language,train_set,train_lang,template,nb_few_shot_samples,nb_trunc_few_shot_samples::mean,calibration,...,accuracy_2,accuracy_3,accuracy_4,accuracy_sum::mean,accuracy_sum::std,model,step,meta_task,model_id,resource_level
383,xnli,dev,2490.0,en,dev,en,xnli_generativenli_ht,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
384,xnli,dev,2490.0,en,dev,en,xnli_generativenli__en,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
385,xnli,dev,2490.0,en,dev,en,xnli_generativenli_mt,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
386,xnli,dev,2490.0,fr,dev,fr,xnli_generativenli_ht,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
387,xnli,dev,2490.0,fr,dev,fr,xnli_generativenli_mt,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
388,xnli,dev,2490.0,fr,dev,fr,xnli_generativenli__en,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
389,xnli,dev,2490.0,es,dev,es,xnli_generativenli_ht,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
390,xnli,dev,2490.0,es,dev,es,xnli_generativenli_mt,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
391,xnli,dev,2490.0,es,dev,es,xnli_generativenli__en,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high
392,xnli,dev,2490.0,de,dev,de,xnli_generativenli_ht,0,0.0,False,...,,,,,,dense_7.5B_lang30_new_cc100_xl_unigram,238000,xnli,dense_7.5B_lang30_new_cc100_xl_unigram,high


\begin{tabular}{lrrrrrrrrrrrrrrr}
\toprule
{} & \multicolumn{15}{l}{accuracy::mean} \\
language &             ar &   bg &   de &   el &   en &   es &   fr &   hi &   ru &   sw &   th &   tr &   ur &   vi &   zh \\
template               &                &      &      &      &      &      &      &      &      &      &      &      &      &      &      \\
\midrule
xnli\_generativenli\_\_en &           47.5 & 50.0 & 42.4 & 47.1 & 54.5 & 38.2 & 50.7 & 43.4 & 47.2 & 46.2 & 46.1 & 44.6 & 42.7 & 47.5 & 45.0 \\
xnli\_generativenli\_ht  &           46.0 & 49.3 & 49.5 & 47.1 & 54.5 & 50.0 & 50.4 & 37.5 & 47.4 & 44.4 & 34.9 & 46.3 & 33.7 & 45.2 & 34.1 \\
xnli\_generativenli\_mt  &           46.5 & 49.3 & 37.6 & 45.4 & 54.5 & 37.5 & 47.8 & 38.8 & 40.7 & 44.4 & 33.3 & 37.1 & 33.5 & 35.4 & 34.6 \\
\bottomrule
\end{tabular}



In [7]:
# NOTE(review): `result_table` is not assigned in any cell visible here; the
# rendered output matches xnli_result_table, so this is presumably a leftover
# name from an earlier run of the notebook. Confirm before re-running.
result_table

Unnamed: 0_level_0,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean,accuracy::mean
language,ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh
template,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
xnli_generativenli__en,47.51004,49.959839,42.369478,47.068273,54.497992,38.192771,50.722892,43.373494,47.188755,46.2249,46.064257,44.618474,42.730924,47.51004,45.02008
xnli_generativenli_ht,46.024096,49.277108,49.518072,47.148594,54.497992,49.959839,50.441767,37.46988,47.389558,44.37751,34.899598,46.26506,33.654618,45.220884,34.056225
xnli_generativenli_mt,46.506024,49.277108,37.550201,45.381526,54.497992,37.51004,47.751004,38.835341,40.722892,44.37751,33.253012,37.148594,33.493976,35.421687,34.618474


In [101]:
# df_mapping = pd.DataFrame({'task': all_eval_tasks})
# sort_mapping = df_mapping.reset_index().set_index('task')
# Columns whose combination identifies one evaluation configuration; rows
# are grouped by these before being ordered by shot count.
_FEW_SHOT_GROUP_KEYS = [
    'task',
    'eval_set',
    'language',
    'train_set',
    'train_lang',
    'template',
    'calibration',
    'model_name',
]


def _sort_groups_and_save(frame, tsv_path):
    """Sort each configuration group by nb_few_shot_samples and write it out.

    Args:
        frame: DataFrame containing the _FEW_SHOT_GROUP_KEYS columns and
            'nb_few_shot_samples'.
        tsv_path: destination path of the tab-separated output file.

    Returns:
        The grouped-and-sorted DataFrame that was written.
    """
    grouped = frame.groupby(_FEW_SHOT_GROUP_KEYS)
    ordered = grouped.apply(lambda a: a.sort_values('nb_few_shot_samples'))
    ordered.to_csv(tsv_path, sep='\t')
    return ordered


# Merge the English and multilingual results, dropping rows that agree on
# every column listed in `subset`.
result_df = pd.concat([en_result_df, multi_result_df]).drop_duplicates(
    subset=[
        'task',
        'eval_set',
        'language',
        'train_set',
        'train_lang',
        'template',
        'nb_few_shot_samples',
        'calibration',
        'run_params::scoring',
        'model_name',
        "accuracy::mean",
        "accuracy::std",
        'model',
        'step',
        'meta_task',
        'model_id',
        'num_tokens_B',
        'num_EN_tokens_B',
        'num_gpu_days'
    ],
    ignore_index=True
)
filtered_result_df = result_df[
    last_checkpoint(result_df)
    & num_few_shot_samples(result_df)
    & template_selection(result_df)
    & multilingual_checkpoints(result_df)
]
grouped_filtered_result_df = _sort_groups_and_save(
    filtered_result_df,
    '/checkpoint/victorialin/few_shot/dense_7.5B_lang30_new_cc100_xl_unigram_en_tasks/few_shot.tsv',
)

# Multilingual tasks on their final evaluation splits.
multi_filtered_result_df = filtered_result_df[
    multi_final_eval_splits(filtered_result_df)
    & all_multi_eval_tasks(filtered_result_df)
]
multi_grouped_filtered_result_df = _sort_groups_and_save(
    multi_filtered_result_df,
    '/checkpoint/victorialin/few_shot/dense_7.5B_lang30_new_cc100_xl_unigram_en_tasks/multi_few_shot.tsv',
)

# English tasks on their final evaluation splits, English rows only.
en_filtered_result_df = filtered_result_df[
    en_final_eval_splits(filtered_result_df)
    & all_en_eval_tasks(filtered_result_df)
    & en_only(filtered_result_df)
]
en_grouped_filtered_result_df = _sort_groups_and_save(
    en_filtered_result_df,
    '/checkpoint/victorialin/few_shot/dense_7.5B_lang30_new_cc100_xl_unigram_en_tasks/en_few_shot.tsv',
)