# Read aggregated results 
The `collect_results.py` script aggregates results from multiple directories. You can view and manipulate the aggregated results to get insights from the data.

In [1]:
import glob
import os
import json
import re
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats
import time

import pandas as pd
from pandas import DataFrame

import sys
# add reference to the few_shot dir
# path_base is the absolute path of the parent directory of sys.path[0].
# NOTE(review): this presumably relies on sys.path[0] being the directory that
# contains this notebook -- confirm for your kernel / launch directory.
path_base = os.path.abspath(os.path.join(sys.path[0], '../'))
print(path_base)
# Insert at index 1 (not 0) so the existing sys.path[0] entry keeps precedence.
sys.path.insert(1, path_base)
from collect_results import view_preferred_metrics, get_first_candidates_prompts, get_first_pred_prompts_example

/private/home/tbmihaylov/fairseq-xlmg/examples/few_shot


Read the raw results file:

In [2]:
# Path to the raw results file (`*.raw.jsonl`) generated with the collect_results.py script.
#
# Exactly ONE assignment below is active. The alternatives are kept as comments
# (together with the command that produced them) so that switching result sets is
# an explicit one-line change instead of relying on "the last assignment wins".

#results_json = "/private/home/tbmihaylov/fairseq-xlmg/gpt3_repro_cb_results.tsv.raw.jsonl"
#results_json = "/private/home/tbmihaylov/fairseq-xlmg/cb_our_models.tsv.raw.jsonl"
#results_json = "/checkpoint/tbmihaylov/few_shot/xnli_experimental/results_200.tsv.raw.jsonl"

# python examples/few_shot/scripts/collect_results.py -i /checkpoint/tbmihaylov/few_shot/xnli_experimental/*smpl.All* -o /checkpoint/tbmihaylov/few_shot/xnli_experimental/results.tsv -v preferred_metrics_mean
#results_json = "/checkpoint/tbmihaylov/few_shot/xnli_experimental/results.tsv.raw.jsonl"

#python examples/few_shot/scripts/collect_results.py -i /checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1 -o /checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1/results.tsv -v preferred_metrics_mean
#results_json = "/checkpoint/tbmihaylov/few_shot/multilingual_xnli_1.3B_gpt3_setting_checkpoints/results.tsv.raw.jsonl"

#results_json = "/checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1_full_calib/results.tsv.raw.jsonl"

# Active result set:
# python examples/few_shot/scripts/collect_results.py -i /checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1_full -o /checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1_full/results.tsv -v preferred_metrics_mean
results_json = "/checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1_full/results.tsv.raw.jsonl"

In [3]:
# When True, keep only the records whose `train_lang` equals their `language`.
only_monolingual = True

# Load the raw results: one JSON object per line (JSON Lines).
results = []
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(results_json, encoding="utf-8") as f_res:
    for line in f_res:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) instead of crashing inside json.loads.
            continue
        item = json.loads(line)
        # Records without this key get 0 so that the column exists for every
        # row (it is used as a pivot index further down in this notebook).
        item.setdefault('run_params::n_eval_samples', 0)

        if only_monolingual and item["train_lang"] != item["language"]:
            continue
        results.append(item)

print(f"{len(results)} items loaded")

450 items loaded


In [4]:
# One row per loaded result record; the record keys become the columns.
df = pd.DataFrame.from_records(results)

The results have the following columns:

In [5]:
# Show every column name of the results table as a plain Python list.
df.columns.tolist()

['model_name',
 'task',
 'language',
 'template',
 'nb_few_shot_samples',
 'calibration_options',
 'calibrator_name',
 'train_set',
 'valid_set',
 'eval_set',
 'train_lang',
 'valid_lang',
 'ppl_common_prefix::scores',
 'ppl_common_prefix::mean',
 'ppl_common_prefix::std',
 'ppl_common_prefix::mean_confidence_interval',
 'ppl_selected_candidate::scores',
 'ppl_selected_candidate::mean',
 'ppl_selected_candidate::std',
 'ppl_selected_candidate::mean_confidence_interval',
 'ppl_full_selected_candidate::scores',
 'ppl_full_selected_candidate::mean',
 'ppl_full_selected_candidate::std',
 'ppl_full_selected_candidate::mean_confidence_interval',
 'ppl_candidates_full_prompt__entailment::scores',
 'ppl_candidates_full_prompt__entailment::mean',
 'ppl_candidates_full_prompt__entailment::std',
 'ppl_candidates_full_prompt__entailment::mean_confidence_interval',
 'ppl_candidates_full_prompt__contradiction::scores',
 'ppl_candidates_full_prompt__contradiction::mean',
 'ppl_candidates_full_prompt_

## Print template verbalized prompts

In [6]:
# Print one verbalized prompt example for every loaded result whose
# `language` is "en", labelled with its task and template.
print("Verbalized examples:")
print("#" * 10)
for item in results:
    # Guard clause: results for any other language are not shown.
    if item["language"] != "en":
        continue
    verbalized_prompt = get_first_pred_prompts_example(item["results_file"])
    print(f"task:{item['task']} template: {item['template']}:\n{verbalized_prompt}")

Verbalized examples:
##########
task:xnli template: xnli_generativenli_sentence_mt:
### Prompt for cand `entailment`:
And he said, Mama, I'm home. Right? Yes, he called his mom as soon as the school bus dropped him off.
##########
task:xnli template: xnli_generativenli_mt:
### Prompt for cand `entailment`:
And he said, Mama, I'm home, right? Yes, he called his mom as soon as the school bus dropped him off.
##########
task:xnli template: xnli_generativenli_sentence__en:
### Prompt for cand `entailment`:
And he said, Mama, I'm home.. Right? Yes, he called his mom as soon as the school bus dropped him off.
##########
task:xnli template: xnli_generativenli__en:
### Prompt for cand `entailment`:
And he said, Mama, I'm home, right? Yes, he called his mom as soon as the school bus dropped him off.
##########
task:xnli template: generativenli:
### Prompt for cand `entailment`:
And he said, Mama, I'm home, right? Yes, he called his mom as soon as the school bus dropped him off.
##########
task:

# Create a custom view using pandas
You can use the pandas api to manipulate the table and create custom views. 

In [9]:
def view_preferred_metrics_mean(df):
    """Build the "preferred metrics, mean only" view of the results table.

    Thin wrapper that calls ``view_preferred_metrics`` (imported from
    ``collect_results``) with a fixed list of run-describing columns and the
    single metric suffix ``"::mean"``.

    Args:
        df: The results DataFrame to build the view from.

    Returns:
        Whatever ``view_preferred_metrics`` returns for these arguments.
    """
    mean_suffixes = ["::mean"]
    run_id_columns = [
        "task",
        "eval_set",
        "run_params::n_eval_samples",
        "language",
        "train_set",
        "train_lang",
        "template",
        "nb_few_shot_samples",
        "calibration",
        "run_params::scoring",
        "model_name",
    ]
    return view_preferred_metrics(df, run_id_columns, mean_suffixes)

df_selected = view_preferred_metrics_mean(df)

# Pivot layout: one row per combination of `index_cols`, one column per value
# of `language`; each cell is the mean of "accuracy::mean" over matching rows.
value_cols = "accuracy::mean"
index_cols = ["model_name", "calibration", "task", "eval_set", "run_params::n_eval_samples", "train_set", "nb_few_shot_samples",
              "template",
             ]
cols = ["language"]
# Name the aggregation with the string alias instead of passing np.mean:
# handing NumPy callables to pandas aggregations is deprecated (FutureWarning
# since pandas 2.1). The resulting top column level is still labelled "mean".
pt = pd.pivot_table(df_selected, values=value_cols, index=index_cols,
                    columns=cols, aggfunc=["mean"])
#pt = pt.swaplevel(0, 1, axis=1).sort_index(axis=1)

# Save the formatted table next to the raw results file and display it.
output_tsv = results_json + ".formatted.tsv"
pt.to_csv(output_tsv, sep="\t")
print(output_tsv)
pt

/checkpoint/tbmihaylov/few_shot/2021_09_multilingual_eval_v1_full/results.tsv.raw.jsonl.formatted.tsv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,language,ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh
model_name,calibration,task,eval_set,run_params::n_eval_samples,train_set,nb_few_shot_samples,template,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
l28_64e_1b24_top1_cc100_combined_roberta,False,xnli,dev,0,test,0,generativenli,41.445783,42.008032,35.140562,43.413655,48.232932,38.192771,44.497992,37.751004,40.240964,42.128514,41.084337,38.634538,34.738956,46.385542,42.570281
l28_64e_1b24_top1_cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli__en,41.445783,42.008032,35.140562,43.413655,48.232932,38.192771,44.497992,37.751004,40.240964,42.128514,41.084337,38.634538,34.738956,46.385542,42.570281
l28_64e_1b24_top1_cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli_mt,33.895582,44.457831,45.742972,41.124498,48.232932,42.891566,44.97992,38.192771,43.534137,40.682731,34.096386,35.62249,32.048193,35.140562,33.895582
l28_64e_1b24_top1_cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli_sentence__en,42.610442,41.606426,33.815261,42.971888,47.269076,35.742972,44.859438,37.550201,43.253012,41.927711,42.168675,37.710843,34.899598,47.148594,45.060241
l28_64e_1b24_top1_cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli_sentence_mt,35.62249,43.052209,35.220884,40.200803,48.514056,44.216867,42.771084,39.236948,36.506024,39.35743,34.939759,37.389558,31.164659,34.899598,34.056225
l28_dense_1b24_2cc100_combined_roberta,False,xnli,dev,0,test,0,generativenli,46.26506,41.204819,34.658635,43.975904,48.473896,34.939759,43.534137,37.911647,42.53012,41.927711,40.321285,38.072289,33.453815,45.421687,43.574297
l28_dense_1b24_2cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli__en,46.26506,41.204819,34.658635,43.975904,48.473896,34.939759,43.534137,37.911647,42.53012,41.927711,40.321285,38.072289,33.453815,45.421687,43.574297
l28_dense_1b24_2cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli_mt,33.614458,45.180723,47.46988,43.73494,48.473896,44.738956,45.180723,39.35743,44.819277,41.767068,33.413655,36.345382,30.963855,36.184739,33.975904
l28_dense_1b24_2cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli_sentence__en,44.899598,39.236948,33.895582,42.851406,48.072289,34.899598,44.698795,37.951807,43.815261,41.164659,40.923695,36.746988,34.216867,46.827309,43.413655
l28_dense_1b24_2cc100_combined_roberta,False,xnli,dev,0,test,0,xnli_generativenli_sentence_mt,35.381526,44.297189,35.903614,41.526104,49.236948,44.016064,43.092369,39.919679,34.618474,40.481928,33.7751,34.658635,30.682731,35.662651,34.016064


# Make your view available to others
If you think that you created a view that might be useful to others, you can add it to `collect_results.py`.
Simply add the `my_custom_view` function to the `display_views` dictionary in [collect_results.py](examples/few_shot/scripts/collect_results.py) and the custom view will be available as `-v my_custom_view_friendly`.

In [None]:
#df[df["run_params::train_sep"] == "\n\n"]