In [1]:
# Import from the public IPython.display module: importing `display` through
# IPython.core.display is a deprecated location.
from IPython.display import HTML, display

# Inject a CSS rule so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import glob
import os
import pickle
import sys

import pandas as pd

# Put the sibling `evaluation` directory first on the import path so the
# `evaluate` module imported below can be resolved (presumably it lives there).
# Uses `sys` directly instead of reaching it through `os.sys`, which only works
# because the `os` module happens to import `sys` internally.
sys.path.insert(0, '../evaluation')

from evaluate import evaluate_all_systems, preprocess_all_models

In [89]:
# Load every model's `system_evaluation.csv` from the dev models directory and
# combine them into one table indexed by model name.
dev_dir = '../data/models/dev'
index_cols = ['subset', 'references', 'metric']

# `models` and `dfs` are filled in lockstep so that the i-th name always labels
# the i-th frame. Entries without an evaluation file (unfinished runs, or plain
# files such as the results.csv this notebook writes into the same directory)
# are skipped in BOTH lists; otherwise `pd.concat(..., keys=models)` would pair
# frames with the wrong model names.
models = []
dfs = []

for path in glob.glob(os.path.join(dev_dir, '*')):
    eval_path = os.path.join(path, 'system_evaluation.csv')
    if not os.path.isfile(eval_path):
        continue
    models.append(os.path.basename(path))
    dfs.append(pd.read_csv(eval_path, index_col=index_cols))

# After concat the index is (model, subset, references, metric). Drop the
# `subset` and `references` levels, then pivot `metric` into the columns.
scores_df = pd.concat(dfs, keys=models).reset_index(level=[1, 2], drop=True).unstack()
# unstack() leaves a two-level column index (csv column, metric); keep only the
# metric names.
scores_df.columns = scores_df.columns.droplevel()

In [90]:
# Build a table with one row per model holding the contents of its params.pkl.
params_dfs = []
for model in models:
    params_path = f'../data/models/dev/{model}/params.pkl'
    # `models` can contain entries that are not model directories (for example
    # the results.csv written into this same directory further down), so skip
    # anything without a params file instead of crashing on open().
    if not os.path.isfile(params_path):
        continue
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at parameter files produced by trusted runs.
    with open(params_path, 'rb') as f:
        params_dfs.append(pd.DataFrame([pickle.load(f)], index=[model]))

params_df = pd.concat(params_dfs)

In [91]:
# Join scores and parameters on their shared index (the model name); only
# models present in both tables are kept (default inner join).
df = scores_df.merge(params_df, left_index=True, right_index=True)

# Metric columns first, then the parameter columns to show next to them.
cols = [
    'bleu', 'meteor', 'ter',
    'dp_scorer', 'max_dp',
    'sa_scorer', 'max_sa',
    'tems_lm_name', 'tems_lm_n', 'max_tems',
    'referrer',
    'txs_lm_name', 'txs_lm_n',
]

# Display the selected columns ordered from highest to lowest BLEU.
ranked = df.sort_values('bleu', ascending=False)
ranked.loc[:, cols]

Unnamed: 0,bleu,meteor,ter,dp_scorer,max_dp,sa_scorer,max_sa,tems_lm_name,tems_lm_n,max_tems,referrer,txs_lm_name,txs_lm_n
8562436464462099980,57.66,0.433749,0.410661,ltr_lasso,2,random,4,lower,3,2,counter,lower,3
37677081755143658,57.64,0.432542,0.415537,ltr_lasso,2,inv_ltr_lasso,4,lower,3,2,counter,lower,3
4692801175971517865,57.61,0.436153,0.417633,ltr_lasso,2,inv_ltr_lasso,4,lower,3,2,counter,lower,6
3489570017219532767,57.31,0.435099,0.414805,ltr_lasso,2,random,4,lower,3,2,counter,lower,6
710608634447711577,57.15,0.432661,0.420168,ltr_lasso,2,ltr_lasso,4,lower,3,2,counter,lower,3
-5320594571956842214,56.97,0.435921,0.421631,ltr_lasso,2,ltr_lasso,4,lower,3,2,counter,lower,6
7697843703963551123,56.85,0.431124,0.420217,ltr_lasso,2,inv_ltr_lasso,4,lower,6,2,counter,lower,3
7268716319870250105,56.78,0.429774,0.419486,ltr_lasso,2,random,4,lower,6,2,counter,lower,3
-9198725233750128576,56.52,0.433739,0.426945,ltr_lasso,2,inv_ltr_lasso,4,lower,6,2,counter,lower,6
2617089377173533434,56.37,0.433599,0.426653,ltr_lasso,2,random,4,lower,6,2,counter,lower,6


In [92]:
# Persist the merged scores + parameters table.
# NOTE(review): this file is written into the same directory that the loading
# cell globs for model folders, so it shows up as an extra entry on a re-run.
results_path = '../data/models/dev/results.csv'
df.to_csv(results_path)

In [93]:
# Summary statistics of BLEU for each value of `dp_scorer`.
df.groupby('dp_scorer')['bleu'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
dp_scorer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
inv_ltr_lasso,225.0,34.061778,9.765977,17.58,26.5,32.33,42.33,54.54
ltr_lasso,225.0,35.3916,10.660946,17.7,26.95,33.22,43.1,57.66
random,225.0,34.566178,10.098182,17.66,26.41,32.77,42.18,56.07


In [94]:
# Summary statistics of BLEU for each value of `sa_scorer`.
df.groupby('sa_scorer')['bleu'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sa_scorer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
inv_ltr_lasso,225.0,34.426044,10.113723,17.61,26.74,32.53,42.63,57.64
ltr_lasso,225.0,34.783067,10.230664,17.58,26.58,32.78,42.26,57.15
random,225.0,34.810444,10.24024,17.63,26.73,32.77,42.5,57.66


In [95]:
# Summary statistics of BLEU for each value of `tems_lm_name`.
df.groupby('tems_lm_name')['bleu'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
tems_lm_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
inv_lower,270.0,28.535593,7.49993,17.58,24.62,27.465,35.965,44.67
lower,270.0,40.847519,9.165403,26.06,32.8025,42.265,47.9475,57.66
random,135.0,34.599704,9.334048,20.77,29.68,32.37,43.895,52.43


In [96]:
# Summary statistics of BLEU for each value of `txs_lm_name`.
df.groupby('txs_lm_name')['bleu'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
txs_lm_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
inv_lower,270.0,29.624185,8.176243,17.58,25.365,27.465,39.625,43.67
lower,270.0,40.426593,9.679128,22.87,32.8025,39.825,48.7625,57.66
random,135.0,33.26437,8.891935,19.19,28.585,30.63,42.315,49.3


In [97]:
# Summary statistics of BLEU for each value of `referrer`.
df.groupby('referrer')['bleu'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
referrer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
counter,225.0,40.966933,9.208281,27.29,32.29,42.18,47.78,57.66
inv_counter,225.0,25.414089,5.168019,17.58,20.92,26.11,29.39,34.15
preprocess_so,225.0,37.638533,8.103931,25.1,29.69,39.55,43.5,51.26
