In [7]:
import os
import pandas as pd
from mtqe.utils.paths import EVAL_DIR
from mtqe.utils.tables import create_latex_table

In [8]:
DATA_SPLIT = 'test'
VALUE = 'precision'

In [9]:
folders = os.listdir(EVAL_DIR)

In [10]:
li_max_results = []
li_min_results = []
li_med_results = []
li_mean_results = []
li_ensemble_results = []

In [11]:
for folder in folders:
    path = os.path.join(EVAL_DIR, folder)
    if os.path.isdir(path):
        files = os.listdir(path)
        for file in files:
            df = pd.read_csv(os.path.join(EVAL_DIR, folder, file))
            if file.endswith('ensemble_results.csv'):
                li_ensemble_results.append(df)
            elif file.endswith('max_results.csv'):
                li_max_results.append(df)
            elif file.endswith('min_results.csv'):
                li_min_results.append(df)
            elif file.endswith('median_results.csv'):
                li_med_results.append(df)
            elif file.endswith('mean_results.csv'):
                li_mean_results.append(df)

df_ensemble = pd.concat(li_ensemble_results)
df_max = pd.concat(li_max_results)
df_min = pd.concat(li_min_results)
df_med = pd.concat(li_med_results)
df_mean = pd.concat(li_mean_results)

In [12]:
df_max.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
0,1,0.63,0.493971,0.589404,0.55625,0.572347,0.867,best,en-cs,dev,2710,supervised,train_monolingual_auth_data
1,6,0.5,0.478523,0.515306,0.63125,0.567416,0.846,default,en-cs,dev,89,supervised,train_monolingual_auth_data
2,5,0.1,0.395604,0.345646,0.81875,0.486085,0.723,extreme,en-cs,dev,928,supervised,train_monolingual_auth_data
3,19,0.6,0.469422,0.561224,0.582011,0.571429,0.835,best,en-cs,test,928,supervised,train_monolingual_auth_data
4,24,0.5,0.471864,0.553922,0.597884,0.575064,0.833,default,en-cs,test,42,supervised,train_monolingual_auth_data


In [13]:
df_min.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
0,4,0.6,0.476773,0.559006,0.5625,0.560748,0.859,best,en-cs,dev,928,supervised,train_monolingual_auth_data
1,3,0.5,0.469649,0.507614,0.625,0.560224,0.843,default,en-cs,dev,928,supervised,train_monolingual_auth_data
2,8,0.1,0.330841,0.271719,0.91875,0.419401,0.593,extreme,en-cs,dev,89,supervised,train_monolingual_auth_data
3,22,0.64,0.451957,0.566667,0.539683,0.552846,0.835,best,en-cs,test,89,supervised,train_monolingual_auth_data
4,18,0.5,0.45742,0.522321,0.619048,0.566586,0.821,default,en-cs,test,928,supervised,train_monolingual_auth_data


In [14]:
def update_exp_group_names(row):
    if row['exp_group'][-4:] == 'enja':
        row['exp_group'] = row['exp_group'][:-5]
    return row

In [15]:
df_max = df_max.apply(update_exp_group_names, axis=1)
df_min = df_min.apply(update_exp_group_names, axis=1)
df_med = df_med.apply(update_exp_group_names, axis=1)
df_mean = df_mean.apply(update_exp_group_names, axis=1)
df_ensemble = df_ensemble.apply(update_exp_group_names, axis=1)

In [16]:
df_max_best = df_max[(df_max['threshold_strategy']=='best') & (df_max['split'] == DATA_SPLIT)]
df_min_default = df_min[(df_min['threshold_strategy']=='default') & (df_min['split'] == DATA_SPLIT)]
df_med_default = df_med[(df_med['threshold_strategy']=='best') & (df_med['split'] == DATA_SPLIT)]
df_mean_default = df_mean[(df_mean['threshold_strategy']=='default') & (df_mean['split'] == DATA_SPLIT)]
df_ensemble_best = df_ensemble[(df_ensemble['threshold_strategy']=='best') & (df_ensemble['split'] == DATA_SPLIT)]

In [17]:
df_min_default.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
4,18,0.5,0.45742,0.522321,0.619048,0.566586,0.821,default,en-cs,test,928,supervised,train_monolingual_auth_data
10,48,0.5,0.462632,0.68018,0.524306,0.592157,0.792,default,en-de,test,89,supervised,train_monolingual_auth_data
16,81,0.5,0.265752,0.418033,0.322785,0.364286,0.822,default,en-zh,test,2710,supervised,train_monolingual_auth_data
4,15,0.5,0.47162,0.575269,0.566138,0.570667,0.839,default,en-cs,test,928,supervised,train_multilingual_auth_data_all
10,54,0.5,0.480281,0.662698,0.579861,0.618519,0.794,default,en-de,test,107,supervised,train_multilingual_auth_data_all


In [18]:
df_max_best = pd.pivot_table(df_max_best, index='exp_group', columns='language_pair', values=VALUE)
df_max_best = df_max_best.rename_axis(None, axis=0)

In [19]:
df_min_default = pd.pivot_table(df_min_default, index='exp_group', columns='language_pair', values=VALUE)
df_min_default = df_min_default.rename_axis(None, axis=0)

In [20]:
df_med_default = pd.pivot_table(df_med_default, index='exp_group', columns='language_pair', values=VALUE)
df_med_default = df_med_default.rename_axis(None, axis=0)

In [21]:
df_mean_default = pd.pivot_table(df_mean_default, index='exp_group', columns='language_pair', values=VALUE)
df_mean_default = df_mean_default.rename_axis(None, axis=0)

In [22]:
df_ensemble_best = pd.pivot_table(df_ensemble_best, index='exp_group', columns='language_pair', values=VALUE)
df_ensemble_best = df_ensemble_best.rename_axis(None, axis=0)

In [23]:
df_med_default.head()

language_pair,en-cs,en-de,en-ja,en-zh
second_step_base_auth_data,0.578125,0.759494,0.259542,0.402235
second_step_base_demetr_auth_data,0.6,0.699571,0.222222,0.403361
second_step_base_demetr_data,0.585106,0.689956,0.263566,0.392593
second_step_base_wmt22_data,,0.699531,,
second_step_base_wmt22_small_data,,0.726316,,


In [24]:
col_names = ['experiment_group', 'en-cs', 'en-de', 'en-ja', 'en-zh']
di_med_default = {df_med_default.index[i]: [df_med_default.iloc[i,0], df_med_default.iloc[i,1], df_med_default.iloc[i,2], df_med_default.iloc[i,3]] for i in range(len(df_med_default))}
di_mean_default = {df_mean_default.index[i]: [df_mean_default.iloc[i,0], df_mean_default.iloc[i,1], df_mean_default.iloc[i,2], df_mean_default.iloc[i,3]] for i in range(len(df_mean_default))}
li_med_default = create_latex_table(col_names, di_med_default)
li_mean_default = create_latex_table(col_names, di_mean_default)

In [25]:
print(li_med_default)

\begin{table}
\centering
\begin{tabular}{c|ccccc}
 & EXPERIMENT_GROUP & EN-CS & EN-DE & EN-JA & EN-ZH\\
\hline
second_step_base_auth_data & 0.578 & 0.759 & 0.260 & 0.402 \\
second_step_base_demetr_auth_data & 0.600 & 0.700 & 0.222 & 0.403 \\
second_step_base_demetr_data & 0.585 & 0.690 & 0.264 & 0.393 \\
second_step_base_wmt22_data & nan & 0.700 & nan & nan \\
second_step_base_wmt22_small_data & nan & 0.726 & nan & nan \\
train_monolingual_auth_data & 0.570 & 0.725 & 0.271 & 0.418 \\
train_monolingual_auth_data_calibrated & 0.568 & 0.677 & 0.247 & 0.385 \\
train_multilingual_auth_data_all & 0.545 & 0.609 & 0.217 & 0.428 \\
train_multilingual_auth_data_single & 0.605 & 0.623 & 0.246 & 0.404 \\
train_multilingual_auth_demetr_data_single & 0.597 & 0.599 & 0.327 & 0.382 \\
train_multilingual_auth_wmt22_data_single & nan & 0.764 & nan & nan \\
\hline
\end{tabular}
\end{table}



In [26]:
di_max_best = {df_max_best.index[i]: [df_max_best.iloc[i,0], df_max_best.iloc[i,1], df_max_best.iloc[i,2], df_max_best.iloc[i,3]] for i in range(len(df_max_best))}
di_min_default = {df_min_default.index[i]: [df_min_default.iloc[i,0], df_min_default.iloc[i,1], df_min_default.iloc[i,2], df_min_default.iloc[i,3]] for i in range(len(df_min_default))}
li_max_best = create_latex_table(df_max_best.columns.names, di_max_best)
li_min_default = create_latex_table(df_min_default.columns.names, di_min_default)

In [27]:
di_ensemble_best = {df_ensemble_best.index[i]: [df_ensemble_best.iloc[i,0], df_ensemble_best.iloc[i,1], df_ensemble_best.iloc[i,2], df_ensemble_best.iloc[i,3]] for i in range(len(df_ensemble_best))}
li_ensemble_best = create_latex_table(['experiment group', 'en-cs', 'en-de', 'en-ja', 'en-zh'], di_ensemble_best)

In [28]:
print('\n'.join(li_med_default))

\
b
e
g
i
n
{
t
a
b
l
e
}


\
c
e
n
t
e
r
i
n
g


\
b
e
g
i
n
{
t
a
b
u
l
a
r
}
{
c
|
c
c
c
c
c
}


 
&
 
E
X
P
E
R
I
M
E
N
T
_
G
R
O
U
P
 
&
 
E
N
-
C
S
 
&
 
E
N
-
D
E
 
&
 
E
N
-
J
A
 
&
 
E
N
-
Z
H
\
\


\
h
l
i
n
e


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
5
7
8
 
&
 
0
.
7
5
9
 
&
 
0
.
2
6
0
 
&
 
0
.
4
0
2
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
d
e
m
e
t
r
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
6
0
0
 
&
 
0
.
7
0
0
 
&
 
0
.
2
2
2
 
&
 
0
.
4
0
3
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
d
e
m
e
t
r
_
d
a
t
a
 
&
 
0
.
5
8
5
 
&
 
0
.
6
9
0
 
&
 
0
.
2
6
4
 
&
 
0
.
3
9
3
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
w
m
t
2
2
_
d
a
t
a
 
&
 
n
a
n
 
&
 
0
.
7
0
0
 
&
 
n
a
n
 
&
 
n
a
n
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
w
m
t
2
2
_
s
m
a
l
l
_
d
a
t
a
 
&
 
n
a
n
 
&
 
0
.
7
2
6
 
&
 
n
a
n
 
&
 
n
a
n
 
\
\


t
r
a
i
n
_
m
o
n
o
l
i
n
g
u
a
l
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
5
7
0
 
&
 
0
.
7
2
5
 
&
 
0
.
2
7
1
 
&
 
0
.
4
1
8
 
\
\


t
r
a
i
n
_
m
o
n
o
l


In [29]:
di_max_best

{'second_step_base_auth_data': [0.5947368144989014,
  0.7558139562606812,
  0.28125,
  0.4043715894222259],
 'second_step_base_demetr_auth_data': [0.5891891717910767,
  0.6884615421295166,
  0.2777777910232544,
  0.4181818068027496],
 'second_step_base_demetr_data': [0.5879396796226501,
  0.6896551847457886,
  0.28125,
  0.4060150384902954],
 'second_step_base_wmt22_data': [nan, 0.7178217768669128, nan, nan],
 'second_step_base_wmt22_small_data': [nan, 0.7115384340286255, nan, nan],
 'train_monolingual_auth_data': [0.5612244606018066,
  0.642405092716217,
  0.2741935551166534,
  0.4331210255622864],
 'train_monolingual_auth_data_calibrated': [0.5675675868988037,
  0.6772727370262146,
  0.2467532455921173,
  0.385185182094574],
 'train_multilingual_auth_data_all': [0.6296296119689941,
  0.6209912300109863,
  0.2318840622901916,
  0.4727272689342499],
 'train_multilingual_auth_data_single': [0.6067415475845337,
  0.6231002807617188,
  0.3000000119209289,
  0.4043715894222259],
 'train_mu

In [30]:
di_min_default

{'baseline': [0.5810810923576355, 0.6960784196853638, 0.2666666805744171, 0.5],
 'prompt_basic': [0.4974619150161743,
  0.6549707651138306,
  0.2222222238779068,
  0.3940886557102203],
 'prompt_gemba': [0.5026454925537109,
  0.577464759349823,
  0.1554252207279205,
  0.3258064389228821],
 'second_step_base_auth_data': [0.578125,
  0.7654321193695068,
  0.25,
  0.386904776096344],
 'second_step_base_demetr_auth_data': [0.5370370149612427,
  0.7053571343421936,
  0.1923076957464218,
  0.3934426307678222],
 'second_step_base_demetr_data': [0.5409091114997864,
  0.681034505367279,
  0.2543859779834747,
  0.3868613243103027],
 'second_step_base_wmt22_data': [nan, 0.6328125, nan, nan],
 'second_step_base_wmt22_small_data': [nan, 0.6995074152946472, nan, nan],
 'train_monolingual_auth_data': [0.5223214030265808,
  0.6801801919937134,
  0.2321428507566452,
  0.4180327951908111],
 'train_monolingual_auth_data_calibrated': [0.6823529601097107,
  0.6772727370262146,
  0.1666666716337204,
  0.5660

In [31]:
li_max_best

'\\begin{table}\n\\centering\n\\begin{tabular}{c|c}\n & LANGUAGE_PAIR\\\\\n\\hline\nsecond_step_base_auth_data & 0.595 & 0.756 & 0.281 & 0.404 \\\\\nsecond_step_base_demetr_auth_data & 0.589 & 0.688 & 0.278 & 0.418 \\\\\nsecond_step_base_demetr_data & 0.588 & 0.690 & 0.281 & 0.406 \\\\\nsecond_step_base_wmt22_data & nan & 0.718 & nan & nan \\\\\nsecond_step_base_wmt22_small_data & nan & 0.712 & nan & nan \\\\\ntrain_monolingual_auth_data & 0.561 & 0.642 & 0.274 & 0.433 \\\\\ntrain_monolingual_auth_data_calibrated & 0.568 & 0.677 & 0.247 & 0.385 \\\\\ntrain_multilingual_auth_data_all & 0.630 & 0.621 & 0.232 & 0.473 \\\\\ntrain_multilingual_auth_data_single & 0.607 & 0.623 & 0.300 & 0.404 \\\\\ntrain_multilingual_auth_demetr_data_single & 0.573 & 0.597 & 0.281 & 0.381 \\\\\ntrain_multilingual_auth_wmt22_data_single & nan & 0.724 & nan & nan \\\\\n\\hline\n\\end{tabular}\n\\end{table}\n'