In [1]:
import os
import pandas as pd
from mtqe.utils.paths import EVAL_DIR
from mtqe.utils.tables import create_latex_table

In [None]:
DATA_SPLIT = 'test'
VALUE = 'precision'

In [2]:
folders = os.listdir(EVAL_DIR)

In [3]:
li_max_results = []
li_min_results = []
li_med_results = []
li_mean_results = []
li_ensemble_results = []

In [4]:
for folder in folders:
    files = os.listdir(os.path.join(EVAL_DIR, folder))
    for file in files:
        df = pd.read_csv(os.path.join(EVAL_DIR, folder, file))
        if file.endswith('ensemble_results.csv'):
            li_ensemble_results.append(df)
        elif file.endswith('max_results.csv'):
            li_max_results.append(df)
        elif file.endswith('min_results.csv'):
            li_min_results.append(df)
        elif file.endswith('median_results.csv'):
            li_med_results.append(df)
        elif file.endswith('mean_results.csv'):
            li_mean_results.append(df)

df_ensemble = pd.concat(li_ensemble_results)
df_max = pd.concat(li_max_results)
df_min = pd.concat(li_min_results)
df_med = pd.concat(li_med_results)
df_mean = pd.concat(li_mean_results)

In [5]:
df_max.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
0,1,0.63,0.493971,0.55625,0.589404,0.572347,0.867,best,en-cs,dev,2710,supervised,train_monolingual_auth_data
1,6,0.5,0.478523,0.63125,0.515306,0.567416,0.846,default,en-cs,dev,89,supervised,train_monolingual_auth_data
2,5,0.1,0.395604,0.81875,0.345646,0.486085,0.723,extreme,en-cs,dev,928,supervised,train_monolingual_auth_data
3,19,0.6,0.469422,0.582011,0.561224,0.571429,0.835,best,en-cs,test,928,supervised,train_monolingual_auth_data
4,24,0.5,0.471864,0.597884,0.553922,0.575064,0.833,default,en-cs,test,42,supervised,train_monolingual_auth_data


In [6]:
df_min.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
0,4,0.6,0.476773,0.5625,0.559006,0.560748,0.859,best,en-cs,dev,928,supervised,train_monolingual_auth_data
1,3,0.5,0.469649,0.625,0.507614,0.560224,0.843,default,en-cs,dev,928,supervised,train_monolingual_auth_data
2,8,0.1,0.330841,0.91875,0.271719,0.419401,0.593,extreme,en-cs,dev,89,supervised,train_monolingual_auth_data
3,22,0.64,0.451957,0.539683,0.566667,0.552846,0.835,best,en-cs,test,89,supervised,train_monolingual_auth_data
4,18,0.5,0.45742,0.619048,0.522321,0.566586,0.821,default,en-cs,test,928,supervised,train_monolingual_auth_data


In [7]:
def update_exp_group_names(row):
    if row['exp_group'][-4:] == 'enja':
        row['exp_group'] = row['exp_group'][:-5]
    return row

In [8]:
df_max = df_max.apply(update_exp_group_names, axis=1)
df_min = df_min.apply(update_exp_group_names, axis=1)
df_med = df_med.apply(update_exp_group_names, axis=1)
df_mean = df_mean.apply(update_exp_group_names, axis=1)
df_ensemble = df_ensemble.apply(update_exp_group_names, axis=1)

In [9]:
df_max_best = df_max[(df_max['threshold_strategy']=='best') & (df_max['split'] == DATA_SPLIT)]
df_min_default = df_min[(df_min['threshold_strategy']=='default') & (df_min['split'] == DATA_SPLIT)]
df_med_default = df_med[(df_med['threshold_strategy']=='best') & (df_med['split'] == DATA_SPLIT)]
df_mean_default = df_mean[(df_mean['threshold_strategy']=='default') & (df_mean['split'] == DATA_SPLIT)]
df_ensemble_best = df_ensemble[(df_ensemble['threshold_strategy']=='best') & (df_ensemble['split'] == DATA_SPLIT)]

In [10]:
df_min_default.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
4,18,0.5,0.45742,0.619048,0.522321,0.566586,0.821,default,en-cs,test,928,supervised,train_monolingual_auth_data
10,48,0.5,0.462632,0.524306,0.68018,0.592157,0.792,default,en-de,test,89,supervised,train_monolingual_auth_data
16,81,0.5,0.265752,0.322785,0.418033,0.364286,0.822,default,en-zh,test,2710,supervised,train_monolingual_auth_data
4,15,0.5,0.47162,0.566138,0.575269,0.570667,0.839,default,en-cs,test,928,supervised,train_multilingual_auth_data_all
10,54,0.5,0.480281,0.579861,0.662698,0.618519,0.794,default,en-de,test,107,supervised,train_multilingual_auth_data_all


In [11]:
df_max_best = pd.pivot_table(df_max_best, index='exp_group', columns='language_pair', values=VALUE)
df_max_best = df_max_best.rename_axis(None, axis=0)

In [12]:
df_min_default = pd.pivot_table(df_min_default, index='exp_group', columns='language_pair', values=VALUE)
df_min_default = df_min_default.rename_axis(None, axis=0)

In [13]:
df_med_default = pd.pivot_table(df_med_default, index='exp_group', columns='language_pair', values=VALUE)
df_med_default = df_med_default.rename_axis(None, axis=0)

In [14]:
df_mean_default = pd.pivot_table(df_mean_default, index='exp_group', columns='language_pair', values=VALUE)
df_mean_default = df_mean_default.rename_axis(None, axis=0)

In [15]:
df_ensemble_best = pd.pivot_table(df_ensemble_best, index='exp_group', columns='language_pair', values=VALUE)
df_ensemble_best = df_ensemble_best.rename_axis(None, axis=0)

In [16]:
df_med_default.head()

language_pair,en-cs,en-de,en-ja,en-zh
second_step_base_auth_data,0.484499,0.451037,0.251246,0.312664
second_step_base_demetr_auth_data,0.483496,0.500943,0.132522,0.247233
second_step_base_demetr_data,0.486823,0.483764,0.254678,0.254091
second_step_base_wmt22_data,,0.47279,,
second_step_base_wmt22_small_data,,0.468797,,


In [17]:
col_names = ['experiment_group', 'en-cs', 'en-de', 'en-ja', 'en-zh']
di_med_default = {df_med_default.index[i]: [df_med_default.iloc[i,0], df_med_default.iloc[i,1], df_med_default.iloc[i,2], df_med_default.iloc[i,3]] for i in range(len(df_med_default))}
di_mean_default = {df_mean_default.index[i]: [df_mean_default.iloc[i,0], df_mean_default.iloc[i,1], df_mean_default.iloc[i,2], df_mean_default.iloc[i,3]] for i in range(len(df_mean_default))}
li_med_default = create_latex_table(col_names, di_med_default)
li_mean_default = create_latex_table(col_names, di_mean_default)

In [18]:
print(li_med_default)

\begin{table}
\centering
\begin{tabular}{c|ccccc}
 & EXPERIMENT_GROUP & EN-CS & EN-DE & EN-JA & EN-ZH\\
\hline
second_step_base_auth_data & 0.484 & 0.451 & 0.251 & 0.313 \\
second_step_base_demetr_auth_data & 0.483 & 0.501 & 0.133 & 0.247 \\
second_step_base_demetr_data & 0.487 & 0.484 & 0.255 & 0.254 \\
second_step_base_wmt22_data & nan & 0.473 & nan & nan \\
second_step_base_wmt22_small_data & nan & 0.469 & nan & nan \\
train_monolingual_auth_data & 0.465 & 0.472 & 0.173 & 0.280 \\
train_monolingual_auth_data_calibrated & 0.461 & 0.457 & 0.173 & 0.246 \\
train_multilingual_auth_data_all & 0.468 & 0.518 & 0.273 & 0.296 \\
train_multilingual_auth_data_single & 0.484 & 0.509 & 0.214 & 0.306 \\
train_multilingual_auth_demetr_data_single & 0.410 & 0.507 & 0.202 & 0.252 \\
train_multilingual_auth_wmt22_data_single & nan & 0.483 & nan & nan \\
\hline
\end{tabular}
\end{table}



In [19]:
di_max_best = {df_max_best.index[i]: [df_max_best.iloc[i,0], df_max_best.iloc[i,1], df_max_best.iloc[i,2], df_max_best.iloc[i,3]] for i in range(len(df_max_best))}
di_min_default = {df_min_default.index[i]: [df_min_default.iloc[i,0], df_min_default.iloc[i,1], df_min_default.iloc[i,2], df_min_default.iloc[i,3]] for i in range(len(df_min_default))}
li_max_best = create_latex_table(df_max_best.columns.names, di_max_best)
li_min_default = create_latex_table(df_min_default.columns.names, di_min_default)

In [20]:
di_ensemble_best = {df_ensemble_best.index[i]: [df_ensemble_best.iloc[i,0], df_ensemble_best.iloc[i,1], df_ensemble_best.iloc[i,2], df_ensemble_best.iloc[i,3]] for i in range(len(df_ensemble_best))}
li_ensemble_best = create_latex_table(['experiment group', 'en-cs', 'en-de', 'en-ja', 'en-zh'], di_ensemble_best)

In [21]:
print('\n'.join(li_med_default))

\
b
e
g
i
n
{
t
a
b
l
e
}


\
c
e
n
t
e
r
i
n
g


\
b
e
g
i
n
{
t
a
b
u
l
a
r
}
{
c
|
c
c
c
c
c
}


 
&
 
E
X
P
E
R
I
M
E
N
T
_
G
R
O
U
P
 
&
 
E
N
-
C
S
 
&
 
E
N
-
D
E
 
&
 
E
N
-
J
A
 
&
 
E
N
-
Z
H
\
\


\
h
l
i
n
e


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
4
8
4
 
&
 
0
.
4
5
1
 
&
 
0
.
2
5
1
 
&
 
0
.
3
1
3
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
d
e
m
e
t
r
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
4
8
3
 
&
 
0
.
5
0
1
 
&
 
0
.
1
3
3
 
&
 
0
.
2
4
7
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
d
e
m
e
t
r
_
d
a
t
a
 
&
 
0
.
4
8
7
 
&
 
0
.
4
8
4
 
&
 
0
.
2
5
5
 
&
 
0
.
2
5
4
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
w
m
t
2
2
_
d
a
t
a
 
&
 
n
a
n
 
&
 
0
.
4
7
3
 
&
 
n
a
n
 
&
 
n
a
n
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
w
m
t
2
2
_
s
m
a
l
l
_
d
a
t
a
 
&
 
n
a
n
 
&
 
0
.
4
6
9
 
&
 
n
a
n
 
&
 
n
a
n
 
\
\


t
r
a
i
n
_
m
o
n
o
l
i
n
g
u
a
l
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
4
6
5
 
&
 
0
.
4
7
2
 
&
 
0
.
1
7
3
 
&
 
0
.
2
8
0
 
\
\


t
r
a
i
n
_
m
o
n
o
l


In [22]:
di_max_best

{'second_step_base_auth_data': [0.5019233822822571,
  0.470853716135025,
  0.2782382369041443,
  0.3196837306022644],
 'second_step_base_demetr_auth_data': [0.4870023727416992,
  0.524198055267334,
  0.2709347307682037,
  0.2507804632186889],
 'second_step_base_demetr_data': [0.507897675037384,
  0.4875065982341766,
  0.2782382369041443,
  0.2663236260414123],
 'second_step_base_wmt22_data': [nan, 0.4775583446025848, nan, nan],
 'second_step_base_wmt22_small_data': [nan, 0.4793201386928558, nan, nan],
 'train_monolingual_auth_data': [0.4694221913814544,
  0.5319602489471436,
  0.1800963580608368,
  0.3255181610584259],
 'train_monolingual_auth_data_calibrated': [0.4606903791427612,
  0.4565430879592895,
  0.1734403818845749,
  0.2460674941539764],
 'train_multilingual_auth_data_all': [0.4948434829711914,
  0.5313258767127991,
  0.2791102230548858,
  0.3033549785614013],
 'train_multilingual_auth_data_single': [0.4965242147445678,
  0.5181735754013062,
  0.2793377041816711,
  0.31968373

In [23]:
di_min_default

{'prompt_GEMBA': [0.3867392241954803,
  0.3325538039207458,
  0.192509189248085,
  0.3083758652210235],
 'prompt_basic': [0.3902434408664703,
  0.3680577874183655,
  0.2394516915082931,
  0.3266701400279999],
 'second_step_base_auth_data': [0.4844988286495209,
  0.4635652601718902,
  0.1547008603811264,
  0.2820091843605041],
 'second_step_base_demetr_auth_data': [0.466608852148056,
  0.4951819181442261,
  0.0941618606448173,
  0.2406207919120788],
 'second_step_base_demetr_data': [0.4773678779602051,
  0.4770433008670807,
  0.2253767997026443,
  0.2500015497207641],
 'second_step_base_wmt22_data': [nan, 0.4466632902622223, nan, nan],
 'second_step_base_wmt22_small_data': [nan, 0.4586276710033417, nan, nan],
 'train_monolingual_auth_data': [0.4574199020862579,
  0.4626324474811554,
  0.1332859694957733,
  0.2657517790794372],
 'train_monolingual_auth_data_calibrated': [0.3840743601322174,
  0.4565430879592895,
  0.0340091958642005,
  0.264654129743576],
 'train_multilingual_auth_data_a

In [24]:
li_max_best

'\\begin{table}\n\\centering\n\\begin{tabular}{c|c}\n & LANGUAGE_PAIR\\\\\n\\hline\nsecond_step_base_auth_data & 0.502 & 0.471 & 0.278 & 0.320 \\\\\nsecond_step_base_demetr_auth_data & 0.487 & 0.524 & 0.271 & 0.251 \\\\\nsecond_step_base_demetr_data & 0.508 & 0.488 & 0.278 & 0.266 \\\\\nsecond_step_base_wmt22_data & nan & 0.478 & nan & nan \\\\\nsecond_step_base_wmt22_small_data & nan & 0.479 & nan & nan \\\\\ntrain_monolingual_auth_data & 0.469 & 0.532 & 0.180 & 0.326 \\\\\ntrain_monolingual_auth_data_calibrated & 0.461 & 0.457 & 0.173 & 0.246 \\\\\ntrain_multilingual_auth_data_all & 0.495 & 0.531 & 0.279 & 0.303 \\\\\ntrain_multilingual_auth_data_single & 0.497 & 0.518 & 0.279 & 0.320 \\\\\ntrain_multilingual_auth_demetr_data_single & 0.424 & 0.519 & 0.227 & 0.261 \\\\\ntrain_multilingual_auth_wmt22_data_single & nan & 0.496 & nan & nan \\\\\n\\hline\n\\end{tabular}\n\\end{table}\n'