In [8]:
import os
import pandas as pd
from mtqe.utils.paths import EVAL_DIR
from mtqe.utils.tables import create_latex_table

In [9]:
folders = os.listdir(EVAL_DIR)

In [10]:
li_max_results = []
li_min_results = []
li_med_results = []
li_mean_results = []
li_ensemble_results = []

In [11]:
for folder in folders:
    files = os.listdir(os.path.join(EVAL_DIR, folder))
    for file in files:
        df = pd.read_csv(os.path.join(EVAL_DIR, folder, file))
        if file.endswith('ensemble_results.csv'):
            li_ensemble_results.append(df)
        elif file.endswith('max_results.csv'):
            li_max_results.append(df)
        elif file.endswith('min_results.csv'):
            li_min_results.append(df)
        elif file.endswith('median_results.csv'):
            li_med_results.append(df)
        elif file.endswith('mean_results.csv'):
            li_mean_results.append(df)

df_ensemble = pd.concat(li_ensemble_results)
df_max = pd.concat(li_max_results)
df_min = pd.concat(li_min_results)
df_med = pd.concat(li_med_results)
df_mean = pd.concat(li_mean_results)

In [12]:
df_max.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
0,1,0.63,0.493971,0.55625,0.589404,0.572347,0.867,best,en-cs,dev,2710,supervised,train_monolingual_auth_data
1,6,0.5,0.478523,0.63125,0.515306,0.567416,0.846,default,en-cs,dev,89,supervised,train_monolingual_auth_data
2,5,0.1,0.395604,0.81875,0.345646,0.486085,0.723,extreme,en-cs,dev,928,supervised,train_monolingual_auth_data
3,19,0.6,0.469422,0.582011,0.561224,0.571429,0.835,best,en-cs,test,928,supervised,train_monolingual_auth_data
4,24,0.5,0.471864,0.597884,0.553922,0.575064,0.833,default,en-cs,test,42,supervised,train_monolingual_auth_data


In [13]:
df_min.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
0,4,0.6,0.476773,0.5625,0.559006,0.560748,0.859,best,en-cs,dev,928,supervised,train_monolingual_auth_data
1,3,0.5,0.469649,0.625,0.507614,0.560224,0.843,default,en-cs,dev,928,supervised,train_monolingual_auth_data
2,8,0.1,0.330841,0.91875,0.271719,0.419401,0.593,extreme,en-cs,dev,89,supervised,train_monolingual_auth_data
3,22,0.64,0.451957,0.539683,0.566667,0.552846,0.835,best,en-cs,test,89,supervised,train_monolingual_auth_data
4,18,0.5,0.45742,0.619048,0.522321,0.566586,0.821,default,en-cs,test,928,supervised,train_monolingual_auth_data


In [14]:
def update_exp_group_names(row):
    if row['exp_group'][-4:] == 'enja':
        row['exp_group'] = row['exp_group'][:-5]
    return row

In [15]:
df_max = df_max.apply(update_exp_group_names, axis=1)
df_min = df_min.apply(update_exp_group_names, axis=1)
df_med = df_med.apply(update_exp_group_names, axis=1)
df_mean = df_mean.apply(update_exp_group_names, axis=1)
df_ensemble = df_ensemble.apply(update_exp_group_names, axis=1)

In [17]:
df_max_best = df_max[(df_max['threshold_strategy']=='best') & (df_max['split'] == 'test')]
df_min_default = df_min[(df_min['threshold_strategy']=='default') & (df_min['split'] == 'test')]
df_med_default = df_med[(df_med['threshold_strategy']=='default') & (df_med['split'] == 'test')]
df_mean_default = df_mean[(df_mean['threshold_strategy']=='default') & (df_mean['split'] == 'test')]
df_ensemble_best = df_ensemble[(df_ensemble['threshold_strategy']=='best') & (df_ensemble['split'] == 'test')]

In [43]:
df_min_default.head()

Unnamed: 0.1,Unnamed: 0,threshold,MCC,precision,recall,f1,accuracy,threshold_strategy,language_pair,split,seed,model_type,exp_group
4,18,0.5,0.45742,0.619048,0.522321,0.566586,0.821,default,en-cs,test,928,supervised,train_monolingual_auth_data
10,48,0.5,0.462632,0.524306,0.68018,0.592157,0.792,default,en-de,test,89,supervised,train_monolingual_auth_data
16,81,0.5,0.265752,0.322785,0.418033,0.364286,0.822,default,en-zh,test,2710,supervised,train_monolingual_auth_data
4,15,0.5,0.47162,0.566138,0.575269,0.570667,0.839,default,en-cs,test,928,supervised,train_multilingual_auth_data_all
10,54,0.5,0.480281,0.579861,0.662698,0.618519,0.794,default,en-de,test,107,supervised,train_multilingual_auth_data_all


In [44]:
df_max_best = pd.pivot_table(df_max_best, index='exp_group', columns='language_pair', values='MCC')
df_max_best = df_max_best.rename_axis(None, axis=0)

In [45]:
df_min_default = pd.pivot_table(df_min_default, index='exp_group', columns='language_pair', values='MCC')
df_min_default = df_min_default.rename_axis(None, axis=0)

In [18]:
df_med_default = pd.pivot_table(df_med_default, index='exp_group', columns='language_pair', values='MCC')
df_med_default = df_med_default.rename_axis(None, axis=0)

In [19]:
df_mean_default = pd.pivot_table(df_mean_default, index='exp_group', columns='language_pair', values='MCC')
df_mean_default = df_mean_default.rename_axis(None, axis=0)

In [46]:
df_ensemble_best = pd.pivot_table(df_ensemble_best, index='exp_group', columns='language_pair', values='MCC')
df_ensemble_best = df_ensemble_best.rename_axis(None, axis=0)

In [26]:
df_med_default.head()

language_pair,en-cs,en-de,en-ja,en-zh
second_step_base_auth_data,0.488725,0.47226,0.242937,0.304009
second_step_base_demetr_auth_data,0.472047,0.503202,0.137031,0.243742
second_step_base_demetr_data,0.489043,0.483764,0.254678,0.269977
second_step_base_wmt22_data,,0.462655,,
second_step_base_wmt22_small_data,,0.470218,,


In [29]:
col_names = ['experiment_group', 'en-cs', 'en-de', 'en-ja', 'en-zh']
di_med_default = {df_med_default.index[i]: [df_med_default.iloc[i,0], df_med_default.iloc[i,1], df_med_default.iloc[i,2], df_med_default.iloc[i,3]] for i in range(len(df_med_default))}
di_mean_default = {df_mean_default.index[i]: [df_mean_default.iloc[i,0], df_mean_default.iloc[i,1], df_mean_default.iloc[i,2], df_mean_default.iloc[i,3]] for i in range(len(df_mean_default))}
li_med_default = create_latex_table(col_names, di_med_default)
li_mean_default = create_latex_table(col_names, di_mean_default)

In [32]:
print(li_mean_default)

\begin{table}
\centering
\begin{tabular}{c|ccccc}
 & EXPERIMENT_GROUP & EN-CS & EN-DE & EN-JA & EN-ZH\\
\hline
second_step_base_auth_data & 0.488 & 0.472 & 0.217 & 0.302 \\
second_step_base_demetr_auth_data & 0.473 & 0.505 & 0.174 & 0.243 \\
second_step_base_demetr_data & 0.490 & 0.483 & 0.255 & 0.265 \\
second_step_base_wmt22_data & nan & 0.463 & nan & nan \\
second_step_base_wmt22_small_data & nan & 0.470 & nan & nan \\
train_monolingual_auth_data & 0.463 & 0.476 & 0.166 & 0.287 \\
train_multilingual_auth_data_all & 0.478 & 0.488 & 0.248 & 0.310 \\
train_multilingual_auth_data_single & 0.481 & 0.512 & 0.219 & 0.304 \\
train_multilingual_auth_demetr_data_single & 0.412 & 0.460 & 0.192 & 0.256 \\
train_multilingual_auth_wmt22_data_single & nan & 0.454 & nan & nan \\
\hline
\end{tabular}
\end{table}



In [24]:
di_max_best = {df_max_best.index[i]: [df_max_best.iloc[i,0], df_max_best.iloc[i,1], df_max_best.iloc[i,2], df_max_best.iloc[i,3]] for i in range(len(df_max_best))}
di_min_default = {df_min_default.index[i]: [df_min_default.iloc[i,0], df_min_default.iloc[i,1], df_min_default.iloc[i,2], df_min_default.iloc[i,3]] for i in range(len(df_min_default))}
li_max_best = create_latex_table(df_max_best.columns.names, di_max_best)
li_min_default = create_latex_table(df_min_default.columns.names, di_min_default)

AttributeError: 'NoneType' object has no attribute 'upper'

In [49]:
di_ensemble_best = {df_ensemble_best.index[i]: [df_ensemble_best.iloc[i,0], df_ensemble_best.iloc[i,1], df_ensemble_best.iloc[i,2], df_ensemble_best.iloc[i,3]] for i in range(len(df_ensemble_best))}
li_ensemble_best = create_latex_table(['experiment group', 'en-cs', 'en-de', 'en-ja', 'en-zh'], di_ensemble_best)

In [21]:
print('\n'.join(li_med_default))

\
b
e
g
i
n
{
t
a
b
l
e
}


\
c
e
n
t
e
r
i
n
g


\
b
e
g
i
n
{
t
a
b
u
l
a
r
}
{
c
|
c
}


 
&
 
L
A
N
G
U
A
G
E
_
P
A
I
R
\
\


\
h
l
i
n
e


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
4
8
9
 
&
 
0
.
4
7
2
 
&
 
0
.
2
4
3
 
&
 
0
.
3
0
4
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
d
e
m
e
t
r
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
4
7
2
 
&
 
0
.
5
0
3
 
&
 
0
.
1
3
7
 
&
 
0
.
2
4
4
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
d
e
m
e
t
r
_
d
a
t
a
 
&
 
0
.
4
8
9
 
&
 
0
.
4
8
4
 
&
 
0
.
2
5
5
 
&
 
0
.
2
7
0
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
w
m
t
2
2
_
d
a
t
a
 
&
 
n
a
n
 
&
 
0
.
4
6
3
 
&
 
n
a
n
 
&
 
n
a
n
 
\
\


s
e
c
o
n
d
_
s
t
e
p
_
b
a
s
e
_
w
m
t
2
2
_
s
m
a
l
l
_
d
a
t
a
 
&
 
n
a
n
 
&
 
0
.
4
7
0
 
&
 
n
a
n
 
&
 
n
a
n
 
\
\


t
r
a
i
n
_
m
o
n
o
l
i
n
g
u
a
l
_
a
u
t
h
_
d
a
t
a
 
&
 
0
.
4
5
9
 
&
 
0
.
4
7
8
 
&
 
0
.
1
7
3
 
&
 
0
.
2
8
0
 
\
\


t
r
a
i
n
_
m
u
l
t
i
l
i
n
g
u
a
l
_
a
u
t
h
_
d
a
t
a
_
a
l
l
 
&
 
0
.
4
7
8
 
&
 
0
.
4
8
6
 
&


In [26]:
di_max_best

{'second_step_base_auth_data': [0.5019233822822571,
  0.470853716135025,
  0.2782382369041443,
  0.3196837306022644],
 'second_step_base_demetr_auth_data': [0.4870023727416992,
  0.524198055267334,
  0.2709347307682037,
  0.2507804632186889],
 'second_step_base_demetr_data': [0.507897675037384,
  0.4875065982341766,
  0.2782382369041443,
  0.2663236260414123],
 'second_step_base_wmt22_data': [nan, 0.4775583446025848, nan, nan],
 'second_step_base_wmt22_small_data': [nan, 0.4793201386928558, nan, nan],
 'train_monolingual_auth_data': [0.4694221913814544,
  0.5319602489471436,
  0.1800963580608368,
  0.3255181610584259],
 'train_multilingual_auth_data_all': [0.4948434829711914,
  0.5313258767127991,
  0.2791102230548858,
  0.3033549785614013],
 'train_multilingual_auth_data_single': [0.4965242147445678,
  0.5181735754013062,
  0.2793377041816711,
  0.3196837306022644],
 'train_multilingual_auth_demetr_data_single': [0.4235520362854004,
  0.519127607345581,
  0.2265895456075668,
  0.26143

In [28]:
di_min_default

{'second_step_base_auth_data': [0.493360698223114,
  0.4797698557376861,
  0.2748351991176605,
  0.3127940893173218],
 'second_step_base_demetr_auth_data': [0.4823073446750641,
  0.5208618640899658,
  0.2617437541484833,
  0.246474027633667],
 'second_step_base_demetr_data': [0.499349445104599,
  0.4875065982341766,
  0.276479423046112,
  0.2755996286869049],
 'second_step_base_wmt22_data': [nan, 0.4841794967651367, nan, nan],
 'second_step_base_wmt22_small_data': [nan, 0.4793201386928558, nan, nan],
 'train_monolingual_auth_data': [0.4718638956546783,
  0.4841794967651367,
  0.1800963580608368,
  0.328977108001709],
 'train_multilingual_auth_data_all': [0.4831692576408386,
  0.5025749206542969,
  0.2617258131504059,
  0.3198156356811523],
 'train_multilingual_auth_data_single': [0.4938519895076751,
  0.516930341720581,
  0.2563737332820892,
  0.3196837306022644],
 'train_multilingual_auth_demetr_data_single': [0.4229797422885895,
  0.4728780686855316,
  0.21236053109169,
  0.270318627

In [30]:
li_max_best

['\\begin{table}',
 '\\centering',
 '\\begin{tabular}{c|c}',
 ' & LANGUAGE_PAIR\\\\',
 '\\hline\nsecond_step_base_auth_data & 0.502 & 0.471 & 0.278 & 0.320 \\\\\nsecond_step_base_demetr_auth_data & 0.487 & 0.524 & 0.271 & 0.251 \\\\\nsecond_step_base_demetr_data & 0.508 & 0.488 & 0.278 & 0.266 \\\\\nsecond_step_base_wmt22_data & nan & 0.478 & nan & nan \\\\\nsecond_step_base_wmt22_small_data & nan & 0.479 & nan & nan \\\\\ntrain_monolingual_auth_data & 0.469 & 0.532 & 0.180 & 0.326 \\\\\ntrain_multilingual_auth_data_all & 0.495 & 0.531 & 0.279 & 0.303 \\\\\ntrain_multilingual_auth_data_single & 0.497 & 0.518 & 0.279 & 0.320 \\\\\ntrain_multilingual_auth_demetr_data_single & 0.424 & 0.519 & 0.227 & 0.261 \\\\\ntrain_multilingual_auth_wmt22_data_single & nan & 0.496 & nan & nan \\\\\n\\hline',
 '\\end{tabular}',
 '\\end{table}']