A notebook that formats evaluation results as a LaTeX table and prints the result.
The results are grouped by result type (e.g., median, max, min, mean, ensemble).

In [40]:
import os
import pandas as pd
from mtqe.utils.paths import EVAL_DIR
from mtqe.utils.tables import create_latex_table

In [41]:
# Configuration for this notebook:
#   DATA_SPLIT - the data split whose results are kept when filtering
#   VALUES     - the metric columns that are pivoted and shown in the table
DATA_SPLIT = 'test'
VALUES = [
    'precision',
    'recall',
]

In [42]:
# Names of all entries in the evaluation directory. os.listdir returns them in
# arbitrary order, so they are sorted to make the traversal order reproducible.
folders = sorted(os.listdir(EVAL_DIR))

In [20]:
# One accumulator list per result type; each collects the dataframes read for that type.
li_max_results, li_min_results, li_med_results, li_mean_results, li_ensemble_results = (
    [], [], [], [], []
)

In [43]:
# Collect results for all experiment groups.
# The accumulator lists are (re)created in this cell so that running it again does not
# append every file a second time and duplicate rows in the concatenated dataframes.
li_ensemble_results = []
li_max_results = []
li_min_results = []
li_med_results = []
li_mean_results = []

# File-name suffix -> list collecting the dataframes of that result type.
# No file name can end with more than one of these suffixes, so lookup order is irrelevant.
results_by_suffix = {
    'ensemble_results.csv': li_ensemble_results,
    'max_results.csv': li_max_results,
    'min_results.csv': li_min_results,
    'median_results.csv': li_med_results,
    'mean_results.csv': li_mean_results,
}

for folder in folders:
    path = os.path.join(EVAL_DIR, folder)
    if not os.path.isdir(path):
        continue
    # Sorted so the row order of the concatenated frames is reproducible.
    for file in sorted(os.listdir(path)):
        for suffix, li_results in results_by_suffix.items():
            if file.endswith(suffix):
                # Only recognised result files are parsed; any other entry in the
                # folder (other file types, nested directories) is skipped rather
                # than being handed to read_csv.
                li_results.append(pd.read_csv(os.path.join(path, file)))
                break

# One dataframe per result type, spanning all experiment groups.
# pd.concat raises ValueError if no file of a given type was found.
df_ensemble = pd.concat(li_ensemble_results)
df_max = pd.concat(li_max_results)
df_min = pd.concat(li_min_results)
df_med = pd.concat(li_med_results)
df_mean = pd.concat(li_mean_results)

In [44]:
def update_exp_group_names(row):
    """Return ``row`` with a trailing en-ja marker removed from its ``exp_group``.

    If ``row['exp_group']`` is a string ending in ``'enja'``, the last five
    characters (the ``'enja'`` suffix plus the single character preceding it)
    are dropped. Any other row is returned unchanged.

    Parameters
    ----------
    row : mapping-like with an ``'exp_group'`` entry and a ``copy()`` method
        (e.g. a ``pandas.Series`` as passed by ``DataFrame.apply(..., axis=1)``,
        or a ``dict``).

    Returns
    -------
    The original ``row`` when nothing needs changing, otherwise a modified copy.
    """
    exp_group = row['exp_group']
    # Non-string values (e.g. NaN for a missing group) cannot be sliced; leave them alone.
    if not isinstance(exp_group, str) or not exp_group.endswith('enja'):
        return row
    # Work on a copy: functions given to DataFrame.apply should not mutate the
    # object they are passed.
    updated = row.copy()
    updated['exp_group'] = exp_group[:-5]
    return updated

In [45]:
# Fold separately-stored en-ja experiment groups into the same group as the other
# language pairs by normalising the group name on every row of each dataframe.
df_max, df_min, df_med, df_mean, df_ensemble = (
    results.apply(update_exp_group_names, axis=1)
    for results in (df_max, df_min, df_med, df_mean, df_ensemble)
)

In [46]:
# Keep only the rows for the chosen data split and the wanted threshold strategy.
# More filtered dataframes can be added here as required.
df_max_best = df_max.loc[
    df_max['threshold_strategy'].eq('best') & df_max['split'].eq(DATA_SPLIT)
]
df_min_default = df_min.loc[
    df_min['threshold_strategy'].eq('default') & df_min['split'].eq(DATA_SPLIT)
]
df_med_default = df_med.loc[
    df_med['threshold_strategy'].eq('default') & df_med['split'].eq(DATA_SPLIT)
]
df_mean_default = df_mean.loc[
    df_mean['threshold_strategy'].eq('default') & df_mean['split'].eq(DATA_SPLIT)
]
df_ensemble_best = df_ensemble.loc[
    df_ensemble['threshold_strategy'].eq('best') & df_ensemble['split'].eq(DATA_SPLIT)
]

In [47]:
# Pivot so that each experiment group is a row and each (metric, language pair)
# combination is a column; the index name is cleared afterwards.
df_max_best = (
    df_max_best
    .pivot_table(index='exp_group', columns='language_pair', values=VALUES)
    .rename_axis(None, axis=0)
)

In [48]:
# Same pivot for the min results: experiment groups as rows, (metric, language pair) as columns.
df_min_default = (
    df_min_default
    .pivot_table(index='exp_group', columns='language_pair', values=VALUES)
    .rename_axis(None, axis=0)
)

In [49]:
# Same pivot for the median results: experiment groups as rows, (metric, language pair) as columns.
df_med_default = (
    df_med_default
    .pivot_table(index='exp_group', columns='language_pair', values=VALUES)
    .rename_axis(None, axis=0)
)

In [50]:
# Same pivot for the mean results: experiment groups as rows, (metric, language pair) as columns.
df_mean_default = (
    df_mean_default
    .pivot_table(index='exp_group', columns='language_pair', values=VALUES)
    .rename_axis(None, axis=0)
)

In [51]:
# Same pivot for the ensemble results: experiment groups as rows, (metric, language pair) as columns.
df_ensemble_best = (
    df_ensemble_best
    .pivot_table(index='exp_group', columns='language_pair', values=VALUES)
    .rename_axis(None, axis=0)
)

In [52]:
# Create the column headers and row content for the LaTeX tables.
# Language pairs shown in the table, in column order.
LANGUAGE_PAIRS = ['en-cs', 'en-de', 'en-ja', 'en-zh']


def make_table_inputs(df_pivot, values, language_pairs):
    """Build the header list and row dictionary passed to ``create_latex_table``.

    Columns are selected by their ``(metric, language_pair)`` label rather than
    by position, so a language pair that is absent from ``df_pivot`` yields NaN
    cells instead of shifting the other columns or raising ``IndexError``.

    Parameters
    ----------
    df_pivot : pandas.DataFrame
        Pivoted results, indexed by experiment group, with the two-level
        ``(metric, language_pair)`` columns that ``pivot_table`` produces when
        ``values`` is a list.
    values : list of str
        Metrics to show for each language pair, in display order.
    language_pairs : list of str
        Language pairs to show, in display order.

    Returns
    -------
    (col_names, content) : (list of str, dict)
        ``col_names`` starts with ``'experiment_group'``; each language pair
        then contributes its name followed by one empty header per extra metric.
        ``content`` maps each experiment group to its row values, grouped by
        language pair and ordered by ``values`` within each pair.

    Raises
    ------
    ValueError
        If ``values`` is empty.
    """
    if not values:
        raise ValueError('values must name at least one metric')
    col_names = ['experiment_group']
    for language_pair in language_pairs:
        col_names += [language_pair] + [''] * (len(values) - 1)
    wanted = pd.MultiIndex.from_tuples(
        [(value, language_pair) for language_pair in language_pairs for value in values]
    )
    # reindex fills any (metric, language pair) column missing from the pivot with NaN.
    ordered = df_pivot.reindex(columns=wanted)
    content = {
        ordered.index[i]: [ordered.iloc[i, j] for j in range(ordered.shape[1])]
        for i in range(len(ordered))
    }
    return col_names, content


col_names, di_med_default = make_table_inputs(df_med_default, VALUES, LANGUAGE_PAIRS)
_, di_mean_default = make_table_inputs(df_mean_default, VALUES, LANGUAGE_PAIRS)
li_med_default = create_latex_table(col_names, di_med_default)
li_mean_default = create_latex_table(col_names, di_mean_default)

In [53]:
# Print whichever data are of interest
# (here: the table built from the median results with the 'default' threshold strategy)
print(li_med_default)

\begin{table}
\centering
\begin{tabular}{c|ccccccccc}
 & EXPERIMENT_GROUP & EN-CS &  & EN-DE &  & EN-JA &  & EN-ZH & \\
\hline
baseline & 0.581 & 0.228 & 0.696 & 0.247 & 0.267 & 0.049 & 0.500 & 0.038 \\
prompt_GEMBA & 0.503 & 0.503 & 0.577 & 0.427 & 0.155 & 0.646 & 0.326 & 0.639 \\
prompt_basic & 0.497 & 0.519 & 0.655 & 0.389 & 0.222 & 0.488 & 0.394 & 0.506 \\
second_step_base_auth_data & 0.590 & 0.587 & 0.744 & 0.465 & 0.264 & 0.171 & 0.397 & 0.437 \\
second_step_base_demetr_auth_data & 0.537 & 0.614 & 0.682 & 0.625 & 0.252 & 0.415 & 0.398 & 0.297 \\
second_step_base_demetr_data & 0.593 & 0.593 & 0.690 & 0.549 & 0.271 & 0.439 & 0.378 & 0.373 \\
second_step_base_wmt22_data & nan & nan & 0.633 & 0.562 & nan & nan & nan & nan \\
second_step_base_wmt22_small_data & nan & nan & 0.739 & 0.472 & nan & nan & nan & nan \\
train_monolingual_auth_data & 0.527 & 0.614 & 0.706 & 0.517 & 0.250 & 0.232 & 0.463 & 0.316 \\
train_monolingual_auth_data_calibrated & 0.682 & 0.307 & 0.677 & 0.517 & 0.167 