In [1]:
import os
import json
import numpy as np
from omegaconf import OmegaConf

In [2]:
exp_dir = "../../../logs/de/asr/evaluate"
folders = {
    "Fine-tuned": "version_4",
    "No AC": "version_48",
    "AL": "version_43",
    "One-way AL": "version_44",
}

In [3]:
# define which accent is standard, which are seen and which are unseen
classes = {
    "standard": ["de"],
    "seen": ["ch", "at"],
    "unseen": ["gb", "it", "de_al", "fr", "de_ni", "us", "ca", "ru"],
}

In [4]:
# load WERs of each file, for each accent
wers = dict()
for exp, folder in folders.items():
    wers[exp] = {k[5:-4]: v for k, v in json.load(open(f"{exp_dir}/{folder}/avg_wers.json")).items()}

In [5]:
# transform dictionary into a numpy array, where one axis represents the accents and another the experiments
experiments = list(wers.keys())
accents = list(wers[experiments[0]].keys())
n_words = np.array([wers[experiments[0]][acc]["n_words"] for acc in accents])
avg_wers = np.array([[wers[exp][acc]["avg_wer"] for exp in experiments] for acc in accents])

In [6]:
# compute means for each class, for each experiment
class_indices = {k: [accents.index(acc) for acc in v] for k, v in classes.items()}
class_means = {k: np.mean(avg_wers[indices], axis=0) for k, indices in class_indices.items()}

In [7]:
# print the avg. WERs and the means (overall and per class) as a markdown table
headers = ["Accent / Dialect"] + experiments
print((" | ").join(headers))
print(f"|{'---:|'*len(headers)}")

# print avg. WERs
for i in range(len(accents)):
    row = [accents[i]] + [f"{avg_wers[i,j]*100:.2f}" for j in range(len(experiments))]
    print((" | ").join(row))

# print overall means
row = ["mean"] + [f"{np.mean(avg_wers[:,i])*100:.2f}" for i in range(len(experiments))]
print((" | ").join(row))

# print class means
for key, value in class_means.items():
    row = [f"{key} mean"] + [f"{value[i]*100:.2f}" for i in range(len(experiments))]
    print((" | ").join(row))

# print worst avg. WER of each experiment
row = ["worst"] + [f"{v*100:.2f}" for v in np.max(avg_wers, axis=0)]
print((" | ").join(row))

# print the avg. of the three worst avg. WERs of each experiment
row = ["3-worst mean."] + [f"{np.mean(np.sort(avg_wers, axis=0)[-3:,i])*100:.2f}" for i in range(len(experiments))]
print((" | ").join(row))


Accent / Dialect | Fine-tuned | No AC | AL | One-way AL
|---:|---:|---:|---:|---:|
at | 1.80 | 1.53 | 1.63 | 1.61
gb | 4.11 | 3.71 | 4.19 | 3.71
it | 1.40 | 1.24 | 1.31 | 1.35
de_al | 1.16 | 1.13 | 1.19 | 1.20
fr | 2.68 | 1.92 | 2.00 | 2.04
de_ni | 4.31 | 4.20 | 4.11 | 4.17
ch | 3.11 | 2.67 | 2.70 | 2.63
de | 1.92 | 1.60 | 1.65 | 1.65
us | 4.69 | 4.60 | 4.38 | 4.47
ca | 3.70 | 3.27 | 2.76 | 2.93
ru | 2.97 | 2.29 | 2.13 | 2.12
mean | 2.89 | 2.56 | 2.55 | 2.54
standard mean | 1.92 | 1.60 | 1.65 | 1.65
seen mean | 2.45 | 2.10 | 2.16 | 2.12
unseen mean | 3.13 | 2.79 | 2.76 | 2.75
worst | 4.69 | 4.60 | 4.38 | 4.47
3-worst mean. | 4.37 | 4.17 | 4.23 | 4.12


Accent / Dialect | Fine-tuned | No AC | AL | One-way AL
|---:|---:|---:|---:|---:|
at | 1.80 | 1.53 | 1.63 | 1.61
gb | 4.11 | 3.71 | 4.19 | 3.71
it | 1.40 | 1.24 | 1.31 | 1.35
de_al | 1.16 | 1.13 | 1.19 | 1.20
fr | 2.68 | 1.92 | 2.00 | 2.04
de_ni | 4.31 | 4.20 | 4.11 | 4.17
ch | 3.11 | 2.67 | 2.70 | 2.63
de | 1.92 | 1.60 | 1.65 | 1.65
us | 4.69 | 4.60 | 4.38 | 4.47
ca | 3.70 | 3.27 | 2.76 | 2.93
ru | 2.97 | 2.29 | 2.13 | 2.12
mean | 2.89 | 2.56 | 2.55 | 2.54
standard mean | 1.92 | 1.60 | 1.65 | 1.65
seen mean | 2.45 | 2.10 | 2.16 | 2.12
unseen mean | 3.13 | 2.79 | 2.76 | 2.75
worst | 4.69 | 4.60 | 4.38 | 4.47
3-worst mean. | 4.37 | 4.17 | 4.23 | 4.12

In [8]:
# print the experiment folders of each experiment (both train and eval folders)
print("Experiment folders:\n")
for exp, folder in folders.items():
    eval_folder = os.path.join(exp_dir, folder)
    eval_config = OmegaConf.load(os.path.join(eval_folder, "hparams.yaml"))
    train_folder = f'../{eval_config.asr.ckpt.replace("/checkpoints/last.ckpt", "")}'
    print(f"- {exp}: training `{train_folder[3:]}`, evaluation `{eval_folder[3:]}`")

Experiment folders:

- Fine-tuned: training `logs/asr/train/version_12`, evaluation `../../logs/de/asr/evaluate/version_4`


ConfigAttributeError: Missing key asr
    full_key: asr
    object_type=dict

### Conclusions:

1. In comparison with AL, OneWay yields \
    a. Same WER for standard accent \
    b. Better WER for the other seen accents \
    c. Better WER for GB \
    d. Worse WER for US, CA and DE-NI
