In [1]:
import jiwer
import pickle

In [31]:
transforms = jiwer.Compose(
    [
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

In [4]:
with open("datasets/fiv2.pkl", "rb") as handle:
    data = pickle.load(handle)

In [5]:
train_data = data["train"]
with open("transcription_training.pkl", "rb") as handle:
    reference_train = pickle.load(handle, encoding='latin1')

In [45]:
dev_data = data["dev"]
with open("transcription_validation.pkl", "rb") as handle:
    reference_dev = pickle.load(handle, encoding='latin1')

In [46]:
test_data = data["test"]
with open("transcription_test.pkl", "rb") as handle:
    reference_test = pickle.load(handle, encoding='latin1')

In [43]:
train_wer = []

for (word, _, _), _, segment in train_data:
    hypothesis = " ".join([str(w) for w in word])
    assert segment+".mp4" in reference_train
    reference = reference_train[segment+".mp4"]
    if len(reference) == 0:
        continue
    wer = jiwer.wer(
                reference,
                hypothesis,
                truth_transform=transforms,
                hypothesis_transform=transforms,
            )
    train_wer.append(wer)

print(f'[TRAIN] WER: {sum(train_wer) / len(train_wer)}, accuracy: {(1 - sum(train_wer) / len(train_wer))*100}%')

[TRAIN] WER: 0.5468493136236625, accuracy: 45.31506863763375%


In [49]:
dev_wer = []

for (word, _, _), _, segment in dev_data:
    hypothesis = " ".join([str(w) for w in word])
    assert segment+".mp4" in reference_dev
    reference = reference_dev[segment+".mp4"]
    if len(reference) == 0:
        continue
    wer = jiwer.wer(
                reference,
                hypothesis,
                truth_transform=transforms,
                hypothesis_transform=transforms,
            )
    dev_wer.append(wer)

print(f'[DEV] WER: {sum(dev_wer) / len(dev_wer)}, accuracy: {(1 - sum(dev_wer) / len(dev_wer))*100}%')

[DEV] WER: 0.5210720586072152, accuracy: 47.892794139278486%


In [50]:
test_wer = []

for (word, _, _), _, segment in test_data:
    hypothesis = " ".join([str(w) for w in word])
    assert segment+".mp4" in reference_test
    reference = reference_test[segment+".mp4"]
    if len(reference) == 0:
        continue
    wer = jiwer.wer(
                reference,
                hypothesis,
                truth_transform=transforms,
                hypothesis_transform=transforms,
            )
    test_wer.append(wer)

print(f'[TEST] WER: {sum(test_wer) / len(test_wer)}, accuracy: {(1 - sum(test_wer) / len(test_wer))*100}%')

[TEST] WER: 0.5007257921941465, accuracy: 49.92742078058535%
