## Dependencies

In [None]:
import os
from google.colab import drive
# Mount Google Drive under /content/drive (Colab-only API).
drive.mount("/content/drive", force_remount=True)
# Root of the project data on Drive.
ROOT_DIR = "/content/drive/My Drive/data/nlp/"
# Directory with the recorded SemCor sentences (.wav) and the transcription cache.
SEMCOR_SPEECH_PATH = f"{ROOT_DIR}speech2text/SemCor/"

# SEMCOR_PATH is a file-name prefix: ".data.xml" and ".gold.key.txt" are appended
# to it when the corpus is read.
# Use the copy on Drive when it exists; otherwise download and unpack the archive
# and point SEMCOR_PATH at the unpacked (relative) location instead.
if not os.path.isdir(f"{ROOT_DIR}WSD_Training_Corpora"):
    ! wget http://nlp.uniroma1.it/wsdeval/data/WSD_Training_Corpora.zip
    ! unzip WSD_Training_Corpora
    SEMCOR_PATH = "WSD_Training_Corpora/SemCor/semcor"
else:
    SEMCOR_PATH = f"{ROOT_DIR}WSD_Training_Corpora/SemCor/semcor"

Mounted at /content/drive


In [None]:
# Install/upgrade the third-party packages imported in the next cell.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
! pip install --upgrade --quiet transformers torchmetrics

[K     |████████████████████████████████| 4.0 MB 4.2 MB/s 
[K     |████████████████████████████████| 398 kB 80.0 MB/s 
[K     |████████████████████████████████| 77 kB 7.1 MB/s 
[K     |████████████████████████████████| 895 kB 49.9 MB/s 
[K     |████████████████████████████████| 596 kB 77.1 MB/s 
[K     |████████████████████████████████| 6.5 MB 36.8 MB/s 
[?25h

In [None]:
import re
import nltk
import torch
import librosa
import torch
import numpy as np
import pandas as pd
import IPython.display as display
import torchmetrics

from typing import *
from pprint import pprint
from pathlib import Path
from dataclasses import dataclass, field
from nltk.corpus import wordnet as wn
from xml.etree import cElementTree as etree
from tqdm import tqdm
from torchmetrics.functional.text.wer import word_error_rate
from torchmetrics.functional.text.cer import char_error_rate
from transformers import AutoTokenizer, AutoModel, Wav2Vec2ForCTC, HubertForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor

## Utils

In [None]:
# Fetch the WordNet data used by the wn.lemmas / wn.lemma_from_key lookups in Token.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Module-level RoBERTa tokenizer; Token.subwords reads this global to split word forms.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# NOTE(review): `encoder` is not referenced in the cells visible here — confirm it is
# used elsewhere before keeping the 478 MB download.
encoder = AutoModel.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Maps the coarse POS tags read from the corpus to the one-letter POS codes
# that Token.possible_senses passes to wn.lemmas.
wnpos_map = dict(NOUN='n', VERB='v', ADJ='a', ADV='r')

@dataclass
class Token:
    '''A token in a given context.

    `sent` is the very list the token is stored in, so it is excluded from the
    generated __eq__ and __repr__: comparing equal-looking tokens that live in
    two different sentence lists would otherwise recurse
    token -> sentence -> token until RecursionError, and every repr would drag
    the whole sentence along.
    '''

    text: str                    # input token (word form)
    sent: List['Token'] = field(repr=False, compare=False)  # the whole sentence the token is contained in
    token_index: int = field()   # token index within sent
    subword_index_start: int = field()  # index of the token's first subword within the sentence
    lemma: Optional[str] = None
    pos: Optional[str] = None    # coarse POS tag, a key of wnpos_map when mappable
    id_: Optional[str] = None    # instance id; None when the corpus element has no id
    sensekeys: List[str] = field(default_factory=list)  # gold sense keys (empty when untagged)

    def wnpos(self):
        '''Return the WordNet POS code for `pos`, or None when unset or unmapped.'''
        if self.pos is None:
            return None
        return wnpos_map.get(self.pos)

    def possible_senses(self):
        '''Return every WordNet lemma for (lemma, wnpos); requires a non-empty lemma.'''
        assert self.lemma
        return wn.lemmas(self.lemma, self.wnpos())

    def possible_keys(self):
        '''Return the sense keys of all candidate senses.'''
        return [sense.key() for sense in self.possible_senses()]

    def possible_lexnames(self):
        '''Return the lexnames of all candidate senses, de-duplicated in first-seen order.'''
        # dict.fromkeys drops duplicates while keeping insertion order
        return list(dict.fromkeys([sense.synset().lexname() for sense in self.possible_senses()]))

    def gold_senses(self):
        '''Return the WordNet lemmas addressed by the gold sense keys.'''
        return [wn.lemma_from_key(sk) for sk in self.sensekeys]

    def gold_keys(self):
        '''Return the gold sense keys (the stored list itself, not a copy).'''
        return self.sensekeys

    def gold_lexnames(self):
        '''Return the lexname of each gold sense, one entry per sense key.'''
        return [sense.synset().lexname() for sense in self.gold_senses()]

    @property
    def is_tagged(self):
        '''True when the token carries at least one gold sense key.'''
        return bool(self.sensekeys)

    @property
    def subwords(self):
        '''Subwords of the token, computed once with the module-level `tokenizer`.'''
        if not hasattr(self, '_subwords'):
            # RoBERTa tokenizer requires leading space
            self._subwords = tokenizer.tokenize(' ' + self.text.lstrip())
        return self._subwords

    @property
    def subword_index_end(self):
        '''Exclusive end index of the token's subwords within the sentence.'''
        return self.subword_index_start + len(self.subwords)

In [None]:
def read_corpus(xml_path, key_path, max=50) -> List[List[Token]]:
    '''Read sense-annotated sentences from a corpus XML file and its gold-key file.

    Args:
        xml_path: XML file whose <sentence> elements contain <wf> / <instance> children.
        key_path: text file with one "<instance id> <sensekey> [<sensekey> ...]" entry per line.
        max: number of leading <sentence> elements to read. The cut is applied
            BEFORE untagged sentences are dropped, so fewer than `max` sentences
            may be returned.

    Returns:
        One list of Token per sentence that has at least one tagged token.
    '''
    # instance id -> gold sense keys; blank lines and repeated spaces are tolerated
    keys = {}
    for line in Path(key_path).read_text().splitlines():
        fields = line.split()
        if fields:
            keys[fields[0]] = fields[1:]

    sentences = []

    for sent_xml in list(etree.parse(xml_path).iter('sentence'))[:max]:
        sent = []
        subword_index_start = 0
        for token_index, token_xml in enumerate(sent_xml):
            if token_xml.tag == 'instance':
                # <instance>: annotated token, look its id up in the key file
                gold = keys.get(token_xml.attrib['id'], [])
            else:
                # <wf>: plain word form, never annotated
                gold = []
            token = Token(
                token_xml.text,
                sent,
                token_index,
                subword_index_start,
                token_xml.attrib['lemma'],
                token_xml.attrib['pos'],
                token_xml.attrib.get('id'),
                gold
            )
            # the next token starts where this one's subwords end
            subword_index_start = token.subword_index_end
            sent.append(token)
        if any(t.is_tagged for t in sent):
            sentences.append(sent)

    return sentences

## Speech2text models

In [None]:
# Checkpoint locations. The first four point at copies stored on Drive (the
# trailing comment gives the corresponding Hugging Face Hub id); the x-large
# HuBERT entry is a plain id string passed directly to from_pretrained.
BASE_MODEL_PATH = f"{ROOT_DIR}speech2text/models/wav2vec2-base-960h"             # "facebook/wav2vec2-base-960h"
LARGE_MODEL_PATH = f"{ROOT_DIR}speech2text/models/wav2vec2-large-960h"           # "facebook/wav2vec2-large-960h"
XLSR_MODEL_PATH = f"{ROOT_DIR}speech2text/models/wav2vec2-large-xlsr-53-english" # "jonatasgrosman/wav2vec2-large-xlsr-53-english"
HUBERT_MODEL_PATH = f"{ROOT_DIR}speech2text/models/hubert-large-ls960-ft"        # "facebook/hubert-large-ls960-ft"
HUBERT_XLARGE_MODEL_PATH = "facebook/hubert-xlarge-ls960-ft"

# One (processor, model) pair per checkpoint. The `*_tokenizer` names are kept
# because other cells refer to them, but each one holds a Wav2Vec2Processor.
base_tokenizer = Wav2Vec2Processor.from_pretrained(BASE_MODEL_PATH)
base_model = Wav2Vec2ForCTC.from_pretrained(BASE_MODEL_PATH)

large_tokenizer = Wav2Vec2Processor.from_pretrained(LARGE_MODEL_PATH)
large_model = Wav2Vec2ForCTC.from_pretrained(LARGE_MODEL_PATH)

xlsr_tokenizer = Wav2Vec2Processor.from_pretrained(XLSR_MODEL_PATH)
xlsr_model = Wav2Vec2ForCTC.from_pretrained(XLSR_MODEL_PATH)

hubert_tokenizer = Wav2Vec2Processor.from_pretrained(HUBERT_MODEL_PATH)
hubert_model = HubertForCTC.from_pretrained(HUBERT_MODEL_PATH)

# NOTE(review): the x-large pair is loaded but speech2textCMP does not use it.
hubert_xlarge_tokenizer = Wav2Vec2Processor.from_pretrained(HUBERT_XLARGE_MODEL_PATH)
hubert_xlarge_model = HubertForCTC.from_pretrained(HUBERT_XLARGE_MODEL_PATH)

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/138 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

In [None]:
def load_audio(file_path: str, sampling_rate: int = 16_000) -> np.ndarray:
    """Load an audio file with librosa, requesting `sampling_rate` as the sample rate.

    Args:
        file_path: path of the audio file to read.
        sampling_rate: value passed to librosa.load as `sr`.

    Returns:
        The first element of the pair returned by librosa.load (the samples).
    """
    # the second element (the rate) is not needed by any caller
    speech, _ = librosa.load(file_path, sr=sampling_rate)
    return speech

In [None]:
def speech2text(speech: np.ndarray,
                model: "Union[Wav2Vec2ForCTC, HubertForCTC]",
                processor: "Wav2Vec2Processor",
                sampling_rate: int = 16_000) -> str:
    """Transcribe one waveform with a CTC model and return lower-cased text.

    Args:
        speech: audio samples of a single utterance.
        model: callable returning an object with a `.logits` tensor
            (both Wav2Vec2ForCTC and HubertForCTC instances are passed in this notebook).
        processor: turns the samples into `input_values` and decodes ids to text.
        sampling_rate: rate forwarded to the processor.

    Returns:
        The decoded transcription of the first (only) batch item, lower-cased.
    """
    input_values = processor(speech, sampling_rate=sampling_rate, return_tensors="pt").input_values
    # inference only: skip autograd bookkeeping so no graph is kept alive
    with torch.no_grad():
        logits = model(input_values).logits
    # highest-scoring id at every position, decoded by the processor
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()

def speech2textCMP(speech: np.ndarray) -> Dict[str, str]:
    """Transcribe `speech` with each of the four compared models.

    Returns:
        Mapping from model label ("BASE_MODEL", "LARGE_MODEL", "XLSR_MODEL",
        "HUBERT_MODEL", in that order) to the transcription produced by speech2text.
    """
    # label -> (model, processor); insertion order fixes the order of the result
    systems = {
        "BASE_MODEL": (base_model, base_tokenizer),
        "LARGE_MODEL": (large_model, large_tokenizer),
        "XLSR_MODEL": (xlsr_model, xlsr_tokenizer),
        "HUBERT_MODEL": (hubert_model, hubert_tokenizer),
    }
    return {
        label: speech2text(speech, model, processor)
        for label, (model, processor) in systems.items()
    }

## SemCor samples

In [None]:
# @title Util functions

def load_semcor_audio_sample(sample_idx: int, path: Optional[str] = None,
                             postfix: str = "") -> np.ndarray:
    """Load the recording of one SemCor sentence.

    Args:
        sample_idx: 0-based sentence index; files are numbered from 1, so the
            file read is "SemCor#{sample_idx+1}{postfix}.wav".
        path: directory prefix (must end with a path separator, since the file
            name is appended directly). None selects "{SEMCOR_SPEECH_PATH}andrea/".
        postfix: text inserted between the sample number and ".wav".

    Returns:
        The samples returned by load_audio.

    Raises:
        AssertionError: if the expected .wav file does not exist.
    """
    if path is None:
        path = f"{SEMCOR_SPEECH_PATH}andrea/"
    semcor_sample_path = f"{path}SemCor#{sample_idx+1}{postfix}.wav"
    assert os.path.isfile(semcor_sample_path), f"Speech SemCor sample {semcor_sample_path} does not exist"
    return load_audio(semcor_sample_path)

def get_semcor_sample_transcriptions(sample_idx: int,
                                     path: Optional[str] = None,
                                     postfix: str = "") -> Dict:
    """Transcribe one recorded SemCor sentence with every model and score it.

    The reference text is the sentence's tokens joined by spaces, lower-cased,
    with every character that is neither a word character nor whitespace removed.

    Returns:
        Dict mapping each model label to [transcription, WER as float], plus a
        "SemCor" entry holding [reference text, ""].
    """
    reference = " ".join(tok.text for tok in SemCor[sample_idx]).lower()
    reference = re.sub(r'[^\w\s]', '', reference)

    speech = load_semcor_audio_sample(sample_idx, path, postfix=postfix)

    scored = {}
    for label, hypothesis in speech2textCMP(speech).items():
        error_rate = word_error_rate(hypothesis, reference)
        scored[label] = [hypothesis, error_rate.item()]

    # the reference goes last and has no score of its own
    scored["SemCor"] = [reference, ""]
    return scored

def show_semcor_transcriptions_comparison(out: Dict) -> pd.DataFrame:

    pd.set_option('display.max_colwidth', None)

    df = pd.DataFrame.from_dict(out, orient='index')
    df.columns = ["Transcription", "WER"]
    
    return df.style.set_properties(**{'text-align': 'left'})

In [None]:
# Read the first 40 <sentence> elements of SemCor; read_corpus keeps only those
# with at least one tagged token, so len(SemCor) can be below 40.
SemCor = read_corpus(SEMCOR_PATH + ".data.xml",
                     SEMCOR_PATH + ".gold.key.txt",
                     max=40)

In [None]:
import json

# Per-sample transcriptions are cached as JSON next to the recordings so the
# models only have to run once.
# NOTE(review): the cache is reused as-is even if SemCor or the recordings changed;
# delete the file to force a recompute.
TRANSCRIPTIONS_JSON = f"{SEMCOR_SPEECH_PATH}transcriptions.json"

if os.path.isfile(TRANSCRIPTIONS_JSON):
    with open(TRANSCRIPTIONS_JSON) as f:
        outs = json.load(f)
else:
    outs = [get_semcor_sample_transcriptions(i) for i in range(len(SemCor))]
    # serialize before opening the file: opening with "w" truncates it, so a
    # serialization error must not be able to leave an empty, unloadable cache
    serialized_outs = json.dumps(outs)
    with open(TRANSCRIPTIONS_JSON, "w") as f:
        f.write(serialized_outs)

In [None]:
# Regroup the per-sample dicts into one list per label:
# douts[label][i] is the [transcription, WER] pair of sample i.
douts = {
    label: []
    for label in ("HUBERT_MODEL", "XLSR_MODEL", "LARGE_MODEL", "BASE_MODEL", "SemCor")
}
for dic in outs:
    for key, value in dic.items():
        douts[key].append(value)

In [None]:
# average Word Error Rate
# HUBERT_MODEL only: each per-sample WER (x[1]) is divided by the number of
# samples and the quotients are summed, which yields the mean.
sum([x[1] / len(douts["HUBERT_MODEL"]) for x in douts["HUBERT_MODEL"]])

0.11923798834905032

In [None]:
# best model per sample
# Candidates are listed in this fixed order; min() returns the first of several
# equal WERs, so ties are resolved in favour of the earlier label.
model_order = ("HUBERT_MODEL", "XLSR_MODEL", "LARGE_MODEL", "BASE_MODEL")
a = [tuple((label, x[label][1]) for label in model_order) for x in outs]
[min(candidates, key=lambda pair: pair[1]) for candidates in a]

[('HUBERT_MODEL', 0.0625),
 ('XLSR_MODEL', 0.1666666716337204),
 ('XLSR_MODEL', 0.0),
 ('LARGE_MODEL', 0.1666666716337204),
 ('XLSR_MODEL', 0.1111111119389534),
 ('HUBERT_MODEL', 0.0),
 ('LARGE_MODEL', 0.0),
 ('XLSR_MODEL', 0.10000000149011612),
 ('HUBERT_MODEL', 0.0),
 ('HUBERT_MODEL', 0.0625),
 ('XLSR_MODEL', 0.0),
 ('LARGE_MODEL', 0.07407407462596893),
 ('HUBERT_MODEL', 0.1666666716337204),
 ('XLSR_MODEL', 0.1034482792019844),
 ('HUBERT_MODEL', 0.07692307978868484),
 ('HUBERT_MODEL', 0.0),
 ('HUBERT_MODEL', 0.125),
 ('HUBERT_MODEL', 0.0),
 ('HUBERT_MODEL', 0.0),
 ('HUBERT_MODEL', 0.05882352963089943),
 ('HUBERT_MODEL', 0.0),
 ('XLSR_MODEL', 0.07692307978868484),
 ('HUBERT_MODEL', 0.0833333358168602),
 ('HUBERT_MODEL', 0.1111111119389534),
 ('HUBERT_MODEL', 0.43478259444236755),
 ('HUBERT_MODEL', 0.0),
 ('HUBERT_MODEL', 0.06666667014360428),
 ('HUBERT_MODEL', 0.0714285746216774),
 ('XLSR_MODEL', 0.0625),
 ('HUBERT_MODEL', 0.2857142984867096),
 ('HUBERT_MODEL', 0.0833333358168602),
 (

In [None]:
def _misaligned_words(sample: str, trans: str) -> Dict[int, Tuple[str, List[str]]]:
    """Find reference words that the transcript misses under BOTH naive alignments.

    Word k of `sample` is compared with word k of `trans` (left-aligned) and
    with the word at the same distance from the end of `trans` (right-aligned).
    Only positions that mismatch in both passes are reported.

    Returns:
        {reference index: (reference word, [left-aligned word, right-aligned word])},
        where "" stands for "no transcript word at that position".
    """
    split_sample = sample.split()
    split_trans = trans.split()

    # Left-aligned pass. It stops at the first reference word with no
    # counterpart, so later reference words are never recorded.
    wrongs = dict()
    for word_i, sw in enumerate(split_sample):
        if word_i == len(split_trans):
            wrongs[word_i] = (sw, "")
            break

        sw_trans = split_trans[word_i]
        if sw != sw_trans:
            wrongs[word_i] = (sw, sw_trans)

    # Right-aligned pass, walking from the last reference word backwards.
    # Index 0 is recorded without a comparison, so whether it is reported
    # depends on the left-aligned pass alone.
    length_gap = len(split_sample) - len(split_trans)
    wrongs2 = dict()
    for word_i, sw in reversed(list(enumerate(split_sample))):
        if word_i == 0:
            wrongs2[word_i] = (sw, "")
            break

        trans_i = word_i - length_gap
        # A negative index means the transcript has no word there; without this
        # guard Python would silently wrap around to the END of the transcript
        # (or raise IndexError for an empty transcript).
        sw_trans = split_trans[trans_i] if trans_i >= 0 else ""
        if sw != sw_trans:
            wrongs2[word_i] = (sw, sw_trans)

    # keep only the positions flagged by both passes
    return {
        k: (v[0], [v[1], wrongs2[k][1]])
        for k, v in wrongs.items()
        if k in wrongs2
    }


# One dict per sample, comparing the reference text with the HUBERT_MODEL transcript.
wrongs_list = list()
for sample_i, out in enumerate(outs):
    sample = out["SemCor"][0]
    trans = out["HUBERT_MODEL"][0]

    wrongs = _misaligned_words(sample, trans)

    if wrongs: pprint(wrongs)

    wrongs_list.append(wrongs)

    print("")

{15: ('program', ['programme', 'programme'])}

{8: ('program', ['programm', 'giveaway']),
 12: ('that', ['there', 'one']),
 13: ('has', ['as', 'there']),
 18: ('employee', ['emploimoral', 'improved']),
 19: ('morale', ['and', 'emploimoral'])}

{10: ('program', ['programme', 'programme'])}

{6: ('reduced', ['reduce', 'absentism']),
 7: ('absenteeism', ['absentism', 'turn']),
 8: ('turnover', ['turn', 'over']),
 14: ('improved', ['to', 'improve'])}

{2: ('set', ['sad', 'sad']), 7: ('employee', ['emploe', 'emploe'])}


{3: ('or', ['offence', 'offence']), 4: ('fancier', ['ere', 'ere'])}

{9: ('etc', ['et', 'cetera'])}


{4: ('employee', ['employe', 'employe'])}

{11: ('employees', ['employes', 'employes'])}

{9: ('lighting', ['lightning', 'rest']),
 14: ('airconditioning', ['air', 'conditioning'])}

{19: ('per', ['pay', 'cents']),
 20: ('hour', ['our', 'pay']),
 21: ('cost', ['costing', 'our']),
 22: ('in', ['wages', 'costing'])}

{6: ('employee', ['implete', 'benefits']),
 13: ('profitsha

In [None]:
# Sample 0: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[0])
show_semcor_transcriptions_comparison(outs[0])

{15: ('program', ['programme', 'programme'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,how long has it been since you review the objectives of your benefit and savice programme,0.1875
LARGE_MODEL,how long has it been since you reviewed the objectives of your benefit and sevis programm,0.125
XLSR_MODEL,how long has it been since you review the objectives of your benefit and service program,0.0625
HUBERT_MODEL,how long has it been since you reviewed the objectives of your benefit and service programme,0.0625
SemCor,how long has it been since you reviewed the objectives of your benefit and service program,


In [None]:
# Sample 1: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[1])
show_semcor_transcriptions_comparison(outs[1])

{8: ('program', ['programm', 'giveaway']),
 12: ('that', ['there', 'one']),
 13: ('has', ['as', 'there']),
 18: ('employee', ['emploimoral', 'improved']),
 19: ('morale', ['and', 'emploimoral'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,have you permitted it to become a give away programm rather than one that as the go of improving pluymoral and consequently increase productivity,0.375
LARGE_MODEL,have you permitted it to become a giveaway programm rather than one that as the goal of improved impluimoral and consequently increase productivity,0.208333
XLSR_MODEL,evew permitted it to become a giveaway program rather than one that as the goal of improved employee morile and consequently increased productivity,0.166667
HUBERT_MODEL,have you permitted it to become a giveaway programm rather than one there as the goal of improved emploimoral and consequently increased productivity,0.208333
SemCor,have you permitted it to become a giveaway program rather than one that has the goal of improved employee morale and consequently increased productivity,


In [None]:
# Sample 2: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[2])
show_semcor_transcriptions_comparison(outs[2])

{10: ('program', ['programme', 'programme'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,what effort do you make to assess results of your programme,0.090909
LARGE_MODEL,what effort do you make to assess results of your programme,0.090909
XLSR_MODEL,what effort do you make to assess results of your program,0.0
HUBERT_MODEL,what effort do you make to assess results of your programme,0.090909
SemCor,what effort do you make to assess results of your program,


In [None]:
# Sample 3: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[3])
show_semcor_transcriptions_comparison(outs[3])

{6: ('reduced', ['reduce', 'absentism']),
 7: ('absenteeism', ['absentism', 'turn']),
 8: ('turnover', ['turn', 'over']),
 14: ('improved', ['to', 'improve'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,do you measure its relation to reduce absentiism turnover accidents and grievances and to improve quality and output,0.166667
LARGE_MODEL,do you measure its relation to reduce abzentism turnover accidents and grievances and to improve quality and output,0.166667
XLSR_MODEL,do you measure its relation to reduce absenteism tarnover accidents angrievences and to improve quality and output,0.333333
HUBERT_MODEL,do you measure its relation to reduce absentism turn over accidents and grievances and to improve quality and output,0.277778
SemCor,do you measure its relation to reduced absenteeism turnover accidents and grievances and to improved quality and output,


In [None]:
# Sample 4: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[4])
show_semcor_transcriptions_comparison(outs[4])

{2: ('set', ['sad', 'sad']), 7: ('employee', ['emploe', 'emploe'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,have you said specific objectives for your implite obligation,0.333333
LARGE_MODEL,have you sad specific objectives for your employed publication,0.222222
XLSR_MODEL,have you sad specific objectives for your employee publication,0.111111
HUBERT_MODEL,have you sad specific objectives for your emploe publication,0.222222
SemCor,have you set specific objectives for your employee publication,


In [None]:
# Sample 5: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[5])
show_semcor_transcriptions_comparison(outs[5])

{}


Unnamed: 0,Transcription,WER
BASE_MODEL,is it reaching these goals,0.0
LARGE_MODEL,is it reaching these goals,0.0
XLSR_MODEL,is it reaching these goals,0.0
HUBERT_MODEL,is it reaching these goals,0.0
SemCor,is it reaching these goals,


In [None]:
# Sample 6: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[6])
show_semcor_transcriptions_comparison(outs[6])

{3: ('or', ['offence', 'offence']), 4: ('fancier', ['ere', 'ere'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,is it larger offencier than you really need,0.222222
LARGE_MODEL,is it larger or fancier than you really need,0.0
XLSR_MODEL,is it larger ofencier than you really need,0.222222
HUBERT_MODEL,is it larger offence ere than you really need,0.222222
SemCor,is it larger or fancier than you really need,


In [None]:
# Sample 7: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[7])
show_semcor_transcriptions_comparison(outs[7])

{9: ('etc', ['et', 'cetera'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,are you using the most economical printing methods paper et cetera,0.2
LARGE_MODEL,are you using the most economical printing methods paper et cetera,0.2
XLSR_MODEL,are you using the most economical printing methods paper exce,0.1
HUBERT_MODEL,are you using the most economical printing methods paper et cetera,0.2
SemCor,are you using the most economical printing methods paper etc,


In [None]:
# Sample 8: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[8])
show_semcor_transcriptions_comparison(outs[8])

{}


Unnamed: 0,Transcription,WER
BASE_MODEL,are there ar there are cheaper communications thechniques that could be substituted,0.4
LARGE_MODEL,are there other cheaper communications techniques that could be substituted,0.0
XLSR_MODEL,are there other cheaper communications techniques that could be substituted,0.0
HUBERT_MODEL,are there other cheaper communications techniques that could be substituted,0.0
SemCor,are there other cheaper communications techniques that could be substituted,


In [None]:
# Sample 9: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[9])
show_semcor_transcriptions_comparison(outs[9])

{4: ('employee', ['employe', 'employe'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,has your attitude toward in ple benefits in corage and excess of free government work in your plant,0.3125
LARGE_MODEL,has your attitude toward him thlee benefits encouraged an excess of free government work in your plant,0.125
XLSR_MODEL,has your attitude toward employee benefits encouraged an excess of free government war in your plant,0.0625
HUBERT_MODEL,has your attitude toward employe benefits encouraged an excess of free government work in your plant,0.0625
SemCor,has your attitude toward employee benefits encouraged an excess of free government work in your plant,


In [None]:
# Sample 10: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[10])
show_semcor_transcriptions_comparison(outs[10])

{11: ('employees', ['employes', 'employes'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,easier purchasing agent offering too much frebuying service for the please,0.5
LARGE_MODEL,easier purchasing agent offering too much free buying service for him please,0.333333
XLSR_MODEL,is your purchasing agent offering too much free buying service for employees,0.0
HUBERT_MODEL,is your purchasing agent offering too much free buying service for employes,0.083333
SemCor,is your purchasing agent offering too much free buying service for employees,


In [None]:
# Sample 11: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[11])
show_semcor_transcriptions_comparison(outs[11])

{9: ('lighting', ['lightning', 'rest']),
 14: ('airconditioning', ['air', 'conditioning'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,when improvements are recommended in working conditions such as lightning rest rooms eating facilities eir conditioning do you try to set a measure of their effactiveness on productivity,0.148148
LARGE_MODEL,when improvements are recommended in working conditions such as lighting rest rooms eating facilities air conditioning do you try to set a measure of their effectiveness on productivity,0.074074
XLSR_MODEL,when improvements are recommended in working conditions such as lighting restrooms eating facilities air-conditioning do you try to set a measure of their effectiveness on productivity,0.111111
HUBERT_MODEL,when improvements are recommended in working conditions such as lightning rest rooms eating facilities air conditioning do you try to set a measure of their effectiveness on productivity,0.111111
SemCor,when improvements are recommended in working conditions such as lighting rest rooms eating facilities airconditioning do you try to set a measure of their effectiveness on productivity,


In [None]:
# Sample 12: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[12])
show_semcor_transcriptions_comparison(outs[12])

Unnamed: 0,Transcription,WER
BASE_MODEL,when negotiating with your union do you make sure imploase a vachalways between new benefits and their sense petour costing wages,0.333333
LARGE_MODEL,when negotiating with your union do you make sure emplois evathois between new benefits and their scense bet our costing wages,0.375
XLSR_MODEL,when negotiating with your union do you make sure employees ava choise between new benefits and their sanspit-hour costing wages,0.333333
HUBERT_MODEL,when negotiating with your union do you make sure employees have a choice between new benefits and their cents pay our costing wages,0.166667
SemCor,when negotiating with your union do you make sure employees have a choice between new benefits and their cents per hour cost in wages,


In [None]:
# Sample 12 again, from the "-slow" recording in the mothertongue/ directory
# (file "SemCor#13-slow.wav"). The assignment must run in this cell: with it
# commented out, `slow` is undefined after Restart & Run All.
slow = get_semcor_sample_transcriptions(sample_idx=12, path=f"{SEMCOR_SPEECH_PATH}mothertongue/", postfix="-slow")
show_semcor_transcriptions_comparison(slow)

Unnamed: 0,Transcription,WER
BASE_MODEL,when negotiating with your union do you make sure employs have a choice between new benefits and their cense for our costine wages,0.25
LARGE_MODEL,when negotiating with your union do you make sure employers have a choice between new benefits and their scents por our costline wages,0.25
XLSR_MODEL,when negotiating with your union do you make sure employees have a choice between new benefits ome their cense-parou costing wages,0.25
HUBERT_MODEL,when negotiating with your union do you make sure employes have a choice between new benefits and their cents per hour cost in wages,0.041667
SemCor,when negotiating with your union do you make sure employees have a choice between new benefits and their cents per hour cost in wages,


In [None]:
# Sample 12 again, from the "-fast" recording in the mothertongue/ directory
# (file "SemCor#13-fast.wav"). The assignment must run in this cell: with it
# commented out, `fast` is undefined after Restart & Run All.
fast = get_semcor_sample_transcriptions(sample_idx=12, path=f"{SEMCOR_SPEECH_PATH}mothertongue/", postfix="-fast")
show_semcor_transcriptions_comparison(fast)

Unnamed: 0,Transcription,WER
BASE_MODEL,when negotiating with your union do you make sure employes have the choice between new benefits and their cents per hour cost in wages,0.083333
LARGE_MODEL,when negotiating with your union do you make sure employes have a choice between new benefits and their scense paraur cost in wages,0.166667
XLSR_MODEL,when negotiating with your uniondo you make sure employees have a choice between new benefits and their sense-peraur cost in wages,0.208333
HUBERT_MODEL,when negotiating with your union do you make sure employees have a choice between new benefits and their cents per hour cost in wages,0.0
SemCor,when negotiating with your union do you make sure employees have a choice between new benefits and their cents per hour cost in wages,


In [None]:
# Sample 13: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[13])
show_semcor_transcriptions_comparison(outs[13])

In [None]:
# Sample 14: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[14])
show_semcor_transcriptions_comparison(outs[14])

{2: ('employees', ['employies', 'employies'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,do your in please understand all the benefits to which your insurance entitles them,0.153846
LARGE_MODEL,do your emplis understand all the benefits to which your insurance entitles them,0.076923
XLSR_MODEL,do your employeese understandl the benefits to which your insurance entitles them,0.230769
HUBERT_MODEL,do your employies understand all the benefits to which your insurance entitles them,0.076923
SemCor,do your employees understand all the benefits to which your insurance entitles them,


In [None]:
# Sample 15: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[15])
show_semcor_transcriptions_comparison(outs[15])

{}


Unnamed: 0,Transcription,WER
BASE_MODEL,are they incourage to take full legal advantage of these benefits,0.090909
LARGE_MODEL,are they encouraged to take full legal advantage of these benefits,0.0
XLSR_MODEL,are they encouraged to take full legal advantage of these benefits,0.0
HUBERT_MODEL,are they encouraged to take full legal advantage of these benefits,0.0
SemCor,are they encouraged to take full legal advantage of these benefits,


In [None]:
# Sample 16: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[16])
show_semcor_transcriptions_comparison(outs[16])

{10: ('company', ["company's", 'the']), 11: ('s', ['share', "company's"])}


Unnamed: 0,Transcription,WER
BASE_MODEL,ave youco besides the sensperour value of the company's share of insuranspeniums,0.625
LARGE_MODEL,evieupublicize the sensperaor value of the company's share of insurance premins,0.5625
XLSR_MODEL,have you publicize the sensper-hour value of the company's share of insurance premiums,0.375
HUBERT_MODEL,have you publicized the cents per hour value of the company's share of insurance premiums,0.125
SemCor,have you publicized the cents per hour value of the company s share of insurance premiums,


In [None]:
# Sample 17: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[17])
show_semcor_transcriptions_comparison(outs[17])

{}


Unnamed: 0,Transcription,WER
BASE_MODEL,when did you lust compare your present preaming costs with the costs of insurence from other sources,0.176471
LARGE_MODEL,when did you lust compare your present premien costs with the costs of insurance from other sources,0.117647
XLSR_MODEL,when did you last compare your present premium costs with the costs of insurance from other sources,0.0
HUBERT_MODEL,when did you last compare your present premium costs with the costs of insurance from other sources,0.0
SemCor,when did you last compare your present premium costs with the costs of insurance from other sources,


In [None]:
# Sample 18: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[18])
show_semcor_transcriptions_comparison(outs[18])

{}


Unnamed: 0,Transcription,WER
BASE_MODEL,can your insurance company aid you in reducing administrative costs,0.0
LARGE_MODEL,can your insurance company aid you in reducing administrative costs,0.0
XLSR_MODEL,can your insurance company aid you in reducing administrative costs,0.0
HUBERT_MODEL,can your insurance company aid you in reducing administrative costs,0.0
SemCor,can your insurance company aid you in reducing administrative costs,


In [None]:
# Sample 19: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[19])
show_semcor_transcriptions_comparison(outs[19])

{8: ('employee', ['imply', 'imply'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,do you try to maintain the principle of implecontributed as opposto fully company paid programms,0.294118
LARGE_MODEL,do you try to maintain the principle of imply contributed as opposed to fully companypaid programms,0.235294
XLSR_MODEL,do you try to maintain the principle of employe contributed as opposed to fully company-paid programs,0.176471
HUBERT_MODEL,do you try to maintain the principle of imply contributed as opposed to fully company paid programs,0.058824
SemCor,do you try to maintain the principle of employee contributed as opposed to fully company paid programs,


In [None]:
# Sample 20: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[20])
show_semcor_transcriptions_comparison(outs[20])

{}


Unnamed: 0,Transcription,WER
BASE_MODEL,do you protect your holiday privileges with an attendanc' requirement both before and after the holiday,0.0625
LARGE_MODEL,do you protect your holiday privileges with an attendance requirement both before and after the holiday,0.0
XLSR_MODEL,do you protect your holiday privileges with an attendance requirement both before and after the holiday,0.0
HUBERT_MODEL,do you protect your holiday privileges with an attendance requirement both before and after the holiday,0.0
SemCor,do you protect your holiday privileges with an attendance requirement both before and after the holiday,


In [None]:
# Sample 21: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[21])
show_semcor_transcriptions_comparison(outs[21])

{4: ('limit', ['lein', 'its'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,do you plan to lean its additional olidays to area and orindustrial patterns,0.384615
LARGE_MODEL,do you plan to lenets additional holidays to area and or industrial patterns,0.076923
XLSR_MODEL,do you plan to limit aditional holidays to area and or industrial patterns,0.076923
HUBERT_MODEL,do you plan to lein its additional holidays to area and or industrial patterns,0.153846
SemCor,do you plan to limit additional holidays to area and or industrial patterns,


In [None]:
# Sample 22: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[22])
show_semcor_transcriptions_comparison(outs[22])

{22: ('work', ['workweek', 'uninterrupted']), 23: ('week', ['', 'workweek'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,have you investigated the possibility of moving midwak hollidays forward to monday or back to friday in order to have an uninterrupted workwik,0.166667
LARGE_MODEL,have you investigated the possibility of moving midwick holidays forward to monday or back to friday in order to avan uninterrupted workwick,0.208333
XLSR_MODEL,have you investigated the possibility of moving midwak holidays forward to monday or back to friday in order to have an uninterrupted workweek,0.125
HUBERT_MODEL,have you investigated the possibility of moving midweek holidays forward to monday or back to friday in order to have an uninterrupted workweek,0.083333
SemCor,have you investigated the possibility of moving midweek holidays forward to monday or back to friday in order to have an uninterrupted work week,


In [None]:
# Sample 23: words the HUBERT_MODEL transcript misses under both alignments,
# then the per-model transcription / WER table.
pprint(wrongs_list[23])
show_semcor_transcriptions_comparison(outs[23])

{3: ('policing', ['polising', 'polising']), 4: ('washup', ['washp', 'washp'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,are you carefully polishing woodshop time and resperious to be certain that all other time is prolactive,0.277778
LARGE_MODEL,are you carefully polishing worshop time and rasperius to be certain that all other time is productive,0.222222
XLSR_MODEL,are you carefully polishing wushup time and rest periods to be certain that all other time is productive,0.111111
HUBERT_MODEL,are you carefully polising washp time and rest periods to be certain that all other time is productive,0.111111
SemCor,are you carefully policing washup time and rest periods to be certain that all other time is productive,


In [None]:
# Inspect sample 24: pretty-print its entry in wrongs_list, then display the
# transcription comparison for outs[24].
# The output below shows the SemCor text using digits ("24", "7") while the
# models spell the numbers out, and "overtime" vs. "over time"; these tokens
# appear among the mismatches listed in wrongs_list[24].
# NOTE(review): presumably number normalisation would lower the reported WER
# for this sample — verify before drawing conclusions from it.
pprint(wrongs_list[24])
show_semcor_transcriptions_comparison(outs[24])

{3: ('work', ['worse', 'caduales']),
 4: ('schedules', ['caduales', 'for']),
 6: ('boiler', ['boleberators', 'guard']),
 7: ('operators', ['guard', 'and']),
 8: ('guards', ['and', 'other']),
 9: ('and', ['other', 'twenty']),
 10: ('other', ['twenty', 'four']),
 11: ('24', ['four', 'hour']),
 14: ('7', ['seven', 'day']),
 22: ('overtime', ['over', 'time'])}


Unnamed: 0,Transcription,WER
BASE_MODEL,are you watching worse caduls for bottelbrators guards another or twenty four our day seven they week overations in order to minimasere time,0.608696
LARGE_MODEL,are you watching worse caduals for bolerberators guards another or twenty four hour day seven day week operations in order to manymiles over time,0.521739
XLSR_MODEL,are you watching word schedules for bull operators guards and other twenty-for-hour-day seven-dayweek oporations in order to minimize over time,0.478261
HUBERT_MODEL,are you watching worse caduales for boleberators guard and other twenty four hour day seven day week operations in order to minimize over time,0.434783
SemCor,are you watching work schedules for boiler operators guards and other 24 hour day 7 day week operations in order to minimize overtime,


In [None]:
# Display the transcription comparison for sample 25. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
# NOTE(review): unlike the cells for samples 22-24, wrongs_list[25] is not
# printed here — confirm that is intentional.
show_semcor_transcriptions_comparison(outs[25])

Unnamed: 0,Transcription,WER
BASE_MODEL,are you careful to restrict the number of people on live at one time so that your total employment obligation is menimised,0.090909
LARGE_MODEL,are you careful to restrict the number of people on leve at one time so that your total employment obligation is minimized,0.045455
XLSR_MODEL,are you careful to restrict the number of people on leave at one time so that your total employment obligation is minimized,0.0
HUBERT_MODEL,are you careful to restrict the number of people on leave at one time so that your total employment obligation is minimized,0.0
SemCor,are you careful to restrict the number of people on leave at one time so that your total employment obligation is minimized,


In [None]:
# Display the transcription comparison for sample 26. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[26])

Unnamed: 0,Transcription,WER
BASE_MODEL,have you considered use inventing equipment to replays or reduse the number of cafeteri emploies,0.4
LARGE_MODEL,have you considered yous invending equipment to replace or reduce the number of caffetry employes,0.266667
XLSR_MODEL,have you considered using vending equipment to replace or reduce the number of cafetery employees,0.066667
HUBERT_MODEL,have you considered using vending equipment to replace or reduce the number of cafetri employees,0.066667
SemCor,have you considered using vending equipment to replace or reduce the number of cafeteria employees,


In [None]:
# Display the transcription comparison for sample 27. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[27])

Unnamed: 0,Transcription,WER
BASE_MODEL,what are the possibilities for operating your caphateria for a single shift only and relying upon bending machines or prepacat sandwiches for the second and third shift operations,0.107143
LARGE_MODEL,what are the possibilities for operating your caphateria for a single shift only and relying upon bending machines or prepacked sandwiches for the second and third shift operations,0.107143
XLSR_MODEL,what are the possibilities for operating your ca feteria for a single shift only and relying upon vending machines or prepeckage sandwiches for the second and third shift operations,0.107143
HUBERT_MODEL,what are the possibilities for operating your cafateria for a single sheift only and relying upon vending machines or prepackaged sandwiches for the second and third shift operations,0.071429
SemCor,what are the possibilities for operating your cafeteria for a single shift only and relying upon vending machines or prepackaged sandwiches for the second and third shift operations,


In [None]:
# Display the transcription comparison for sample 28. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[28])

Unnamed: 0,Transcription,WER
BASE_MODEL,av you checked the cost of sup contracting your confetri operation in order to save administrative costs,0.25
LARGE_MODEL,have you checked the cost of subcontracting yourcafetoryoperation in order to save administrative costs,0.1875
XLSR_MODEL,have you checked the cost of subcontracting your cafetery operation in order to save administrative costs,0.0625
HUBERT_MODEL,have you checked the cost of subcontracting your cafet ri operation in order to save administrative costs,0.125
SemCor,have you checked the cost of subcontracting your cafeteria operation in order to save administrative costs,


In [None]:
# Display the transcription comparison for sample 29. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
# The SemCor row writes "parttime" as one token, whereas two of the models
# produce it as two words ("part time").
show_semcor_transcriptions_comparison(outs[29])

Unnamed: 0,Transcription,WER
BASE_MODEL,are there possibilities of having caffetriale poor par time on custodial or other jobs,0.285714
LARGE_MODEL,are there possibilities of having caphetrial port parteiman custodial or other jobs,0.357143
XLSR_MODEL,are there possibilities of having cafetria at por part-time an custodial or other jobs,0.357143
HUBERT_MODEL,are there possibilities of having cafetrial pork part time on custodial or other jobs,0.285714
SemCor,are there possibilities of having cafeteria help work parttime on custodial or other jobs,


In [None]:
# Display the transcription comparison for sample 30. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[30])

Unnamed: 0,Transcription,WER
BASE_MODEL,can staggerd lunch periods relieve the capasti train on your feeding facilities,0.25
LARGE_MODEL,can staggard lunch periods relieve the capacity strain on your feeding facilities,0.083333
XLSR_MODEL,can staggerd lunch periods relieve the capacity strain on your feeding facilities,0.083333
HUBERT_MODEL,can staggared lunch periods relieve the capacity strain on your feeding facilities,0.083333
SemCor,can staggered lunch periods relieve the capacity strain on your feeding facilities,


In [None]:
# Display the transcription comparison for sample 31. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[31])

Unnamed: 0,Transcription,WER
BASE_MODEL,would it be feasible to lemin the manner in order to read yeur's fitting costs,0.357143
LARGE_MODEL,would it be feasiable to limit the manner in order to reduce feeding costs,0.142857
XLSR_MODEL,would it be feasible to limit the monu in order to reduce feeding costs,0.071429
HUBERT_MODEL,would it be feasible to limit the manue in order to reduce feeding costs,0.071429
SemCor,would it be feasible to limit the menu in order to reduce feeding costs,


In [None]:
# Display the transcription comparison for sample 32. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[32])

Unnamed: 0,Transcription,WER
BASE_MODEL,have you considered gradwel with drawal of subsidence to urinplan finingoberation,0.692308
LARGE_MODEL,have you considered gradual witdrawal of subsidies to your implaned fitting operation,0.307692
XLSR_MODEL,have you considered gradual withdrawal of subsidies to yourinplan feeding operation,0.230769
HUBERT_MODEL,have you considered gradwell withdrawal of subsidies to your in plant feeding operation,0.076923
SemCor,have you considered gradual withdrawal of subsidies to your in plant feeding operation,


In [None]:
# Display the transcription comparison for sample 33. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[33])

Unnamed: 0,Transcription,WER
BASE_MODEL,are you utilizing cafaterias pace for company meetings or discussions,0.2
LARGE_MODEL,are you utilizing capateria space for company meetings or discussions,0.1
XLSR_MODEL,are you utilizing cafeteria s pace for company meetings or discussions,0.2
HUBERT_MODEL,are you utilizing cafeteria space for company meetings or discussions,0.0
SemCor,are you utilizing cafeteria space for company meetings or discussions,


In [None]:
# Display the transcription comparison for sample 34. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[34])

Unnamed: 0,Transcription,WER
BASE_MODEL,are your expenses in this area commantorary with the number of emploies who benefit from your programme,0.176471
LARGE_MODEL,are your expenses in this area commenturate with the number of emploies who benefit from your programme,0.176471
XLSR_MODEL,are your expenses in this area commentiary with the number of employees who benefit from your program,0.058824
HUBERT_MODEL,are your expenses in this area commercuary with the number of employees who benefit from your programme,0.117647
SemCor,are your expenses in this area commensurate with the number of employees who benefit from your program,


In [None]:
# Display the transcription comparison for sample 35. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[35])

Unnamed: 0,Transcription,WER
BASE_MODEL,have you aldited your program recently to wet out those phases that drove lis participation,0.266667
LARGE_MODEL,have you aldited your programm recently to weed out those phases that draw lease participation,0.2
XLSR_MODEL,have you audited your program recently to without those phases that drove leas participation,0.266667
HUBERT_MODEL,have you algited your program recently to wed out those phases that draw lees participation,0.2
SemCor,have you audited your program recently to weed out those phases that draw least participation,


In [None]:
# Display the transcription comparison for sample 36. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[36])

Unnamed: 0,Transcription,WER
BASE_MODEL,do employs contribute their share of money to recreational facilities,0.1
LARGE_MODEL,do employs contribute their share of money to recretional facilities,0.2
XLSR_MODEL,do employees contribute their share of money to recreational facilities,0.0
HUBERT_MODEL,do employees contribute their share of money to recreational facilities,0.0
SemCor,do employees contribute their share of money to recreational facilities,


In [None]:
# Display the transcription comparison for sample 37. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
# The SemCor row renders the possessive as a separate "s" token ("plant s"),
# so a model output such as "plan's" cannot match it token-for-token.
# NOTE(review): presumably an artefact of how the SemCor text is normalised —
# confirm against the preprocessing code.
show_semcor_transcriptions_comparison(outs[37])

Unnamed: 0,Transcription,WER
BASE_MODEL,have you considered delegating operation or responsibility to your employ association and carefully restricting your plans financial contribution,0.277778
LARGE_MODEL,have you considered delegating operation a responsibility to your employed association and carefully restricting your plans financial contribution,0.277778
XLSR_MODEL,have you considered delegating operational responsibility to your employe association and carefully restricting your plan's financial contribution,0.166667
HUBERT_MODEL,have you considered delegating operational responsibility to your employ association and carefully restricting your plan's financial contribution,0.166667
SemCor,have you considered delegating operational responsibility to your employee association and carefully restricting your plant s financial contribution,


In [None]:
# Display the transcription comparison for sample 38. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
# The SemCor row renders the possessive as a separate "s" token
# ("employee s"), which none of the model transcriptions reproduce.
show_semcor_transcriptions_comparison(outs[38])

Unnamed: 0,Transcription,WER
BASE_MODEL,could and employse gardenclot take over partial care of plant grounds,0.384615
LARGE_MODEL,could and employs garden club take over partial care of plant grounds,0.230769
XLSR_MODEL,coudan employees garden club take over partial care of plant grounds,0.307692
HUBERT_MODEL,could and employees gard and club take over partial care of plant grounds,0.307692
SemCor,could an employee s garden club take over partial care of plant grounds,


In [None]:
# Display the transcription comparison for sample 39. Per the output below,
# this lists each model's transcription and WER alongside the SemCor text.
show_semcor_transcriptions_comparison(outs[39])

Unnamed: 0,Transcription,WER
BASE_MODEL,vuda canra club be useful in taking pictures pertenin to a plan safety,0.461538
LARGE_MODEL,vouda camera clab be useful in taking pictures pertenan to plan safety,0.384615
XLSR_MODEL,would the camera club be useful in taking pictures perteinant to plant safety,0.153846
HUBERT_MODEL,vouda camera club be useful in taking pictures pertinant to plan safety,0.307692
SemCor,would a camera club be useful in taking pictures pertinent to plant safety,
