
<span style="color:red">**THIS NOTEBOOK WORKS WITH MacOS 13.5.2 (22G91) ON THE Intel CHIP AND Python 3.7.17. OTHER OPERATIONAL SYSTEMS AND Python VERSIONS ARE NOT GUARANTEED TO WORK.**</span>.

Let's start with installing and importing required libraries.

In [None]:
!pip install deepspeech pocketsphinx vosk librosa jiwer noisereduce==2.0.1 torch scipy pandas speechbrain transformers

In [14]:
import os
import json
import time
import librosa
import numpy as np
from deepspeech import Model as DeepspeechModel
from pocketsphinx import Decoder
from vosk import Model as VoskModel, KaldiRecognizer
from speechbrain.pretrained import EncoderASR
import noisereduce as nr
import re
import unicodedata
from jiwer import wer as calculate_wer
import pandas as pd
import torch

Define transcriptions for comparing with recognized text.

In [17]:
transcriptions = {
    # EN
    'checkin.wav': 'Where is the check-in desk?',
    'parents.wav': 'I have lost my parents.',
    'suitcase.wav': 'Please, I have lost my suitcase.',
    'what_time.wav': 'What time is my plane?',
    'where.wav': 'Where are the restaurants and shops?',
    'your_sentence1.wav': 'Hello Everyone, this is the sentence number one',
    'your_sentence2.wav': 'How is it going?',
    # IT
    'checkin_it.wav': 'Dove e\' il bancone?',
    'parents_it.wav': 'Ho perso i miei genitori.',
    'suitcase_it.wav': 'Per favore, ho perso la mia valigia.',
    'what_time_it.wav': 'A che ora e’ il mio aereo?',
    'where_it.wav': 'Dove sono i ristoranti e i negozi?',
    # ES
    'checkin_es.wav': '¿Dónde están los mostradores?',
    'parents_es.wav': 'He perdido a mis padres.',
    'suitcase_es.wav': 'Por favor, he perdido mi maleta.',
    'what_time_es.wav': '¿A qué hora es mi avión?',
    'where_es.wav': '¿Dónde están los restaurantes y las tiendas?'
}

For correct computation of WER we need to transform both transcription and recognized text to some unified form. So we define function `normalize_text` for that purpose.

In [18]:
def normalize_text(text):
    normalized_text = text.lower()
    normalized_text = unicodedata.normalize('NFD', normalized_text)
    normalized_text = ''.join([c for c in normalized_text if not unicodedata.combining(c)])
    normalized_text = re.sub('[^a-z ]', '', normalized_text).lower()
    return normalized_text

Init ASRs we are working with. At the result we have something similar to Parameters Grid for evaluation of different ASR models.

In [19]:
def create_deepspeech_model_en():
    model = DeepspeechModel('models/deepspeech/deepspeech-0.9.3-models.pbmm')
    model.enableExternalScorer('models/deepspeech/deepspeech-0.9.3-models.scorer')
    return model

def create_deepspeech_model_es():
    model = DeepspeechModel('models/deepspeech/output_graph_es.pbmm')
    model.enableExternalScorer('models/deepspeech/kenlm_es.scorer',)
    return model

def create_deepspeech_model_it():
    model = DeepspeechModel('models/deepspeech/output_graph_it.pbmm')
    model.enableExternalScorer('models/deepspeech/kenlm_it.scorer')
    return model

def recognize_speech_with_deepspeech(model, signal):
    return model.stt(signal)

def recognize_speech_with_deepspeech_it(model, signal):
    result = max(model.sttWithMetadata(signal, 5).transcripts, key=lambda t: len(t.tokens)).tokens
    return''.join([token.text for token in result])

def create_pocketsphinx_model():
    return Decoder()

def recognize_speech_with_pocketsphinx(model, signal):
    signal_bytes= signal.tobytes()
    model.reinit()
    model.start_utt()
    for i, _ in enumerate(signal_bytes[::2048]):
        model.process_raw(signal_bytes[i * 2048:(i + 1) * 2048])
    model.end_utt()
    return model.hyp().hypstr

def create_vosk_model():
    return KaldiRecognizer(VoskModel(lang="en-us"), 16000)

def recognize_speech_with_vosk(model, signal):
    signal_bytes= signal.tobytes()
    model.Reset()
    for i, _ in enumerate(signal_bytes[::4000]):
        model.AcceptWaveform(signal_bytes[i * 4000:(i + 1) * 4000])

    return json.loads(model.FinalResult())["text"]

def create_speechbrain_model():
    return EncoderASR.from_hparams(
        source="speechbrain/asr-wav2vec2-commonvoice-14-en",
        savedir="pretrained_models/asr-wav2vec2-commonvoice-14-en"
    )

def recognize_speech_with_speechbrain(model, signal):
    waveform = model.audio_normalizer(torch.tensor(signal), 16000)
    result = model.transcribe_batch(waveform.unsqueeze(0), torch.tensor([1.0]))[0]
    return str(result[0])

models = {
    'deepspeech': {
        'EN': {
            'create_model': create_deepspeech_model_en,
            'recognize_speech': recognize_speech_with_deepspeech,
            'use_noise_processing': True
        },
        'ES': {
            'create_model': create_deepspeech_model_es,
            'recognize_speech': recognize_speech_with_deepspeech,
            'use_noise_processing': True
        },
        'IT': {
            'create_model': create_deepspeech_model_it,
            'recognize_speech': recognize_speech_with_deepspeech_it,
            'use_noise_processing': True
        }
    },
    'pocketsphinx': {
        'EN': {
            'create_model': create_pocketsphinx_model,
            'recognize_speech': recognize_speech_with_pocketsphinx,
            'use_noise_processing': False
        }
    },
    'vosk': {
        'EN': {
            'create_model': create_vosk_model,
            'recognize_speech': recognize_speech_with_vosk,
            'use_noise_processing': False
        }
    },
    'speechbrain': {
        'EN': {
            'create_model': create_speechbrain_model,
            'recognize_speech': recognize_speech_with_speechbrain,
            'use_noise_processing': True
        }
    }
}

Define function for noise processing.

In [20]:
def process_noise(signal, samplerate, language):
    normalized_signal = librosa.util.normalize(signal)

    if language == 'EN':
        return np.concatenate((
            np.zeros(np.round(samplerate * 0.3).astype(np.int32), dtype=np.float32),
            nr.reduce_noise(
                y=normalized_signal,
                sr=samplerate,
                hop_length=128,
                n_fft=512,
                win_length=512,
                prop_decrease=0.9,
                time_constant_s=0.1,
                freq_mask_smooth_hz = 6000
            )
        ))

    if language == 'ES':
        return np.concatenate((
            np.zeros(np.round(samplerate * 0.3).astype(np.int32), dtype=np.float32),
            nr.reduce_noise(
                y=normalized_signal,
                sr=samplerate,
                hop_length=128,
                n_fft=512,
                win_length=512,
                prop_decrease=0.9,
                time_constant_s=1,
                freq_mask_smooth_hz = 1500
            )
        ))

    if language == 'IT':
        return np.concatenate((
            np.zeros(np.round(samplerate * 0.3).astype(np.int32), dtype=np.float32),
            nr.reduce_noise(
                y=normalized_signal,
                sr=samplerate,
                hop_length=128,
                n_fft=512,
                win_length=512,
                prop_decrease=0.1,
                time_constant_s=1,
                freq_mask_smooth_hz = 100
            )
        ))

Run evaluation of different ASR models. At the result we should have a table with each ASR model and language required. That table also should contain WER rate and time spent to recognize speech.

In [22]:
final_report = []

for model_name, languages in models.items():
    for language, model_tools in languages.items():
        wers = []
        time_spents = []
  
        model = model_tools['create_model']()
    
        for audio_file in [file for file in os.listdir(f'audio/{language}') if file.endswith('.wav')]:
            signal, _ = librosa.load(f'audio/{language}/{audio_file}', sr=16000)
            
            if model_tools['use_noise_processing']:
                signal = process_noise(signal, 16000, language)
                
            signal = (signal * 32767).astype(np.int16)

            start = time.time_ns()
            recognized_text = model_tools['recognize_speech'](model, signal)
            end = time.time_ns()
            
            time_spent_ms = (end - start) / 1000000
            time_spents.append(time_spent_ms)
    
            wer = calculate_wer(normalize_text(transcriptions[audio_file]), normalize_text(recognized_text))
            wers.append(wer)

            final_report.append({
                'Model': model_name,
                'Language': language,
                'File': audio_file,
                'Time Spent, ms': int(np.round(time_spent_ms)),
                'wer': f'{int(np.round(wer * 100))}%'
            })

        final_report.append({
            'Model': model_name,
            'Language': language,
            'File': '*',
            'Time Spent, ms': int(np.round(np.mean(time_spents))),
            'wer': f'{int(np.round(np.mean(wers) * 100))}%'
        })

pd.set_option('display.max_rows', None)
pd.DataFrame(final_report)

TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /Users/bastrich/.cache/vosk/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /Users/bastrich/.cache/vosk/vosk-model-small-en-us-0.15/graph/HCLr.fst /Users/ba

Unnamed: 0,Model,Language,File,"Time Spent, ms",wer
0,deepspeech,EN,where.wav,3130,0%
1,deepspeech,EN,parents.wav,1951,20%
2,deepspeech,EN,your_sentence2.wav,983,25%
3,deepspeech,EN,your_sentence1.wav,2044,50%
4,deepspeech,EN,suitcase.wav,1745,0%
5,deepspeech,EN,checkin.wav,1822,20%
6,deepspeech,EN,what_time.wav,1266,20%
7,deepspeech,EN,*,1849,19%
8,deepspeech,ES,parents_es.wav,2467,0%
9,deepspeech,ES,what_time_es.wav,1384,83%
