In [4]:
# adapted from https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
# which has a Mozzila Public License:
# https://github.com/mozilla/DeepSpeech/blob/master/LICENSE

from deepspeech import Model, version
import librosa as lr
import numpy as np
import os 
from jiwer import wer
from pydub import AudioSegment


# paths to models and scorers
en_scorer = "./models/deepspeech-0.9.3-models.scorer"
en_model = "./models/deepspeech-0.9.3-models.pbmm"
es_scorer = "./models/kenlm_es.scorer"
es_model = "./models/output_graph_es.pbmm"
it_scorer = "./models/kenlm_it.scorer"
it_model = "./models/output_graph_it.pbmm"

#names of audio files
en_audio_files = ['checkin.wav', 'checkin_child.wav','parents.wav','parents_child.wav', 'suitcase.wav',
               'suitcase_child.wav','what_time.wav','what_time_child.wav', 'where.wav','where_child.wav',
                 'your_sentence1.wav', 'your_sentence2.wav']
it_audio_files = ['checkin_it.wav','parents_it.wav', 'suitcase_it.wav',
              'what_time_it.wav', 'where_it.wav']
es_audio_files = ['checkin_es.wav','parents_es.wav', 'suitcase_es.wav',
              'what_time_es.wav', 'where_es.wav']

#Transcript that we are going to compare our initial work with 
en = ['where is the check in desk','where is the check in desk','i have lost my parents','i have lost my parents',
      'please i have lost my suitcase','please i have lost my suitcase', 'what time is my plane',
        'what time is my plane','where are the restaurants and shops','where are the restaurants and shops',
     'how are you', 'you are welcome']

es = ['dónde están los mostradores','he perdido a mis padres','por favor he perdido mi maleta',
     'a qué hora es mi avión','dónde están los restaurantes y las tiendas']
it = ['dove e il bancone', 'ho perso i miei genitori','per favore ho perso la mia valigia',
         'a che ora e il mio aereo', 'dove sono i ristoranti e i negozi']




In [11]:


def deepspeech_ASR(model, scorer, language):
    
    audio_dir = ''
    audio_file_list = '' 
    
    ds = Model(model)
    ds.enableExternalScorer(scorer)
    hypothesis = []
    one_sec_segment = AudioSegment.silent(duration=700) # silent padding 
    i = 0
    desired_sample_rate = ds.sampleRate()
    
    if (language == 'english'):
        audio_dir = 'EN/'
        audio_file_list = en_audio_files
    elif (language == 'italian'):
        audio_dir = 'IT/'
        audio_file_list = it_audio_files 
    elif (language == 'spanish'):
        audio_dir = 'ES/'
        audio_file_list = es_audio_files 
        
    for audio_file_name in en_audio_files:
        


        audio = AudioSegment.from_wav("./Ex4_audio_files/"+audio_dir+audio_file_name)
        audio = audio + 6
        audio = audio.apply_gain(audio.dBFS - audio.max_dBFS)
        audio = one_sec_segment + audio + one_sec_segment
        audio.export("temp.wav", format="wav")



        audio = lr.load("temp.wav", sr=desired_sample_rate)[0]    
        audio = (audio * 32767).astype(np.int16) # scale from -1 to 1 to +/-32767
        res = ds.stt(audio)
        hypothesis.append(res)
        print("Language: "+language+" file : "+audio_file_name+ " WER: "+str(wer(res, en[i]) * 100)+'%')
        i+=1
    print('Predicted sentences')
    print(hypothesis)
    print('=================')
    print('actual sentences')
    print(en)
    print("Total WER for "+language+ ": "+str(wer(hypothesis, en) * 100)+'%')



In [12]:
deepspeech_ASR(en_model, en_scorer, 'english')

Language: english file : checkin.wav WER: 0.0%
Language: english file : checkin_child.wav WER: 166.66666666666669%
Language: english file : parents.wav WER: 20.0%
Language: english file : parents_child.wav WER: 0.0%
Language: english file : suitcase.wav WER: 0.0%
Language: english file : suitcase_child.wav WER: 33.33333333333333%
Language: english file : what_time.wav WER: 20.0%
Language: english file : what_time_child.wav WER: 20.0%
Language: english file : where.wav WER: 0.0%
Language: english file : where_child.wav WER: 20.0%
Language: english file : your_sentence1.wav WER: 0.0%
Language: english file : your_sentence2.wav WER: 0.0%
Predicted sentences
['where is the check in desk', 'the variegated as', 'i had lost my parents', 'i have lost my parents', 'please i have lost my suitcase', 'then i have lost my sakes', 'what time is my plan', 'what time is my plan', 'where are the restaurants and shops', 'where the restaurants and shops', 'how are you', 'you are welcome']
actual sentence