In [1]:
import glob
import json
import os
import wave
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import soundfile
from IPython.display import Audio
from tqdm import tqdm
from vosk import Model, KaldiRecognizer, SetLogLevel

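Downloading a Vosk model (the commands below fetch the Russian model as an example; this notebook itself uses the English model `vosk-model-en-us-daanzu-20200905`, available from https://alphacephei.com/vosk/models):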

wget https://alphacephei.com/kaldi/models/vosk-model-ru-0.10.zip

unzip vosk-model-ru-0.10.zip

In [2]:
# Table of normalized sentences, read from data/normalized_mtsamples.csv
df_mtsamples = pd.read_csv(
    "data/normalized_mtsamples.csv",
    index_col=False,
)

# Directory of the unpacked Vosk model (English model)
model_directory = "vosk-model-en-us-daanzu-20200905"

### Functions

In [3]:
# Load the Vosk model once at module level instead of inside asr_with_vosk,
# to optimize memory usage.
model = Model(model_directory)

def asr_with_vosk(filename, model_directory):
    """Run Vosk speech recognition on a WAV file and collect the raw results.

    The recognizer is built from the module-level ``model`` object, so the
    model is not reloaded on every call; ``model_directory`` is only checked
    for existence.

    Args:
        filename: Path to a mono, 16-bit, uncompressed PCM WAV file.
        model_directory: Path to the unpacked Vosk model directory.

    Returns:
        dict: Maps ``"0"``, ``"1"``, ... to the JSON strings produced by the
        recognizer, in the order they were emitted. The last entry is the
        recognizer's final result for the audio that remained after the
        last detected utterance boundary (its text may be empty).

    Raises:
        FileNotFoundError: If ``model_directory`` does not exist.
        ValueError: If the audio is not mono 16-bit uncompressed PCM.
    """
    # Raise instead of print + exit(1): an exception reports the problem to
    # the caller and reliably stops the function.
    if not os.path.exists(model_directory):
        raise FileNotFoundError(
            "Please download the model from https://alphacephei.com/vosk/models "
            "and unpack as 'model' in the current folder."
        )

    # The context manager guarantees the file handle is released even when
    # validation or recognition fails.
    with wave.open(filename, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise ValueError("Audio file must be WAV format mono PCM.")

        SetLogLevel(0)
        rec = KaldiRecognizer(model, wf.getframerate())

        results = {}
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # AcceptWaveform returns True when an utterance has been completed.
            if rec.AcceptWaveform(data):
                results[str(len(results))] = rec.Result()

        # Flush the recognizer so audio after the last utterance boundary
        # is not silently dropped.
        results[str(len(results))] = rec.FinalResult()

    return results

### ASR of single record

In [8]:
# Pick one recording by its numeric id and load it
wavs_directory = "/SSD-2T/medical_domain_adaptation_dataset/wavs/"
file_id = 5
filepath = f"{wavs_directory}{file_id}.wav"

wav, sr = librosa.load(filepath)

# Show a playback widget and report the clip length
display(Audio(wav, rate=sr))
duration = librosa.get_duration(y=wav, sr=sr)
print(f"Audio duration: {duration}")

# Surround the signal with 30000 zero samples on each side and
# store the result as a temporary WAV file for the recognizer
padding_wav = np.zeros((30000,))
preprocessed_filename = "temporary_record.wav"
padded_wav = np.concatenate([padding_wav, wav, padding_wav], axis=0)
soundfile.write(preprocessed_filename, padded_wav, samplerate=sr)

Audio duration: 5.17501133786848


In [9]:
# Transcribe the padded temporary WAV file written in the previous cell
results = asr_with_vosk(preprocessed_filename, model_directory)

In [10]:
# Reference sentence for the record that was transcribed above
originat_text = df_mtsamples.sentence[file_id]

# Each recognizer result is a JSON document. Parse it with json.loads rather
# than eval: eval executes arbitrary code and fails on JSON literals such as
# true / false / null.
vosk_detected_utterances = []
for raw_result in results.values():
    text = json.loads(raw_result).get('text', '')
    if text != '':
        vosk_detected_utterances.append(text)

print('Target:')
print(originat_text)
print('Prediction:')
print(" ".join(vosk_detected_utterances))

Target:
hemostasis was assured within the mesentery and at the base of the cecum.
Prediction:
he must assist was a short within the midcentury at at the base of the seek them


### ASR of the whole synthesized set

In [4]:
def _join_utterances(results):
    """Join the non-empty ``text`` fields of the recognizer's JSON results."""
    utterances = []
    for raw_result in results.values():
        # Results are JSON strings; json.loads is safe where eval is not.
        text = json.loads(raw_result).get('text', '')
        if text:
            utterances.append(text)
    return " ".join(utterances)


def speech2text(model_directory, input_directory, output_directory):
    """Run ASR over a directory of recordings, one output text file per record.

    Records whose ``<id>.txt`` already exists in ``output_directory`` are
    skipped, so the function can be re-run to resume an interrupted job.

    Args:
        model_directory: Path to the Vosk model.
        input_directory: Folder with ``<id>.wav`` records (file names must be
            integer id numbers). A trailing path separator is optional.
        output_directory: Folder where the ASR output ``<id>.txt`` files are
            saved. A trailing path separator is optional.

    Returns:
        None. Results are written to ``output_directory``.

    Raises:
        ValueError: If a ``.wav`` or ``.txt`` file name does not start with
            an integer id.
    """
    wav_paths = glob.glob(os.path.join(input_directory, "*.wav"))
    text_paths = glob.glob(os.path.join(output_directory, "*.txt"))

    # A set keeps the "already transcribed?" lookup constant-time; with tens
    # of thousands of records a list lookup per record is quadratic.
    done_ids = {int(os.path.basename(path).split('.')[0]) for path in text_paths}

    pending = []
    for path in wav_paths:
        record_id = int(os.path.basename(path).split('.')[0])
        if record_id not in done_ids:
            pending.append((record_id, path))

    if not pending:
        # Nothing left to transcribe (e.g. a re-run after a completed job).
        return

    # Zero samples added before and after each record prior to recognition.
    padding_wav = np.zeros((25000,))
    preprocessed_filename = 'temporary_record.wav'

    for record_id, wav_path in tqdm(pending, desc="STT generation"):
        audio_sample, sr = librosa.load(wav_path)

        soundfile.write(preprocessed_filename,
                        np.concatenate([padding_wav, audio_sample, padding_wav], axis=0),
                        samplerate=sr)

        results = asr_with_vosk(preprocessed_filename, model_directory)
        detected_text = _join_utterances(results)

        # "w" plus a context manager: the transcript is written exactly once
        # and the handle is closed even if the write fails.
        filepath = os.path.join(output_directory, str(record_id) + '.txt')
        with open(filepath, "w") as text_file:
            text_file.write(detected_text)

In [5]:
# Recognized texts are written here; create the folder if it is missing
texts_directory = "data/texts/"
Path(texts_directory).mkdir(exist_ok=True, parents=True)

# Folder holding the synthesized recordings
wavs_directory = "/SSD-2T/medical_domain_adaptation_dataset/wavs/"

In [6]:
# Transcribe every recording in wavs_directory that has no text file in texts_directory yet
speech2text(model_directory, wavs_directory, texts_directory)

STT generation: 100%|██████████| 92224/92224 [17:07:01<00:00,  1.50it/s]   
