[Speech Recognition with Timestamps](https://towardsdatascience.com/speech-recognition-with-timestamps-934ede4234b2)

[How to import custom modules in google colab?](https://stackoverflow.com/questions/52733786/how-to-import-custom-modules-in-google-colab)

# Speech recognition with timestamps using Vosk

## Set Paths

In [None]:
# Root folder of the project on the mounted Google Drive
project_path = '/content/drive/MyDrive/Colab Notebooks/Speech Recognition'

# Everything else lives underneath the project root
libary_file_path = f'{project_path}/Word.py'
model_folder_path = f'{project_path}/models/vosk-model-small-en-us-0.15'
audio_folder_path = f'{project_path}/audio'
text_folder_path = f'{project_path}/text'

## Import Libraries

In [None]:
# Install the two third-party packages this notebook imports: vosk and pydub
!pip install vosk
!pip install pydub

In [None]:
import os
import sys
import wave
import json

from google.colab import drive

from pydub import AudioSegment

from vosk import Model, KaldiRecognizer, SetLogLevel

# Set the vosk log level to 0.
# NOTE(review): vosk examples commonly pass -1 to silence logging - confirm that 0 is the intended verbosity
SetLogLevel(0)

In [None]:
# Mount Google Drive under /content/drive/, the prefix that project_path is built on.
# force_remount=True presumably requests a fresh mount even when the drive is already mounted - confirm in the Colab docs
mounted_dir = '/content/drive/'
drive.mount(mounted_dir, force_remount=True)

In [None]:
import shutil

# Show the notebook's current working directory
print(os.getcwd())

# Copy the custom module from Google Drive into /content, where the
# `import Word` cell presumably picks it up. shutil.copy receives the path
# as a plain string, so spaces (or any other shell metacharacters) in the
# Drive path need no manual escaping.
shutil.copy(libary_file_path, '/content')

In [None]:
import Word as custom_Word

## Loading a vosk model

In [None]:
# Load the vosk model; models can be downloaded from https://alphacephei.com/vosk/models
if not os.path.exists(model_folder_path):
    # Passing the message to sys.exit() writes it to stderr and exits with a
    # non-zero status; a bare sys.exit() would report success.
    sys.exit(f"Please download the model from https://alphacephei.com/vosk/models and unpack as {model_folder_path}")

print(f"Reading your vosk model '{model_folder_path}'...")
model = Model(model_folder_path)
print(f"'{model_folder_path}' model was successfully read")

## Specify the file name to recognize

In [None]:
# source recording and the mono copy that is later opened for recognition,
# both inside the audio folder
audio_filename, audio_mono_filename = (
    os.path.join(audio_folder_path, name) for name in ('Test.wav', 'Test_Mono.wav')
)
# destination file for the recognized words
text_filename = os.path.join(text_folder_path, 'Test.txt')

## Reading a file

In [None]:
# Convert the input recording to a single-channel WAV file
wf = AudioSegment.from_wav(audio_filename).set_channels(1)
# export() returns the file object it wrote to; close it immediately so the
# handle is not left open when the mono file is opened again for reading
wf.export(audio_mono_filename, format="wav").close()

In [None]:
# Open the mono WAV file for reading; `wf` stays open because later cells
# query its format and read frames from it
if not os.path.exists(audio_mono_filename):
    # Passing the message to sys.exit() writes it to stderr and exits with a
    # non-zero status; a bare sys.exit() would report success.
    sys.exit(f"File '{audio_mono_filename}' doesn't exist")

print(f"Reading your file '{audio_mono_filename}'...")
wf = wave.open(audio_mono_filename, "rb")
print(f"'{audio_mono_filename}' file was successfully read")

In [None]:
# Require one channel, 2-byte samples and no compression.
# Stop here instead of only printing a message; otherwise the recognition
# cells would run on audio in an unsupported format.
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    sys.exit("Audio file must be WAV format mono PCM.")

## Recognize

In [None]:
# Create a recognizer from the loaded model and the frame rate of the opened WAV file
rec = KaldiRecognizer(model, wf.getframerate())
# Presumably enables per-word entries in the results (the 'result' list that
# the conversion cell reads) - confirm against the vosk documentation
rec.SetWords(True)

In [None]:
results = []

# feed the audio to the recognizer 4000 frames at a time and collect every
# result it reports as ready
while True:
    data = wf.readframes(4000)
    if not data:
        # nothing left to read
        break
    if not rec.AcceptWaveform(data):
        continue
    results.append(json.loads(rec.Result()))

# collect the final result once all audio has been fed in
part_result = json.loads(rec.FinalResult())
results.append(part_result)

`results` is a list of JSON dictionaries, each with the following structure:

```
{'result': [
  # first word in a sentence
  {'conf': 0.84, # confidence
   'end': 4.5, # end time
   'start': 4.05, # start time
   'word': 'test'},
  # then, same parameters for 
  # the second word in a sentence
  {'conf': 0.87, 
   'end': 5.7, 
   'start': 5.1, 
   'word': 'library'},
  ... ], # and so on 
 # and a full text of the sentence
 'text': 'test library ...'}
 ```

In [None]:
# Convert the list of JSON dictionaries into a flat list of 'Word' objects
list_of_words = []
for sentence in results:
    # Recognition sometimes yields a dictionary that carries only an empty
    # text, i.e. {'text': ''}, with no 'result' key. Looking the key up with
    # a default skips such entries without relying on the dictionary's size.
    for obj in sentence.get('result', []):
        w = custom_Word.Word(obj)  # wrap the word's JSON entry in a custom Word object
        list_of_words.append(w)

In [None]:
# Print one line per recognized word, formatted by the Word object's own to_string()
for word in list_of_words:
    print(word.to_string())

In [None]:
# build the full transcript: the text of every result, each followed by one space
text = ''.join(r['text'] + ' ' for r in results)

print("\tVosk thinks you said:\n")
print(text)

In [None]:
# One line per recognized word, each terminated by a newline
text = ''.join(word.to_string() + "\n" for word in list_of_words)

print(f"Saving text to '{text_filename}'...")
# Explicit UTF-8 so that non-ASCII words are written the same way regardless
# of the platform's default encoding
with open(text_filename, "w", encoding="utf-8") as text_file:
    text_file.write(text)
print("Text successfully saved")
print(f"Text successfully saved")