This notebook builds a speech-recognition tool: someone talks into their microphone, and the speech is transcribed into text.

We need to perform two tasks at the same time: one to continuously record from the microphone, and the other to transcribe the recorded speech to text. We do this with threads (functions that run in the background).

In [11]:
import ipywidgets as widgets
from IPython.display import display
from threading import Thread
from queue import Queue

# Queues shared between the button callbacks and the worker threads.
messages = Queue()    # non-empty while a recording session is active
recordings = Queue()  # batches of recorded audio waiting to be transcribed

# Button that starts a recording session.
record_button = widgets.Button(
    description="Record",
    disabled=False,
    button_style="success",
    icon="microphone",
)

# Button that ends the recording session.
stop_button = widgets.Button(
    description="Stop",
    disabled=False,
    button_style="warning",
    icon="stop",
)

# Output area that shows the transcript as it is generated.
output = widgets.Output()

#Function for when we start recording
def start_recording(data):
    """Start the recording and transcription worker threads.

    Registered as the click handler of the Record button. ``data`` is the
    argument supplied by the click callback and is not used.

    A click while a session is already active is ignored. Previously every
    click put another flag on ``messages`` and spawned another pair of
    workers, so a single Stop click (which removes one flag) no longer
    ended the session.
    """
    # A non-empty ``messages`` queue is the "recording is active" signal that
    # both workers poll, so it doubles as the already-running check.
    if not messages.empty():
        return

    # Set the flag before starting the workers: their loops exit as soon as
    # they see an empty ``messages`` queue.
    messages.put(True)

    with output:
        display('Starting...')
        record = Thread(target=record_microphone)
        record.start()

        transcribe = Thread(target=speech_recognition, args=(output,))
        transcribe.start()

#Function for when we stop recording
def stop_recording(data):
    """Signal the worker threads to stop by emptying ``messages``.

    Registered as the click handler of the Stop button. ``data`` is the
    argument supplied by the click callback and is not used.

    The queue is drained without blocking. The previous blocking
    ``messages.get()`` never returned when Stop was clicked while nothing
    was recording, which hung the callback. Draining every flag also ends
    the session when more than one flag has been queued.
    """
    # Local import: only ``Queue`` is imported from ``queue`` at file level.
    from queue import Empty

    with output:
        while True:
            try:
                messages.get_nowait()
            except Empty:
                break  # queue is empty, so the workers will see the stop signal
        display('Stopped.')


#Linking widgets to the functions above: each function is registered as its button's click handler
record_button.on_click(start_recording)
stop_button.on_click(stop_recording)

#Show both buttons and the transcript output area
display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle())



Output()

In [12]:
#Need to figure out which of our sound devices we would like to use
import pyaudio

#Initialize the PyAudio connection to the system audio devices
p = pyaudio.PyAudio()
try:
    #Print the info dict of every audio device so we can find the microphone's index
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    p.terminate() #Always release the PyAudio connection, even if a device query fails

{'index': 0, 'structVersion': 2, 'name': 'BlackHole 2ch', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.0014512471655328798, 'defaultHighInputLatency': 0.1, 'defaultHighOutputLatency': 0.011609977324263039, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'MacBook Pro Microphone', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.03414965986394558, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.05591836734693877, 'defaultHighOutputLatency': 0.1, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'MacBook Pro Speakers', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.009229024943310658, 'defaultHighInputLatency': 0.1, 'defaultHighOutputLatency': 0.019387755102040816, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Microso

In [13]:
#Constants to define how our audio will be recorded
CHANNELS = 1 #Number of channels requested when the input stream is opened
FRAME_RATE = 16000 #Sample rate passed to the input stream and to the recognizer
RECORD_SECONDS = 5 #How many seconds of audio we collect before sending it for transcription
AUDIO_FORMAT = pyaudio.paInt16 #Sample format passed to the input stream
SAMPLE_SIZE = 2 #Presumably bytes per sample for the 16-bit format - not referenced in the visible code, TODO confirm

def record_microphone(chunk=1024, device_index=1):
    """Record from the microphone and queue the audio for transcription.

    Runs until the ``messages`` queue becomes empty (the stop signal). Audio
    is read ``chunk`` frames at a time. Whenever about ``RECORD_SECONDS`` of
    audio has accumulated, the list of raw buffers is put on ``recordings``
    and a new list is started.

    Args:
        chunk: Number of frames read from the stream per call; also passed
            to the stream as ``frames_per_buffer``.
        device_index: Index of the input device to open. Defaults to 1, the
            value that used to be hard-coded. Use the index printed by the
            device-listing cell for the microphone on your machine.
            NOTE(review): PyAudio documents ``None`` as selecting the
            default input device - confirm before relying on it.
    """
    p = pyaudio.PyAudio()
    try:
        #Connect to our microphone and start recording
        stream = p.open(format=AUDIO_FORMAT,
                        channels=CHANNELS,
                        rate=FRAME_RATE,
                        input=True,
                        input_device_index=device_index,
                        frames_per_buffer=chunk)
        try:
            #Number of buffers that make up one RECORD_SECONDS window (loop-invariant)
            buffers_per_window = (FRAME_RATE * RECORD_SECONDS) / chunk
            frames = [] #Audio buffers collected for the current window

            while not messages.empty():
                frames.append(stream.read(chunk))

                #Hand a full window to the transcriber and start a new one
                if len(frames) >= buffers_per_window:
                    recordings.put(frames)
                    frames = []

            #Recording was stopped mid-window: queue what was captured so the
            #last words spoken before Stop are not silently dropped
            if frames:
                recordings.put(frames)
        finally:
            #Stop and close the stream even if reading failed
            stream.stop_stream()
            stream.close()
    finally:
        #Release the PyAudio connection even if the stream could not be opened
        p.terminate()

In [14]:
import subprocess
import json
from vosk import Model, KaldiRecognizer

#Various models to use, will use the smallest sized one
#https://alphacephei.com/vosk/models
model = Model(model_name = "vosk-model-small-en-us-0.15")

#Create recognizer which will use model for speech recognition
#FRAME_RATE is the same rate the microphone stream is opened with
rec = KaldiRecognizer(model, FRAME_RATE)
rec.SetWords(True) #Will give us confidence level for words

#Function to take in output widget to display transcription live
def speech_recognition(output):
    """Transcribe queued recordings and append the text to ``output``.

    Keeps running while recording is active (``messages`` is non-empty) or
    while audio is still waiting in ``recordings``, so audio queued just
    before Stop is transcribed rather than left behind.

    Args:
        output: Output widget; transcribed text is added to it with
            ``append_stdout``.
    """
    # Local import: only ``Queue`` is imported from ``queue`` at file level.
    from queue import Empty

    while not messages.empty() or not recordings.empty():
        try:
            # Wait with a timeout so the loop re-checks the stop signal.
            # A plain get() blocked forever once recording had stopped,
            # leaving a stale thread that consumed the next session's audio.
            frames = recordings.get(timeout=0.5)
        except Empty:
            continue

        rec.AcceptWaveform(b''.join(frames)) #Joining all chunks of our frames into one
        result = rec.Result()
        text = json.loads(result)['text']

        # Separate successive transcripts with a space so the last word of
        # one batch does not run into the first word of the next.
        # Delete this block if using the commented-out recasepunc lines below.
        if text:
            output.append_stdout(text + ' ')
        
'''
The following lines add punctuation to our transcription. Due to storage issues,
I will comment this part out for now, but the following lines of code allow punctuation and capitalization
to be included within the transcription. 
https://github.com/benob/recasepunc
'''
        #cased = subprocess.check_output('python recasepunc/recasepunc.py predict recasepunc/checkpoint', shell = True, text = True, input = text)
        #output.append_stdout(cased) 

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /Users/brandonamarasingam/.cache/vosk/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /Users/brandonamarasingam/.cache/vosk/vosk-model-small-en-us-0.15/graph/HCLr.fst /Users/brandonamarasingam/.cache/vosk/vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo /Users/brandonamarasingam/.cache/vosk/vosk-mo

'\nThe following lines are in relation too including punctuation within our transcription. Due too storage issues,\nI will comment this part out for now, but what the following lines of code will allow for punctuations and capitilization\ntoo be included within the transcription. \nhttps://github.com/benob/recasepunc\n'