In [1]:
from transformers import pipeline
import torch

# Run on the first CUDA GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Audio-classification pipeline backed by the
# "MIT/ast-finetuned-speech-commands-v2" checkpoint, placed on `device`.
classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)

In [8]:
import subprocess
import numpy as np
import sys

def launch_fn(wake_word="marvin", prob_threshold=0.5, chunk_length_s=1,
              debug=False, amplitdude_threshold=2000):
    """Listen on the microphone until the wake word is recognised.

    Raw 16 kHz mono 16-bit PCM is streamed from an ``ffmpeg`` subprocess and
    consumed in fixed-size chunks. A chunk whose peak sample magnitude exceeds
    ``amplitdude_threshold`` is standardised and handed to the module-level
    ``classifier`` pipeline; only the first prediction it returns is examined.

    Args:
        wake_word: Label to wait for. Must be a key of
            ``classifier.model.config.label2id``.
        prob_threshold: Score the first prediction must exceed to be acted on.
        chunk_length_s: Chunk-size multiplier. One unit is 40960 bytes, i.e.
            20480 16-bit samples (1.28 s at 16 kHz) rather than exactly one
            second. Fractional values are accepted.
        debug: When true, print the first prediction of every classified chunk.
        amplitdude_threshold: Peak sample magnitude (int16 scale) a chunk must
            exceed to be classified. The spelling of the parameter name is
            kept as-is so existing keyword callers keep working.

    Returns:
        ``True`` when the wake word is detected. ``None`` when the ffmpeg
        stream ends, when the label 'stop' is recognised, or when the user
        interrupts the kernel.

    Raises:
        ValueError: If ``wake_word`` is not a known label, or if
            ``chunk_length_s`` does not yield a positive chunk size.
    """
    if wake_word not in classifier.model.config.label2id.keys():
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
        )

    # read() needs an integer byte count, and int16 decoding needs an even one.
    chunk_bytes = int(40960 * chunk_length_s)
    chunk_bytes -= chunk_bytes % 2
    if chunk_bytes <= 0:
        raise ValueError(f"chunk_length_s must be positive, got {chunk_length_s!r}.")

    # ffmpeg lib required for this function
    ffmpeg_command = [
        'ffmpeg',
        '-f', 'avfoundation',  # AVFoundation capture device (macOS)
        '-i', ':0',  # default microphone
        '-ac', '1',  # audio channels (1 for mono)
        '-ar', '16000',  # sample rate
        '-f', 's16le',  # format (signed 16-bit little endian)
        '-acodec', 'pcm_s16le',  # audio codec (PCM signed 16-bit little endian)
        '-'
    ]

    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        print('listening...')
        while True:
            data = process.stdout.read(chunk_bytes)
            # A short final read may end mid-sample; drop the dangling byte so
            # the buffer always holds whole int16 samples.
            data = data[:len(data) - len(data) % 2]
            if not data:
                break

            # Convert to float first: the magnitude check then covers
            # negative-going peaks too and cannot overflow on -32768.
            input_data = np.frombuffer(data, dtype=np.int16).astype(np.float32)

            # skip chunks that never exceed the amplitude threshold
            if not np.any(np.abs(input_data) > amplitdude_threshold):
                continue

            # A constant chunk has zero spread; normalising it would divide by
            # zero and feed NaN/inf to the model, so skip it.
            spread = input_data.std()
            if spread == 0:
                continue

            # normalize the data: zero mean, scaled by twice the std deviation
            input_data = (input_data - input_data.mean()) / (spread * 2)
            prediction = classifier(input_data)[0]

            if debug:
                print('debug',prediction)
            if prediction["score"] > prob_threshold:
                if prediction["label"] == wake_word:
                    return True
                if prediction["label"] == 'stop':
                    # A spoken 'stop' takes the same exit path as Ctrl-C.
                    raise KeyboardInterrupt()

    except KeyboardInterrupt:
        print("stoped by user")
    finally:
        # Single cleanup point for every exit path: stop ffmpeg, release the
        # pipe, and reap the child so no zombie process is left behind.
        process.kill()
        if process.stdout is not None:
            process.stdout.close()
        process.wait()
    

In [9]:
# Listen for the default wake word ("marvin"); the cell shows True once it is detected.
launch_fn(debug=False)

listening...


True

In [11]:
# Speech-recognition pipeline backed by the "openai/whisper-base.en"
# checkpoint, placed on the device chosen earlier in the notebook.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
def transcribe(chunk_length_s=2, amplitdude_threshold=2000):
    """Continuously transcribe microphone audio and echo the text in place.

    Raw 16 kHz mono 16-bit PCM is streamed from an ``ffmpeg`` subprocess and
    consumed in fixed-size chunks. A chunk whose peak sample magnitude exceeds
    ``amplitdude_threshold`` is standardised and handed to the module-level
    ``transcriber`` pipeline; the resulting text overwrites the current
    terminal line.

    Args:
        chunk_length_s: Chunk-size multiplier. One unit is 40960 bytes, i.e.
            20480 16-bit samples (1.28 s at 16 kHz) rather than exactly one
            second. Fractional values are accepted.
        amplitdude_threshold: Peak sample magnitude (int16 scale) a chunk must
            exceed to be transcribed. The spelling of the parameter name is
            kept as-is so existing keyword callers keep working.

    Returns:
        The transcribed text if the pipeline result carries a non-empty
        ``'partial'`` entry whose first element is falsy. ``None`` when the
        ffmpeg stream ends or the user interrupts the kernel.

    Raises:
        ValueError: If ``chunk_length_s`` does not yield a positive chunk size.
    """
    # read() needs an integer byte count, and int16 decoding needs an even one.
    chunk_bytes = int(40960 * chunk_length_s)
    chunk_bytes -= chunk_bytes % 2
    if chunk_bytes <= 0:
        raise ValueError(f"chunk_length_s must be positive, got {chunk_length_s!r}.")

    # ffmpeg lib required for this function
    ffmpeg_command = [
        'ffmpeg',
        '-f', 'avfoundation',  # AVFoundation capture device (macOS)
        '-i', ':0',  # default microphone
        '-ac', '1',  # audio channels (1 for mono)
        '-ar', '16000',  # sample rate
        '-f', 's16le',  # format (signed 16-bit little endian)
        '-acodec', 'pcm_s16le',  # audio codec (PCM signed 16-bit little endian)
        '-'
    ]

    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        print('listening...')
        while True:
            data = process.stdout.read(chunk_bytes)
            # A short final read may end mid-sample; drop the dangling byte so
            # the buffer always holds whole int16 samples.
            data = data[:len(data) - len(data) % 2]
            if not data:
                break

            # Convert to float first: the magnitude check then covers
            # negative-going peaks too and cannot overflow on -32768.
            input_data = np.frombuffer(data, dtype=np.int16).astype(np.float32)

            # skip chunks that never exceed the amplitude threshold
            if not np.any(np.abs(input_data) > amplitdude_threshold):
                continue

            # A constant chunk has zero spread; normalising it would divide by
            # zero and feed NaN/inf to the model, so skip it.
            spread = input_data.std()
            if spread == 0:
                continue

            # normalize the data: zero mean, scaled by twice the std deviation
            input_data = (input_data - input_data.mean()) / (spread * 2)
            item = transcriber(input_data, generate_kwargs={"max_new_tokens": 128})

            # \033[K clears the current terminal line so each new transcript
            # replaces the previous one; "\r" leaves the cursor at line start.
            sys.stdout.write("\033[K")
            print(item["text"], end="\r")
            # NOTE(review): a pipeline call on a plain array is not expected to
            # include a 'partial' key, in which case this never fires and the
            # loop only ends on EOF or interrupt -- confirm the intended exit.
            if item.get('partial') and not item["partial"][0]:
                return item['text']

    except KeyboardInterrupt:
        print("stoped by user")
    finally:
        # Single cleanup point for every exit path: stop ffmpeg, release the
        # pipe, and reap the child so no zombie process is left behind.
        process.kill()
        if process.stdout is not None:
            process.stdout.close()
        process.wait()
    

In [None]:
# Start live microphone transcription; interrupt the kernel to stop it.
transcribe()

listening...
[K Let us discuss this later.