In [1]:
# https://github.com/nalbion/whisper-server/blob/master/whisper_server/services/audio/microphone.py

import numpy as np
import pyaudio
import time
import logging 
import scipy

import wave
import librosa
import queue
import signal
from multiprocessing import Process, Queue, Event
from threading import Thread

from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment

CHUNK_LENGTH = 30  # seconds

SAMPLE_RATE = 16000# 16000
RECORD_SECONDS = CHUNK_LENGTH  # 30
# TODO: bring FRAMES_PER_BUFFER down and see if it breaks anything or improves latency
# FRAMES_PER_BUFFER = 1024
# FRAMES_PER_BUFFER = 3000
# OpenAI Whisper complains if not 480000: assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
# FRAMES_PER_BUFFER = RECORD_SECONDS * SAMPLE_RATE  # 480,000 = N_SAMPLES
# FRAMES_TO_PROCESS = FRAMES_PER_BUFFER >> 3
FORMAT = pyaudio.paInt16
CHANNELS = 1

logger = logging.getLogger(__name__)

class Microphone:
    def __init__(self, device_index=None):
        self.closed = False
        self.audio = pyaudio.PyAudio()
        self.run = False
        self.device_index = device_index
        self.device_sample_rate = None
        self.list_input_devices()
        self.frames_to_process = (RECORD_SECONDS * self.device_sample_rate) >> 1
        print(f"Use device index: {self.device_index} with sample rate {self.device_sample_rate}")
        self.stream = self.audio.open(format=FORMAT,
                                      channels=CHANNELS,
                                      rate=self.device_sample_rate,
                                      input=True,
                                      input_device_index=self.device_index,
                                      frames_per_buffer=RECORD_SECONDS * self.device_sample_rate)
        
        self.output_wave = wave.open("test.wav", 'wb')
        self.output_wave.setnchannels(1)
        self.output_wave.setsampwidth(self.audio.get_sample_size(FORMAT))
        self.output_wave.setframerate(SAMPLE_RATE)

    def __exit__(self, *err):
        self.close()

    def __del__(self):
        self.close()

    def list_input_devices(self):

        hosts = self.audio.get_host_api_count()
        device_count = self.audio.get_device_count()

        for index in range(hosts):
            host = self.audio.get_host_api_info_by_index(index)
            print("-----------------------")
            print(f"Host audio API {index}: {host['name']}")

            for d in range(device_count):
                device_info = self.audio.get_device_info_by_index(d)
                inputs = device_info['maxInputChannels']

                try:
                     if 1 <= inputs <= 2 and device_info['hostApi'] == index: # and \
                    #         self.audio.is_format_supported(rate=SAMPLE_RATE,
                    #                                        input_device=d,
                    #                                        input_format=FORMAT,
                    #                                        input_channels=1):
                        print(str(device_info))
                        print(f"  Device {d}: {device_info['name']}")
                        if self.device_index is None:
                            self.device_index = d
                            self.device_sample_rate = int(device_info['defaultSampleRate'])
                            print(f"Use Device {device_info['name']}")
                        elif self.device_index == d:
                            self.device_sample_rate = int(device_info['defaultSampleRate'])
                            print(f"Use Device {device_info['name']}")
                except ValueError:
                    # print(str(device_info))
                    pass



        # for index in range(device_count):
        #     device_info = self.audio.get_device_info_by_index(index)
        #
        #     try:
        #         if device_info['maxInputChannels'] == 1 and \
        #             device_info['hostApi'] == 0 and \
        #             self.audio.is_format_supported(rate=SAMPLE_RATE,
        #                                           input_device=index,
        #                                           input_format=FORMAT,
        #                                           input_channels=1):
        #             print(str(device_info))
        #             # print(f"Device {index}: {device_info['name']}")
        #     except ValueError:
        #         # print(str(device_info))
        #         pass



    def listen(self):
        while self.run:
            prev = time.time()
            if self.stream.is_active():
                start = time.time()
                # print("listening...")  # {:.3f}".format(time.time() - start))
                data = self.stream.read(self.frames_to_process)
                # print("got audio from the mic, {:.3f}".format(time.time() - start))
                prev = time.time()
                audio_data = np.frombuffer(data, np.int16).flatten().astype(np.float32) / 32768.0
              
                if self.device_sample_rate != SAMPLE_RATE:
                    audio_data = scipy.signal.resample_poly(audio_data, SAMPLE_RATE, self.device_sample_rate)
            
                    #audio_data = librosa.resample(audio_data, orig_sr=self.device_sample_rate, target_sr=SAMPLE_RATE)
                self.output_wave.writeframes((audio_data * 32768).astype(np.int16).tobytes())
                    
                
                
                yield audio_data
                # print("after yield, {:.3f}".format(time.time() - prev))
            else:
                print("break from microphone.listen()")
                break

    def start(self):
        self.run = True
        self.stream.start_stream()
        logger.debug("microphone started")

    def stop(self):
        logger.debug("microphone stopped")
        self.run = False
        self.stream.stop_stream()
        self.output_wave.close()

    def close(self):
        if not self.closed:
            self.stream.stop_stream()
            self.stream.close()
            self.output_wave.close()
            self.audio.terminate()
            self.closed = True

    def is_closed(self):
        return self.closed

In [2]:

def start_audio_process(audio_queue: Queue, audio_active_event: Event, device_index:int = None):
    mic = Microphone(device_index=device_index)

    if audio_active_event.is_set():
        mic.start()

    def process_record_audio():  # audio_queue: multiprocessing.Queue):
        while not mic.is_closed():
            if not audio_active_event.is_set():
                mic.stop()
                audio_active_event.wait()
                mic.start()

            for audio in mic.listen():
                if audio_active_event.is_set():
                    audio_queue.put(audio)
                    print("Wrote audio to queue")
                else:
                    break

    audio_process = Process(name="whisper_server audio", target=process_record_audio)  # , args=(audio_queue,))
    audio_process.start()
    
    async def stop():
        mic.stop()
        mic.close()
        audio_queue.close()
        audio_queue.cancel_join_thread()
        audio_process.terminate()
        audio_process.join()

    return stop


def start_audio_thread(audio_queue: Queue, audio_active_event: Event, device_index:int = None):
    mic = Microphone(device_index=device_index)

    if audio_active_event.is_set():
        mic.start()

    def run_audio_loop():
        while not mic.is_closed():
            if not audio_active_event.is_set():
                mic.stop()
                # print("run_audio_loop waiting for startRecognition()")
                audio_active_event.wait()
                if mic.is_closed():
                    break
                mic.start()

            for audio in mic.listen():
                if audio_active_event.is_set():
                    audio_queue.put(audio)
                    print("Wrote audio to queue")
                else:
                    break

    audio_thread = Thread(name="whisper_server audio", target=run_audio_loop)
    audio_thread.start()

    async def stop():
        mic.stop()
        mic.close()
        audio_active_event.clear()
        audio_queue.close()
        audio_queue.cancel_join_thread()
        audio_thread.join()

    return stop

In [3]:
def signal_handler(interrupted_event):
    interrupted_event.set()

In [4]:
def filter_results(self, hypothesis: Segment):
        # Ignore no_speech_prob >= 0.75
        # avg_logprob:
        #  >= -0.3 : good
        #     -0.4 : close
        #  >  -0.8 : clear speech mis-recognised or
        #  <= -0.8 : mis-pronounced
        #  <  -1.0 : is mumbled/hard to hear
        # print('avg_logprob: {:.3f}, no_speech_prob: {:.3f}'.format(hypothesis.avg_logprob, hypothesis.no_speech_prob))
        logger.debug('avg_logprob: %.3f, no_speech_prob: %.3f', hypothesis.avg_logprob, hypothesis.no_speech_prob)
        return hypothesis.no_speech_prob < 0.75 and hypothesis.avg_logprob > -1.0

def run_whisper_loop(args, _audio_queue: Queue, _stt_results_queue: Queue, interrupted_event: Event):
  
    model = WhisperModel("../quantized_models/whisper-large-v2", device="cpu", compute_type="int8")
    signal.signal(signal.SIGINT, lambda sig, frame: signal_handler(interrupted_event))

    while True:
        try:
            audio = _audio_queue.get()
            # if audio == "stop":
            #     print("process_speech_to_text received 'stop'")
            #     break
            # logger.debug("sending audio to Whisper...")
            segments, _ = model.transcribe(audio, word_timestamps=False, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=1000))
            alternatives = list(segments)
            print(f"Whisper returned {alternatives}")
            #alternatives = filter(filter_results, segments)
            if len(alternatives) != 0:
                logger.info("Whisper recognised: %s", alternatives)
                _stt_results_queue.put(alternatives)
        except ValueError:
            print("  ValueError in reading from audio_queue")
            break
        except queue.Empty:
            print("empty audio queue")
            break
        except (KeyboardInterrupt, InterruptedError, SystemExit):
            _audio_queue.close()
            _audio_queue.cancel_join_thread()
            _stt_results_queue.close()
            _stt_results_queue.cancel_join_thread()
            print("---------- whisper process interrupted by keyboard ---------")
            break
        
def start_whisper_process(args, audio_queue: Queue, stt_results_queue: Queue, interrupted_event: Event):
    # interrupted_event = Event()
    whisper_process = Process(name="whisper_server speech to text",
                              target=run_whisper_loop,
                              args=(args, audio_queue, stt_results_queue, interrupted_event),
                              # daemon=True
                              )
    whisper_process.start()

    async def stop():
        # terminate is more graceful than kill (abort)
        # audio_queue.put("stop")
        audio_queue.close()
        audio_queue.cancel_join_thread()
        stt_results_queue.close()
        stt_results_queue.cancel_join_thread()
        whisper_process.terminate()
        whisper_process.join()

    return stop


def start_whisper_thread(args, audio_queue: Queue, stt_results_queue: Queue, interrupted_event: Event):
    whisper_thread = Thread(name="whisper_server speech to text",
                            target=run_whisper_loop,
                            args=(args, audio_queue, stt_results_queue, interrupted_event),)
    whisper_thread.start()

    async def stop():
        # terminate is more graceful than kill (abort)
        # audio_queue.put("stop")
        audio_queue.close()
        audio_queue.cancel_join_thread()
        stt_results_queue.close()
        stt_results_queue.cancel_join_thread()
        whisper_thread.join()

    return stop

In [None]:
interrupted_event = Event()
audio_active_event = Event()
audio_active_event.set()


audio_queue = Queue()
stt_results_queue = Queue()

audio_stop = start_audio_process(audio_queue=audio_queue, audio_active_event=audio_active_event, device_index=13)
whisper_stop = start_whisper_process(args=None, audio_queue=audio_queue, stt_results_queue=stt_results_queue, interrupted_event=interrupted_event)


-----------------------
Host audio API 0: ALSA
{'index': 0, 'structVersion': 2, 'name': 'HDA Intel PCH: ALC3235 Analog (hw:0,0)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.008707482993197279, 'defaultLowOutputLatency': -1.0, 'defaultHighInputLatency': 0.034829931972789115, 'defaultHighOutputLatency': -1.0, 'defaultSampleRate': 44100.0}
  Device 0: HDA Intel PCH: ALC3235 Analog (hw:0,0)
{'index': 13, 'structVersion': 2, 'name': 'HD Pro Webcam C920: USB Audio (hw:2,0)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.012, 'defaultLowOutputLatency': -1.0, 'defaultHighInputLatency': 0.048, 'defaultHighOutputLatency': -1.0, 'defaultSampleRate': 32000.0}
  Device 13: HD Pro Webcam C920: USB Audio (hw:2,0)
Use Device HD Pro Webcam C920: USB Audio (hw:2,0)
-----------------------
Host audio API 1: OSS
Use device index: 13 with sample rate 32000


ALSA lib pcm_dmix.c:1089:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:869:(find_matching_chmap) Found no matching channel map
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
ALSA

Wrote audio to queue
Wrote audio to queue
Whisper returned [Segment(id=1, seek=1108, start=3.57, end=13.92, text=' Schauen wir mal, jetzt sollten die Abstände etwas länger werden, die Audiosachen sollten noch seltener geschrieben werden.', tokens=[50364, 2065, 11715, 1987, 2806, 11, 4354, 29096, 978, 2847, 16913, 68, 9569, 40935, 4604, 11, 978, 8821, 2717, 11646, 29096, 3514, 5851, 1147, 260, 47397, 4604, 13, 50864], temperature=0.0, avg_logprob=-0.45756151477495827, compression_ratio=1.203883495145631, no_speech_prob=0.031490862369537354, words=None)]
Wrote audio to queue
Whisper returned [Segment(id=1, seek=1500, start=0.0, end=5.04, text=' Größenordnungen seltener. Dafür habe ich die Hoffnung, dass vielleicht die Qualität besser', tokens=[50364, 45778, 8989, 765, 77, 5084, 5851, 1147, 260, 13, 35865, 6015, 1893, 978, 29135, 15539, 11, 2658, 12547, 978, 13616, 14053, 18021, 50616], temperature=0.0, avg_logprob=-0.29906214200533354, compression_ratio=1.3224043715846994, no_speech_prob

In [7]:
print(audio_queue.get(block=False))

Empty: 

In [None]:
print(stt_results_queue.get(block=False))

Empty: 

In [8]:
await whisper_stop()

In [None]:
await audio_stop()

Exception in thread whisper_server audio:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
Expression 'alsa_snd_pcm_poll_descriptors_revents( self->pcm, pfds, self->nfds, &revents )' failed in 'src/hostapi/alsa/pa_linux_alsa.c', line: 3664
Expression 'PaAlsaStreamComponent_EndPolling( &self->capture, capturePfds, &pollCapture, &xrun )' failed in 'src/hostapi/alsa/pa_linux_alsa.c', line: 3891
Expression 'PaAlsaStream_WaitForFrames( stream, &framesAvail, &xrun )' failed in 'src/hostapi/alsa/pa_linux_alsa.c', line: 4438
    self.run()
  File "/usr/local/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_13729/2299478538.py", line 49, in run_audio_loop
  File "/tmp/ipykernel_13729/1389239608.py", line 106, in listen
  File "/home/ruzickal/Code/Privat/Whisper/whisper_venv/lib/python3.10/site-packages/pyaudio/__init__.py", line 570, in read
    return pa.read_str