```bash
ferganey@ferganey-linux:/media/ferganey/Data1/03_Projects/Final_Learning_AI-Projects$ source /home/ferganey/spechmdl/bin/activate

(spechmdl) ferganey@ferganey-linux:/media/ferganey/Data1/03_Projects/Final_Learning_AI-Projects$ python -m ipykernel install --user --name=spechmdl --display-name "Python (spechmdl)"

jupyter notebook
```

In [None]:
!pip install sounddevice

In [None]:
!pip install scipy


In [None]:
import whisper
import torch
import sounddevice as sd
import numpy as np
import queue
from threading import Thread
import time
import wave


class WhisperModel:
    """Loads the Whisper "base" model and transcribes audio arrays.

    The model is loaded onto CUDA when ``torch.cuda.is_available()`` is true,
    otherwise onto the CPU.
    """

    def __init__(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        self.model = whisper.load_model("base", device=device)

    def transcribe(self, audio_data):
        """Normalize ``audio_data``, transcribe it, and print the text.

        Args:
            audio_data: NumPy array of audio samples; singleton dimensions
                are squeezed away before transcription.

        Returns:
            The transcribed text (the same string that is printed).
        """
        audio_data = self.preprocess_audio(audio_data)
        result = self.model.transcribe(audio_data)
        print("Transcription:", result["text"])
        return result["text"]

    @staticmethod
    def preprocess_audio(audio_data):
        """Squeeze the array and peak-normalize it to the range [-1, 1].

        Args:
            audio_data: Array-like of audio samples.

        Returns:
            The squeezed array divided by its absolute peak. An empty array
            or an all-zero (silent) array is returned unscaled.
        """
        audio_data = np.squeeze(audio_data)  # Remove unnecessary dimensions
        # np.max raises on an empty array, and dividing by a zero peak would
        # turn a silent block into NaNs, so only scale when there is signal.
        peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak  # Normalize to [-1, 1]
        return audio_data


class AudioProcessor:
    """Buffers captured audio blocks in a queue and feeds them to a transcriber."""

    def __init__(self, whisper_model):
        """Store the transcriber and create an empty chunk queue.

        Args:
            whisper_model: Object exposing ``transcribe(audio_chunk)``.
        """
        self.whisper_model = whisper_model
        self.queue = queue.Queue()

    def audio_callback(self, indata, frames, time, status):
        """Callback function for sounddevice.

        Args:
            indata: Array holding the captured block; a copy is queued.
            frames: Unused here.
            time: Unused here.
            status: Printed when truthy.
        """
        if status:
            print(f"Audio callback status: {status}")
        if self.queue.qsize() < 10:  # Avoid excessive queue growth
            self.queue.put(indata.copy())

    def save_audio_debug(self, audio_data, filename="debug_audio.wav"):
        """Save audio data as a mono, 16-bit, 16 kHz WAV file for debugging.

        Args:
            audio_data: Float samples nominally in [-1, 1].
            filename: Destination path of the WAV file.
        """
        # Clip before scaling: a sample outside [-1, 1] would exceed the int16
        # range once multiplied by 32767 and wrap around.
        pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
        with wave.open(filename, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 2 bytes per sample
            wf.setframerate(16000)
            wf.writeframes(pcm.tobytes())

    def process_queue(self):
        """Processes audio chunks from the queue; never returns."""
        while True:
            # Blocking get() parks the thread until a chunk arrives instead of
            # spinning on queue.empty() at full CPU.
            audio_chunk = self.queue.get()
            self.save_audio_debug(audio_chunk)  # Save audio for debugging
            self.whisper_model.transcribe(audio_chunk)


def main(device=12):
    """Main function to set up and run the application.

    Loads the Whisper model, opens a 16 kHz mono float32 input stream, starts
    a daemon thread that drains the audio queue, and then idles until
    interrupted or until an error occurs.

    Args:
        device: Value passed as ``device=`` to ``sd.InputStream``. Defaults to
            12, the index this notebook was written against; check the device
            list printed at start-up for the right value on another machine.
    """
    whisper_model = WhisperModel()  # Load the Whisper model
    audio_processor = AudioProcessor(whisper_model)  # Set up audio processing

    try:
        print("Available devices:")
        print(sd.query_devices())  # List audio devices for debugging

        # Set up the audio input stream
        with sd.InputStream(
            device=device,
            samplerate=16000,
            blocksize=1024,
            dtype="float32",
            channels=1,
            callback=audio_processor.audio_callback,
        ):
            print("Listening for audio input...")
            # Start a background thread to process audio data
            Thread(target=audio_processor.process_queue, daemon=True).start()

            # Keep the main thread alive
            while True:
                time.sleep(0.1)

    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass, so without this
        # branch an interrupt would surface as a traceback.
        print("Stopped by user.")
    except sd.PortAudioError as e:
        print(f"Audio stream error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


if __name__ == "__main__":
    main()


In [None]:
import whisper
import torch
import sounddevice as sd
import numpy as np
import queue
from threading import Thread
import wave
import time


class WhisperModel:
    """Handles loading and transcription with the Whisper Model."""

    def __init__(self):
        # Force CPU usage for compatibility
        device = "cpu"
        print(f"Using device: {device}")
        self.model = whisper.load_model("base", device=device)

    def transcribe(self, audio_data):
        """Transcribes the audio data using Whisper.

        Args:
            audio_data: NumPy array of audio samples, or None.

        Returns:
            The transcribed text, or None when the input is missing/empty or
            transcription raised (the error is printed, not re-raised).
        """
        try:
            if audio_data is None or audio_data.size == 0:
                print("Empty audio received, skipping transcription.")
                return None
            # Log size and first few samples for debug
            print(f"Transcribing chunk: {audio_data[:10]}")
            result = self.model.transcribe(audio_data)
            print("Transcription:", result["text"])
            return result["text"]
        except Exception as e:
            print("Error during transcription:", e)
            return None

    def preprocess_audio(self, audio_data):
        """Clip audio samples to [-1.0, 1.0].

        Args:
            audio_data: NumPy array of audio samples, or None.

        Returns:
            The clipped array, or None when the input is None or empty.
        """
        # Validate first: slicing for the debug log below would raise on None.
        if audio_data is None or audio_data.size == 0:
            print("Invalid audio data.")
            return None
        # Log raw audio for debugging purposes
        print(f"Preprocessing audio: {audio_data[:10]}")
        # Clip/normalize just enough
        return np.clip(audio_data, -1.0, 1.0)


# Module-level queue of captured audio chunks: audio_callback() puts chunks
# on it and process_queue() takes them off for transcription.
q = queue.Queue()


def save_audio_debug(audio_data, filename="debug_audio.wav"):
    """Saves audio to a mono, 16-bit, 16 kHz WAV file for debugging purposes.

    Args:
        audio_data: Float samples nominally in [-1, 1].
        filename: Destination path of the WAV file.

    Any failure is printed rather than raised, so debugging output never
    interrupts the caller.
    """
    try:
        print(f"Saving audio with length {len(audio_data)}")
        # Clip before scaling: a sample outside [-1, 1] would exceed the int16
        # range once multiplied by 32767 and wrap around.
        pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
        with wave.open(filename, "wb") as wf:
            wf.setnchannels(1)  # Mono
            wf.setsampwidth(2)  # 2 bytes for 16-bit PCM
            wf.setframerate(16000)  # Whisper's expected 16kHz
            wf.writeframes(pcm.tobytes())
        print(f"Saved audio for debugging to {filename}")
    except Exception as e:
        print(f"Failed to save debug audio: {e}")


def audio_callback(indata, frames, time, status):
    """
    Stream callback: queue the first channel of a block when it is loud enough.

    A truthy ``status`` is printed. The block is queued (as a copy) only when
    its absolute peak is above 0.01; quieter blocks are dropped.
    """
    if status:
        print(f"Audio stream status: {status}")

    mono = indata[:, 0]  # first (only) channel
    peak = np.max(np.abs(mono))
    if not peak > 0.01:
        # Below the signal threshold: nothing to queue.
        return

    q.put(mono.copy())
    print(f"Audio chunk queued: {mono[:10]}")


def process_queue(whisper_model):
    """
    Worker loop: take chunks off the shared queue, save each one for
    debugging, and hand it to ``whisper_model.transcribe``.

    Never returns; intended to run in its own thread.
    """
    while True:
        if q.empty():
            time.sleep(0.1)  # Nothing queued yet; back off briefly
            continue

        chunk = q.get()
        level = np.max(np.abs(chunk))
        print(f"Processing chunk, max level {level}")
        save_audio_debug(chunk)  # Keep a copy on disk for inspection
        whisper_model.transcribe(chunk)


def main(device=12):
    """
    Main application function.
    Sets up Whisper model, audio input, and queue processing.

    Args:
        device: Value passed as ``device=`` to ``sd.InputStream``. Defaults to
            12, the index this notebook was written against; check the device
            list printed at start-up for the right value on another machine.
    """
    whisper_model = WhisperModel()

    try:
        # List all available audio input devices
        print("Available audio devices:")
        print(sd.query_devices())

        # Select the correct device index
        selected_device = device
        print(f"Using device index: {selected_device}")

        # Open InputStream with the selected device
        with sd.InputStream(
            device=selected_device,
            samplerate=16000,
            blocksize=1024,
            dtype="float32",
            channels=1,
            callback=audio_callback,
        ):
            print("Listening for audio input...")
            # Start queue processing in a separate thread
            Thread(target=process_queue, args=(whisper_model,), daemon=True).start()

            # Keep the application running
            while True:
                time.sleep(0.1)
    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass, so without this
        # branch an interrupt would surface as a traceback.
        print("Stopped by user.")
    except sd.PortAudioError as e:
        print(f"Audio stream error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


if __name__ == "__main__":
    main()


In [None]:
import whisper
import sounddevice as sd
import numpy as np
import queue
from threading import Thread
import wave
import time


class WhisperModelHandler:
    """Handles loading and transcription with the Whisper Model."""

    def __init__(self):
        # Force CPU usage for compatibility
        device = "cpu"
        print(f"Using device: {device}")
        self.model = whisper.load_model("base", device=device)

    def transcribe(self, audio_data):
        """Transcribes the audio data using Whisper.

        Args:
            audio_data: NumPy array of audio samples, or None.

        Returns:
            The transcribed text, or None when the input is missing/empty or
            transcription raised (the error is printed, not re-raised).
        """
        try:
            if audio_data is None or audio_data.size == 0:
                print("Empty audio received, skipping transcription.")
                return None
            # Log size and first few samples for debug
            print(f"Transcribing chunk: {audio_data[:10]}")
            result = self.model.transcribe(audio_data)
            print("Transcription:", result["text"])
            return result["text"]
        except Exception as e:
            print("Error during transcription:", e)
            return None

    def preprocess_audio(self, audio_data):
        """Clip audio samples to [-1.0, 1.0].

        Args:
            audio_data: NumPy array of audio samples, or None.

        Returns:
            The clipped array, or None when the input is None or empty.
        """
        # Validate first: slicing for the debug log below would raise on None.
        if audio_data is None or audio_data.size == 0:
            print("Invalid audio data.")
            return None
        # Log raw audio for debugging purposes
        print(f"Preprocessing audio: {audio_data[:10]}")
        # Clip/normalize just enough
        return np.clip(audio_data, -1.0, 1.0)


class VoiceCapture:
    """Captures live audio and pushes sufficiently loud blocks onto a queue."""

    def __init__(self):
        self.q = queue.Queue()

    def audio_callback(self, indata, frames, time, status):
        """
        Stream callback: queue the first channel of a block when it is loud enough.

        A truthy ``status`` is printed. The block is queued (as a copy) only
        when its absolute peak is above 0.01; quieter blocks are dropped.
        """
        if status:
            print(f"Audio stream status: {status}")

        mono = indata[:, 0]  # first (only) channel
        peak = np.max(np.abs(mono))
        if not peak > 0.01:
            # Below the signal threshold: nothing to queue.
            return

        self.q.put(mono.copy())
        print(f"Audio chunk queued: {mono[:10]}")

    def stream_audio(self, device_index=12):
        """
        Open the input stream on ``device_index`` and block while it runs.

        Prints the device list first. PortAudio errors and any other
        exception are reported with a message instead of being raised.
        """
        stream_options = dict(
            device=device_index,
            samplerate=16000,
            blocksize=1024,
            dtype="float32",
            channels=1,
            callback=self.audio_callback,
        )
        try:
            print("Available audio devices:")
            print(sd.query_devices())

            with sd.InputStream(**stream_options):
                print("Listening for audio input...")
                # Idle here; the callback does the work while the stream is open.
                while True:
                    time.sleep(0.1)
        except sd.PortAudioError as stream_err:
            print(f"Audio stream error: {stream_err}")
        except Exception as err:
            print(f"Unexpected error: {err}")

    def get_audio_queue(self):
        """Return the queue that receives captured audio blocks."""
        return self.q


class Controller:
    """Wires a VoiceCapture to a WhisperModelHandler through a shared queue."""

    def __init__(self):
        self.voice_capture = VoiceCapture()
        self.whisper_handler = WhisperModelHandler()
        self.q = self.voice_capture.get_audio_queue()

    def process_queue(self):
        """
        Worker loop: take chunks off the queue and pass each one to the
        Whisper handler for transcription.

        Never returns; intended to run in its own thread.
        """
        while True:
            if self.q.empty():
                time.sleep(0.1)  # Nothing queued yet; back off briefly
                continue

            chunk = self.q.get()
            level = np.max(np.abs(chunk))
            print(f"Processing chunk, max level {level}")
            self.whisper_handler.transcribe(chunk)

    def start(self):
        """
        Launch the queue worker as a daemon thread, then run audio capture
        in the calling thread.
        """
        worker = Thread(target=self.process_queue, daemon=True)
        worker.start()
        self.voice_capture.stream_audio()


def main():
    """
    Entry point: build a Controller and start it.
    """
    Controller().start()


if __name__ == "__main__":
    main()


In [4]:
!source /home/ferganey/spechmdl/bin/activate && python /media/ferganey/Data1/03_Projects/Final_Learning_AI-Projects/02_DeepLearning_Projects/00_Speech_Recognition_RasPi_ContainerzedApp/03_Layered_Arch/main.py


Using device: cpu
  checkpoint = torch.load(fp, map_location=device)
Application is running...
Listening for live audio...
Buffered audio chunk: [-0.64166677 -1.0914617  -0.95857143 -1.0158346  -0.99948746 -0.992232
 -1.010731   -0.9899608  -1.0073634  -0.9958534 ]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
Buffered audio chunk: [-5.9604645e-08 -5.9604645e-08 -5.9604645e-08 -5.9604645e-08
 -5.9604645e-08 -5.9604645e-08 -5.9604645e-08 -5.9604645e-08
 -5.9604645e-08 -5.9604645e-08]
Buffered audio chunk: [ 0.00492023 -0.00356422  0.00223093 -0.00118243  0.00071325 -0.00097385
  0.00192995 -0