In [None]:
!pip install vosk
!pip install sounddevice

In [None]:
import json
import queue
import statistics
import sys
import threading

import numpy as np
import scipy
import sounddevice as sd
import vosk

class RealTimeSpeechRecognition:
    """Stream microphone audio into a Vosk recognizer and print recognized text.

    ``start_recognition`` opens a ``sounddevice`` input stream whose callback
    (``audio_callback``) pushes raw audio bytes onto a queue. A background
    worker (``recognition_thread``) pulls those bytes off the queue and feeds
    them to a ``vosk.KaldiRecognizer``, printing each non-empty result.
    """

    def __init__(self, model_path, sample_rate=16000, channels=1):
        """Load the Vosk model and prepare the queue and run flag.

        Args:
            model_path: Path passed to ``vosk.Model``.
            sample_rate: Sample rate given to both the recognizer and the
                input stream.
            channels: Number of input channels requested from the stream.

        Note:
            Any exception raised during setup is printed and the process is
            terminated with ``sys.exit(1)`` (raises ``SystemExit``).
        """
        try:
            # Initialize Vosk model
            self.model = vosk.Model(model_path)
            self.sample_rate = sample_rate
            self.channels = channels

            # Create recognizer instance
            self.recognizer = vosk.KaldiRecognizer(self.model, sample_rate)

            # Queue that hands audio from the stream callback to the worker
            self.audio_queue = queue.Queue()

            # Set while recognition should keep running; cleared to stop
            self.is_running = threading.Event()

        except Exception as e:
            print(f"Error initializing speech recognition: {e}")
            sys.exit(1)

    def audio_callback(self, indata, frames, time, status):
        """Queue one block of captured audio as raw int16 bytes.

        Passed as ``callback`` to ``sd.InputStream``; the parameter list is
        the one that callback is invoked with.

        Args:
            indata: Array holding the captured audio block.
            frames: Unused here.
            time: Unused here.
            status: Printed when truthy.
        """
        if status:
            print(status)

        # Convert numpy array to bytes (a copy, so the queue never holds a
        # reference to the stream's own buffer)
        audio_bytes = indata.astype(np.int16).tobytes()
        self.audio_queue.put(audio_bytes)

    def _print_text(self, raw_result):
        """Print the ``text`` field of a recognizer JSON result if non-empty."""
        text = json.loads(raw_result).get('text', '').strip()
        if text:
            print("Recognized Text:", text)

    def _flush_pending(self):
        """Feed any still-queued audio to the recognizer and print the final result.

        Called once the worker loop ends so that words spoken just before
        stopping are not silently dropped.
        """
        try:
            while True:
                try:
                    audio_bytes = self.audio_queue.get_nowait()
                except queue.Empty:
                    break
                if self.recognizer.AcceptWaveform(audio_bytes):
                    self._print_text(self.recognizer.Result())

            # FinalResult returns whatever the recognizer has buffered since
            # the last completed utterance.
            self._print_text(self.recognizer.FinalResult())
        except Exception as e:
            print(f"Recognition error: {e}")

    def recognition_thread(self):
        """Worker loop: consume queued audio until ``is_running`` is cleared.

        Each queue read waits at most one second so the loop notices a cleared
        flag promptly. Errors from the recognizer are printed and the loop
        continues. On exit, pending audio is flushed via ``_flush_pending``.
        """
        while self.is_running.is_set():
            try:
                # Get audio data from queue
                audio_bytes = self.audio_queue.get(timeout=1)

                # Recognize speech
                if self.recognizer.AcceptWaveform(audio_bytes):
                    self._print_text(self.recognizer.Result())

            except queue.Empty:
                continue
            except Exception as e:
                print(f"Recognition error: {e}")

        self._flush_pending()

    def start_recognition(self):
        """Run live recognition until interrupted or ``is_running`` is cleared.

        Starts the worker thread, opens the input stream and blocks the
        calling thread. ``KeyboardInterrupt`` and other exceptions are
        reported via ``print``; in every case the run flag is cleared and the
        worker is given a bounded amount of time to finish.
        """
        rec_thread = None
        try:
            # Set running flag
            self.is_running.set()

            # Start recognition thread
            rec_thread = threading.Thread(target=self.recognition_thread)
            rec_thread.daemon = True
            rec_thread.start()

            # Start audio input stream
            with sd.InputStream(
                callback=self.audio_callback,
                channels=self.channels,
                samplerate=self.sample_rate,
                dtype='int16'
            ):
                print("Listening... Press Ctrl+C to stop")

                # Keep main thread running
                while self.is_running.is_set():
                    sd.sleep(100)

        except KeyboardInterrupt:
            print("\nStopping speech recognition...")
        except Exception as e:
            print(f"Error during recognition: {e}")
        finally:
            # Stop recognition
            self.is_running.clear()

            # Wait for the worker so a later start_recognition() call cannot
            # overlap with a still-running consumer of the same recognizer.
            if rec_thread is not None and rec_thread.is_alive():
                rec_thread.join(timeout=5)

def main(model_path=None):
    """Run the live speech-recognition demo.

    Args:
        model_path: Directory of the Vosk model to load. When omitted, the
            notebook's original hard-coded location is used.
    """
    if model_path is None:
        # Default model location; pass model_path to use a different one
        model_path = "F:\\offline_stt\\vosk-model-small-en-us-0.15\\vosk-model-small-en-us-0.15"

    # Create and start recognition
    stt = RealTimeSpeechRecognition(model_path)
    stt.start_recognition()

# Start the live recognition demo only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()

Listening... Press Ctrl+C to stop
Recognized Text: mommy i this if i have heard the healthy meal settlement with it
Recognized Text: can you hear me
Recognized Text: okay ours is it's very a light or something
Recognized Text: how k
Recognized Text: stop
Recognized Text: how this going on
Recognized Text: hey
Recognized Text: how to solve this

Stopping speech recognition...


In [11]:
import scipy
import statistics

In [12]:
import vosk
import sounddevice as sd
import numpy as np
import json
import time
import statistics

class STTEvaluator:
    """Measure Vosk recognition time and word-level accuracy on WAV files.

    Each call to ``test_recognition`` records its elapsed time and accuracy
    (``1 - WER``); ``get_overall_performance`` summarizes everything recorded
    so far.
    """

    def __init__(self, model_path, sample_rate=16000):
        """Load the model and create the recognizer.

        Args:
            model_path: Path passed to ``vosk.Model``.
            sample_rate: Sample rate given to ``vosk.KaldiRecognizer``.
        """
        # Initialize Vosk model
        self.model = vosk.Model(model_path)
        self.recognizer = vosk.KaldiRecognizer(self.model, sample_rate)

        # Evaluation metrics, one entry per test_recognition() call
        self.recognition_times = []
        self.accuracy_scores = []

    def test_recognition(self, audio_file, ground_truth):
        """
        Evaluate speech recognition on a specific audio file

        Args:
        audio_file (str): Path to audio file
        ground_truth (str): Correct transcription

        Returns:
        dict: Evaluation metrics with keys 'recognized_text',
        'ground_truth', 'recognition_time', 'word_error_rate', 'accuracy'
        """
        # Load audio file (not included in the timed section)
        audio_data = self._load_audio(audio_file)

        # Start timing; perf_counter is unaffected by system clock changes
        start_time = time.perf_counter()

        # Recognize speech
        segments = []
        for chunk in self._audio_chunks(audio_data):
            if self.recognizer.AcceptWaveform(chunk):
                result = json.loads(self.recognizer.Result())
                segments.append(result.get('text', ''))

        # Collect whatever the recognizer still holds after the last chunk.
        # Without this, audio following the last completed utterance is lost
        # and would carry over into the next file.
        final = json.loads(self.recognizer.FinalResult())
        segments.append(final.get('text', ''))

        # Final result (empty segments skipped to avoid stray spaces)
        final_result = ' '.join(segment for segment in segments if segment)

        # Calculate metrics
        recognition_time = time.perf_counter() - start_time
        wer = self._calculate_wer(ground_truth, final_result)

        # Store metrics
        self.recognition_times.append(recognition_time)
        self.accuracy_scores.append(1 - wer)

        return {
            'recognized_text': final_result,
            'ground_truth': ground_truth,
            'recognition_time': recognition_time,
            'word_error_rate': wer,
            'accuracy': 1 - wer
        }

    def _load_audio(self, file_path, dtype=np.int16):
        """
        Load audio file

        Multi-channel audio is averaged down to a single channel so the
        recognizer is not fed interleaved samples.

        Args:
        file_path (str): Path to audio file
        dtype (numpy dtype): Data type for audio

        Returns:
        numpy.ndarray: One-dimensional audio data
        """
        from scipy.io import wavfile
        # NOTE(review): the file's sample rate is not compared with the rate
        # the recognizer was created with - confirm inputs match.
        _file_rate, audio = wavfile.read(file_path)

        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        return audio.astype(dtype)

    def _audio_chunks(self, audio_data, chunk_size=16000):
        """
        Split audio into processable chunks

        Args:
        audio_data (numpy.ndarray): Full audio data
        chunk_size (int): Number of array elements per chunk

        Yields:
        bytes: Audio chunks
        """
        for i in range(0, len(audio_data), chunk_size):
            chunk = audio_data[i:i+chunk_size]
            yield chunk.tobytes()

    def _calculate_wer(self, ground_truth, recognized_text):
        """
        Calculate Word Error Rate

        The word-level edit distance is divided by the number of reference
        words. For an empty reference the divisor is 1, so the result is the
        number of recognized words (0.0 when both are empty).

        Args:
        ground_truth (str): Correct text
        recognized_text (str): Recognized text

        Returns:
        float: Word Error Rate
        """
        ground_words = ground_truth.lower().split()
        recognized_words = recognized_text.lower().split()

        def levenshtein_distance(s1, s2):
            m, n = len(s1), len(s2)
            dp = [[0] * (n + 1) for _ in range(m + 1)]

            for i in range(m + 1):
                dp[i][0] = i
            for j in range(n + 1):
                dp[0][j] = j

            for i in range(1, m + 1):
                for j in range(1, n + 1):
                    if s1[i-1] == s2[j-1]:
                        dp[i][j] = dp[i-1][j-1]
                    else:
                        dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])

            return dp[m][n]

        edit_distance = levenshtein_distance(ground_words, recognized_words)
        # max(..., 1) avoids ZeroDivisionError on an empty reference
        return edit_distance / max(len(ground_words), 1)

    def get_overall_performance(self):
        """
        Get overall performance metrics

        Returns:
        dict: Overall evaluation metrics. Standard deviations are 0.0 when
        only one test has been recorded.

        Raises:
        statistics.StatisticsError: If no test has been recorded yet.
        """
        def spread(values):
            # statistics.stdev needs at least two data points
            return statistics.stdev(values) if len(values) > 1 else 0.0

        return {
            'average_recognition_time': statistics.mean(self.recognition_times),
            'average_accuracy': statistics.mean(self.accuracy_scores),
            'recognition_time_std': spread(self.recognition_times),
            'accuracy_std': spread(self.accuracy_scores)
        }

def main():
    # Configuration
    MODEL_PATH = "F:\\offline_stt\\vosk-model-small-en-us-0.15\\vosk-model-small-en-us-0.15"
    
    # Test audio files and ground truths
    test_cases = [
        {
            'audio_file': 'test1.wav',
            'ground_truth': 'hello world how are you'
        },
        {
            'audio_file': 'test2.wav',
            'ground_truth': 'python speech recognition is working'
        }
    ]
    
    # Initialize evaluator
    evaluator = STTEvaluator(MODEL_PATH)
    
    # Run evaluations
    results = []
    for case in test_cases:
        result = evaluator.test_recognition(
            case['audio_file'], 
            case['ground_truth']
        )
        results.append(result)
        print(f"Test Result: {result}")
    
    # Overall performance
    performance = evaluator.get_overall_performance()
    print("\nOverall Performance:")
    for metric, value in performance.items():
        print(f"{metric}: {value}")

# Run the evaluation only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'test1.wav'