Testing live microphone transcription with the OpenVINO GenAI Whisper pipeline

In [None]:
import os
import openvino
import openvino_genai
import torch
import whisper
import numpy as np
import speech_recognition as sr

from cmd_helper import optimum_cli
from datetime import datetime, timedelta
from pathlib import Path
from queue import Queue
from time import sleep

# --- Tunable settings (plain variables stand in for command-line arguments) ---
phrase_timeout = 3.0      # Seconds between audio batches after which a new line is started
record_timeout = 2.0      # Per-phrase recording limit, in seconds, for the background listener
energy_threshold = 1000   # Energy level for mic detection
non_english = False       # Use non-English model if True (not read anywhere in this cell)

# --- Where the model comes from and where its exported copy lives locally ---
model_value, model_dir = "openai/whisper-medium", "whisper-medium"

# --- State shared with the transcription loop ---
transcription = ['']      # Lines recognised so far; the last entry is the one being updated
phrase_time = None        # Time the previous audio batch was handled (None until the first one)
data_queue = Queue()      # Raw audio bytes handed over by the recording callback

# Recogniser with a fixed energy threshold (dynamic adjustment switched off).
recorder = sr.Recognizer()
recorder.dynamic_energy_threshold = False
recorder.energy_threshold = energy_threshold

# Microphone source sampled at 16 kHz.
source = sr.Microphone(sample_rate=16000)

# Export the Whisper model only when the local directory is not there yet.
if not Path(model_dir).exists():
    optimum_cli(model_value, model_dir)

# Load the exported model on the CPU device.
ov_pipeline = openvino_genai.WhisperPipeline(model_dir, device='CPU')

# Adjust the recogniser for ambient noise before recording starts.
with source:
    recorder.adjust_for_ambient_noise(source)

def record_callback(_, audio: sr.AudioData) -> None:
    """Push the raw bytes of a recorded audio segment onto ``data_queue``.

    Used as the callback for the background listener; the first positional
    argument is ignored.
    """
    data_queue.put(audio.get_raw_data())

# Start background recording. The returned function stops the listener; it is
# kept so the microphone thread can be shut down when the loop exits instead of
# running on in the notebook kernel.
stop_listening = recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
print("Model loaded and microphone initialized.\n")

try:
    while True:
        now = datetime.utcnow()
        if not data_queue.empty():
            # A gap longer than phrase_timeout since the previous batch means
            # the speaker paused, so the next text starts a new line.
            phrase_complete = (
                phrase_time is not None
                and now - phrase_time > timedelta(seconds=phrase_timeout)
            )
            phrase_time = now

            # Take everything queued so far. Join and clear happen under the
            # queue's own lock so a chunk put by the recording thread between
            # the two steps cannot be discarded untranscribed.
            with data_queue.mutex:
                audio_data = b''.join(data_queue.queue)
                data_queue.queue.clear()

            # 16-bit PCM bytes -> float32 samples scaled by 1/32768.
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            print(f"Received {len(audio_np)} samples for transcription")

            result = ov_pipeline.generate(audio_np, task="transcribe", return_timestamps=True)

            # `chunks` is a list of chunk objects, not a dict: indexing it with
            # 'text' raised "TypeError: list indices must be integers or
            # slices, not str". The recognised text is the concatenation of
            # each chunk's `.text`.
            chunks = result.chunks or []
            text = ''.join(chunk.text for chunk in chunks).strip()

            if phrase_complete:
                transcription.append(text)
            else:
                transcription[-1] = text

            # Redraw the whole transcription.
            os.system('cls' if os.name == 'nt' else 'clear')
            for line in transcription:
                print(line)
            print('', end='', flush=True)
        else:
            # Nothing recorded yet; avoid spinning.
            sleep(0.25)
except KeyboardInterrupt:
    print("\nTranscription stopped by user.")
    print("\nFinal Transcription:")
    for line in transcription:
        print(line)
finally:
    # Stop the background listener without blocking on the thread.
    stop_listening(wait_for_stop=False)

Model loaded and microphone initialized.

Received 101376 samples for transcription
[<openvino_genai.py_openvino_genai.WhisperDecodedResultChunk object at 0x345593e70>]


TypeError: list indices must be integers or slices, not str

Fuzzy matching to find the closest-matching lyric line in the current verse

In [None]:
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch
from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from difflib import SequenceMatcher

# Reference lyrics for "Twinkle, Twinkle, Little Star".
# The lines of each verse are listed in order and stored under the verse label.
_verse_1_lines = [
    "Twinkle, twinkle, little star",
    "How I wonder what you are",
    "Up above the world so high",
    "Like a diamond in the sky",
]

lyrics = {"Verse 1": _verse_1_lines}

# Fuzzy matching function
def find_closest_match(transcription, lyrics):
    """Pick the lyric line most similar to a transcribed phrase.

    Args:
        transcription: The recognised text to look up.
        lyrics: Iterable of candidate lyric lines.

    Returns:
        A ``(line, similarity)`` tuple, where ``similarity`` is the
        ``SequenceMatcher`` ratio of the winning line. On a tie the earliest
        line wins. When no line scores above zero (including when ``lyrics``
        is empty) the result is ``("", 0)``.
    """
    top_line, top_score = "", 0
    scored = (
        (SequenceMatcher(None, transcription, candidate).ratio(), candidate)
        for candidate in lyrics
    )
    for score, candidate in scored:
        # Only a strictly better score replaces the current winner.
        if score <= top_score:
            continue
        top_line, top_score = candidate, score
    return top_line, top_score

# --- Tunable settings for speech recognition ---
phrase_timeout = 3.0      # Seconds between audio batches after which a new line is started
record_timeout = 2.0      # Per-phrase recording limit, in seconds, for the background listener
energy_threshold = 1000   # Energy level for mic detection

# --- State shared with the matching loop ---
transcription = ['']      # Transcription lines; starts with one empty entry
phrase_time = None        # Time the previous audio batch was handled (None until the first one)
data_queue = Queue()      # Raw audio bytes handed over by the recording callback

# Recogniser with a fixed energy threshold (dynamic adjustment switched off).
recorder = sr.Recognizer()
recorder.dynamic_energy_threshold = False
recorder.energy_threshold = energy_threshold

# Microphone source sampled at 16 kHz.
source = sr.Microphone(sample_rate=16000)

# Load the "base.en" Whisper model.
audio_model = whisper.load_model("base.en")

# Adjust the recogniser for ambient noise before recording starts.
with source:
    recorder.adjust_for_ambient_noise(source)

# Define a callback for audio data processing
def record_callback(_, audio: sr.AudioData) -> None:
    """Push the raw bytes of a recorded audio segment onto ``data_queue``.

    Used as the callback for the background listener; the first positional
    argument is ignored.
    """
    data_queue.put(audio.get_raw_data())

# Start background recording. The returned function stops the listener; it is
# kept so the microphone thread can be shut down when the loop exits instead of
# running on in the notebook kernel.
stop_listening = recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
print("Model loaded and microphone initialized.\n")

try:
    current_verse = "Verse 1"  # Start with the first verse
    match_threshold = 0.7      # Similarity above which the lyric line replaces the raw text
    while True:
        now = datetime.utcnow()
        if not data_queue.empty():
            # A gap longer than phrase_timeout since the previous batch means
            # the speaker paused, so the next text starts a new line.
            phrase_complete = (
                phrase_time is not None
                and now - phrase_time > timedelta(seconds=phrase_timeout)
            )
            phrase_time = now

            # Take everything queued so far. Join and clear happen under the
            # queue's own lock so a chunk put by the recording thread between
            # the two steps cannot be discarded untranscribed.
            with data_queue.mutex:
                audio_data = b''.join(data_queue.queue)
                data_queue.queue.clear()

            # 16-bit PCM bytes -> float32 samples scaled by 1/32768.
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

            # Transcribe with Whisper; fp16 is requested only when CUDA is available.
            result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
            recognized_text = result['text'].strip()

            # Match the transcription to the current verse's lyrics.
            match, similarity = find_closest_match(recognized_text, lyrics[current_verse])

            # Record the line so the final transcription printed on exit is
            # not empty: use the lyric when the match is confident, otherwise
            # keep the raw recognised text.
            line_text = match if similarity > match_threshold else recognized_text
            if phrase_complete:
                transcription.append(line_text)
            else:
                transcription[-1] = line_text

            # Show the latest recognition and its best lyric match.
            os.system('cls' if os.name == 'nt' else 'clear')
            print(f"\nRecognized: {recognized_text}")
            print(f"Best Match: {match} (Similarity: {similarity:.2f})")
        else:
            # Nothing recorded yet; avoid spinning.
            sleep(0.25)
except KeyboardInterrupt:
    print("\nTranscription stopped by user.")
    print("\nFinal Transcription:")
    for line in transcription:
        print(line)
finally:
    # Stop the background listener without blocking on the thread.
    stop_listening(wait_for_stop=False)