Testing live

In [8]:
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep

# Define arguments as variables for notebook use
non_english = False    # Use non-English model if True
energy_threshold = 1000  # Energy level for mic detection
record_timeout = 2.0     # Real-time recording in seconds
phrase_timeout = 3.0     # Pause length between phrases for new line

# Initialize variables
phrase_time = None
data_queue = Queue()
recorder = sr.Recognizer()
recorder.energy_threshold = energy_threshold
recorder.dynamic_energy_threshold = False

# Set up microphone source
source = sr.Microphone(sample_rate=16000)

audio_model = whisper.load_model("base.en")

# Initialize transcription list
transcription = ['']

# Adjust microphone for ambient noise
with source:
    recorder.adjust_for_ambient_noise(source)

def record_callback(_, audio: sr.AudioData) -> None:
    """Threaded callback function to handle audio data."""
    data = audio.get_raw_data()
    data_queue.put(data)

# Start background recording
recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
print("Model loaded and microphone initialized.\n")

try:
    while True:
        now = datetime.utcnow()
        if not data_queue.empty():
            phrase_complete = False
            if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                phrase_complete = True
            phrase_time = now

            audio_data = b''.join(data_queue.queue)
            data_queue.queue.clear()

            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

            result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
            text = result['text'].strip()

            if phrase_complete:
                transcription.append(text)
            else:
                transcription[-1] = text

            os.system('cls' if os.name == 'nt' else 'clear')
            for line in transcription:
                print(line)
            print('', end='', flush=True)
        else:
            sleep(0.25)
except KeyboardInterrupt:
    print("\nTranscription stopped by user.")
    print("\nFinal Transcription:")
    for line in transcription:
        print(line)

  checkpoint = torch.load(fp, map_location=device)


Model loaded and microphone initialized.

[H[2J
[H[2J

[H[2J


[H[2J

Cold twinkle
[H[2J

little stuff.
[H[2J

Aw, why? Aaaaay!
[H[2J

Aw, why? Aaaaay!
to uh
[H[2J

Aw, why? Aaaaay!
Supa bop
[H[2J

Aw, why? Aaaaay!
Okay.
[H[2J

Aw, why? Aaaaay!
I like her.
[H[2J

Aw, why? Aaaaay!
Diamond in
[H[2J

Aw, why? Aaaaay!
I
[H[2J

Aw, why? Aaaaay!
Chingko, Chingko
[H[2J

Aw, why? Aaaaay!
little star
[H[2J

Aw, why? Aaaaay!
Ahh, ahh, ahh, ahh, ahh.
[H[2J

Aw, why? Aaaaay!
Wanda Watch
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
Drinko Drinko
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
little star
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
Oh
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
Underwater
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
AWWWWWW
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
AWWWWWW
Bob Bob Bob
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
AWWWWWW
Well
[H[2J

Aw, why? Aaaaay!
Thank you. Ahh.
AWWWWWW
phone Night
[H[2J



Fuzzy matching to find closest match phrase in the current verse

In [None]:
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch
from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from difflib import SequenceMatcher

# Known lyrics for "Twinkle, Twinkle, Little Star"
lyrics = {
    "Verse 1": [
        "Twinkle, twinkle, little star",
        "How I wonder what you are",
        "Up above the world so high",
        "Like a diamond in the sky",
    ]
}

# Fuzzy matching function
def find_closest_match(transcription, lyrics):
    best_match = ""
    highest_similarity = 0
    for line in lyrics:
        similarity = SequenceMatcher(None, transcription, line).ratio()
        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = line
    return best_match, highest_similarity

# Initialize variables for speech recognition and Whisper
energy_threshold = 1000  # Energy level for mic detection
record_timeout = 2.0  # Real-time recording in seconds
phrase_timeout = 3.0  # Pause length between phrases for new line
phrase_time = None
data_queue = Queue()
recorder = sr.Recognizer()
recorder.energy_threshold = energy_threshold
recorder.dynamic_energy_threshold = False

# Set up microphone source
source = sr.Microphone(sample_rate=16000)

# Load Whisper model
audio_model = whisper.load_model("base.en")

# Initialize transcription list
transcription = ['']

# Adjust microphone for ambient noise
with source:
    recorder.adjust_for_ambient_noise(source)

# Define a callback for audio data processing
def record_callback(_, audio: sr.AudioData) -> None:
    """Threaded callback function to handle audio data."""
    data = audio.get_raw_data()
    data_queue.put(data)

# Start background recording
recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
print("Model loaded and microphone initialized.\n")

try:
    current_verse = "Verse 1"  # Start with the first verse
    while True:
        now = datetime.utcnow()
        if not data_queue.empty():
            phrase_complete = False
            if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                phrase_complete = True
            phrase_time = now

            # Combine audio data from queue
            audio_data = b''.join(data_queue.queue)
            data_queue.queue.clear()

            # Convert audio data to the format Whisper expects
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

            # Perform transcription using Whisper
            result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
            recognized_text = result['text'].strip()

            # Match the transcription to the current verse's lyrics
            match, similarity = find_closest_match(recognized_text, lyrics[current_verse])

            #if phrase_complete:
             #   transcription.append(match if similarity > 0.7 else recognized_text)
            #else:
             #   transcription[-1] = match if similarity > 0.7 else recognized_text

            # Print the transcription and matched lyrics
            os.system('cls' if os.name == 'nt' else 'clear')
            #print("Transcription (matched to lyrics):\n")
            #for line in transcription:
             #   print(line)
            print(f"\nRecognized: {recognized_text}")
            print(f"Best Match: {match} (Similarity: {similarity:.2f})")
        else:
            sleep(0.25)
except KeyboardInterrupt:
    print("\nTranscription stopped by user.")
    print("\nFinal Transcription:")
    for line in transcription:
        print(line)

Model loaded and microphone initialized.

[H[2J
Recognized: 
Best Match:  (Similarity: 0.00)
[H[2J
Recognized: 
Best Match:  (Similarity: 0.00)
[H[2J
Recognized: 
Best Match:  (Similarity: 0.00)
[H[2J
Recognized: Cold Winkle, Cold Winkle, Cold Winkle
Best Match: Twinkle, twinkle, little star (Similarity: 0.55)
[H[2J
Recognized: Go tweet call l ed who's
Best Match: How I wonder what you are (Similarity: 0.33)
[H[2J
Recognized: Call Twinkle Little Star Little Star Little Star Little Star Call Twinkle Little Star
Best Match: Twinkle, twinkle, little star (Similarity: 0.30)
[H[2J
Recognized: Oh
Best Match: How I wonder what you are (Similarity: 0.07)
[H[2J
Recognized: Star, how I wonder why
Best Match: How I wonder what you are (Similarity: 0.64)
[H[2J
Recognized: How I wonder what
Best Match: How I wonder what you are (Similarity: 0.81)
[H[2J
Recognized: Wanda, watch, she, Wanda, watch, she, Wanda, watch, she
Best Match: Twinkle, twinkle, little star (Similarity: 0.26)