In [None]:
import sys
sys.path.append('../')

import IPython.display as ipd
import json

In [None]:
import incremental_transcriber
from whisper_mlx.whisper_mlx import load_model as load_whisper_model
from whisper_mlx.tokenizer import get_tokenizer

# Load the Whisper model from the local models directory (path is relative to
# this notebook's working directory).
whisper_model = load_whisper_model("../models/whisper")

# Build a tokenizer whose multilingual settings come from the loaded model,
# fixed to English transcription.
tokenizer = get_tokenizer(
    multilingual=whisper_model.is_multilingual,
    num_languages=whisper_model.num_languages,
    language="en",
    task="transcribe",
)

In [None]:
from audio_io import SAMPLE_RATE
from min_rhasspy_piper.voice import PiperVoice
import numpy as np
import librosa

voice = PiperVoice.load(
    model_path="../models/piper/voice.onnx",
    config_path="../models/piper/voice.json")

def generate_speech_prefix(text, trailing_silence_s=2.0):
    """Synthesize ``text`` with the Piper voice and return it at ``SAMPLE_RATE``.

    The synthesized chunks are concatenated, resampled from the voice's native
    sample rate (``voice.config.sample_rate``) to ``SAMPLE_RATE``, and padded
    with trailing silence (zeros).

    Args:
        text: The sentence to synthesize.
        trailing_silence_s: Seconds of silence appended after the speech.
            Defaults to 2.0, the value that was previously hard-coded.

    Returns:
        A numpy array holding the resampled speech followed by the silence.

    Raises:
        ValueError: If ``trailing_silence_s`` is negative.
    """
    if trailing_silence_s < 0:
        raise ValueError("trailing_silence_s must be non-negative")

    # NOTE(review): presumably each yielded chunk is a numpy array of samples
    # (np.concatenate is applied to them directly) -- confirm against
    # min_rhasspy_piper.
    chunks = list(voice.synthesize_stream_raw(text))
    speech_arr = np.concatenate(chunks)

    # Convert to the sample rate used in audio_io.
    speech_arr = librosa.resample(
        speech_arr,
        orig_sr=voice.config.sample_rate,
        target_sr=SAMPLE_RATE,
    )

    # Pad the end with silence; np.append flattens and returns a new array.
    silence = np.zeros(int(SAMPLE_RATE * trailing_silence_s))
    return np.append(speech_arr, silence)

def generate_audio_prefix(text):
    """Synthesize ``text``, transcribe it, and package the result as an audio prefix.

    Returns:
        A dict with the keys ``"result_logprob"``, ``"tokens"`` and ``"np_arr"``,
        filled from the transcription result's ``logprob``, ``tokens`` and
        ``audio_arr`` attributes respectively.
    """
    synthesized = generate_speech_prefix(text)
    transcription = incremental_transcriber.transcribe(synthesized, whisper_model, tokenizer)

    return {
        "result_logprob": transcription.logprob,
        "tokens": transcription.tokens,
        "np_arr": transcription.audio_arr,
    }

In [None]:
# Build an audio prefix from synthesized speech, then play back the audio
# stored in the prefix to check it by ear.
synthetic_audio_prefix = generate_audio_prefix("Hello, I am a voice assistant that can help you with your questions. How can I help you today?")
ipd.display(ipd.Audio(synthetic_audio_prefix["np_arr"], rate=SAMPLE_RATE))

In [None]:
# Load the recorded audio prefix from the calibration directory and turn every
# JSON value back into a numpy array.
recorded_audio_prefix = {}
with open("../calibration/audio_prefix.json", "r") as f:
    serializable_dict = json.load(f)
recorded_audio_prefix = {
    key: np.array(values)
    for key, values in serializable_dict.items()
}

In [None]:
# Play back the audio stored in the recorded prefix.
ipd.display(ipd.Audio(recorded_audio_prefix["np_arr"], rate=SAMPLE_RATE))

In [None]:
# Load the evaluation set: a JSON object whose keys are utterance strings and
# whose values are converted to numpy arrays here.
trimmed_utterance_to_audio_arr = {}
with open("short_utterances_audio_trimmed.json", "r") as f:
    serializable_dict = json.load(f)
trimmed_utterance_to_audio_arr = {
    utterance: np.array(samples)
    for utterance, samples in serializable_dict.items()
}

In [None]:
# Spot-check one evaluation clip by ear (the entry keyed "Okay").
ipd.display(ipd.Audio(trimmed_utterance_to_audio_arr["Okay"], rate=SAMPLE_RATE))

In [None]:
import evaluate
# Load the "wer" metric from the `evaluate` package; used by evaluate_prefix below.
wer = evaluate.load("wer")

def evaluate_prefix(audio_prefix):
    """Transcribe every evaluation utterance with ``audio_prefix`` and print WER.

    Each entry of ``trimmed_utterance_to_audio_arr`` is transcribed with
    ``audio_prefix`` passed as the fourth argument to
    ``incremental_transcriber.transcribe``. The expected and transcribed texts
    are printed per utterance, followed by two word-error rates: one on the raw
    strings and one after dropping every character that is neither
    alphanumeric nor whitespace.

    Args:
        audio_prefix: The prefix forwarded to the transcriber; ``None`` is
            passed through unchanged.

    Returns:
        None. All results are printed.
    """
    def _strip_punctuation(text):
        # Keep only letters, digits and whitespace of the stripped text.
        return ''.join(ch for ch in text.strip() if ch.isalnum() or ch.isspace())

    references, hypotheses = [], []
    clean_references, clean_hypotheses = [], []

    for utterance, audio_arr in trimmed_utterance_to_audio_arr.items():
        result = incremental_transcriber.transcribe(audio_arr, whisper_model, tokenizer, audio_prefix)
        heard = result.text
        print(utterance, "->", heard)

        references.append(utterance)
        hypotheses.append(heard)
        clean_hypotheses.append(_strip_punctuation(heard))
        clean_references.append(_strip_punctuation(utterance))

    print("")
    raw_wer = wer.compute(references=references, predictions=hypotheses)
    print(f"WER:                      {raw_wer:.4f}")
    clean_wer = wer.compute(references=clean_references, predictions=clean_hypotheses)
    print(f"WER (remove punctuation): {clean_wer:.4f}")

In [None]:
# Baseline: evaluate with no audio prefix.
evaluate_prefix(audio_prefix=None)

In [None]:
# Evaluate with the prefix built from synthesized speech.
evaluate_prefix(audio_prefix=synthetic_audio_prefix)

In [None]:
# Evaluate with the prefix loaded from ../calibration/audio_prefix.json.
evaluate_prefix(audio_prefix=recorded_audio_prefix)

In [None]:
def apply_noise_profile(clean_audio, noise_profile, noise_factor=0.1):
    """Mix a looped noise sample into an audio clip, keeping the clip's peak level.

    Both signals are peak-normalized, the noise is repeated and truncated to
    the clip's length, scaled by ``noise_factor`` and added to the clip. The
    mix is then rescaled so its peak absolute amplitude equals that of
    ``clean_audio``.

    Degenerate inputs are handled explicitly instead of producing NaNs or
    raising from a zero division / empty reduction:

    * empty or all-zero ``clean_audio``: zeros of the same shape are returned
      (the target peak is zero);
    * empty or all-zero ``noise_profile``: a float copy of ``clean_audio`` is
      returned (there is no noise to add);
    * a mix that cancels to exactly zero: zeros are returned.

    Args:
        clean_audio: Array-like of audio samples.
        noise_profile: Array-like noise sample; it is looped to cover the clip.
        noise_factor: Gain applied to the normalized noise relative to the
            normalized clip (1 means equal peak levels before mixing).

    Returns:
        A numpy array with the same length as ``clean_audio``.
    """
    clean_audio = np.asarray(clean_audio)
    noise_profile = np.asarray(noise_profile)

    # Peak of the clean signal; doubles as the rescaling target at the end.
    clean_peak = np.max(np.abs(clean_audio)) if clean_audio.size else 0
    if clean_peak == 0:
        return np.zeros(clean_audio.shape, dtype=float)

    noise_peak = np.max(np.abs(noise_profile)) if noise_profile.size else 0
    if noise_peak == 0:
        return clean_audio.astype(float)

    clean_norm = clean_audio / clean_peak
    noise_norm = noise_profile / noise_peak

    # Repeat the noise one time more than strictly needed, then cut it to length.
    n_samples = len(clean_norm)
    repeats = n_samples // len(noise_norm) + 1
    tiled_noise = np.tile(noise_norm, repeats)[:n_samples]

    mixed = clean_norm + noise_factor * tiled_noise

    mixed_peak = np.max(np.abs(mixed))
    if mixed_peak == 0:
        # Noise cancelled the signal exactly; avoid 0/0.
        return np.zeros(clean_audio.shape, dtype=float)

    # Rescale so the result peaks where the clean audio did.
    return mixed * clean_peak / mixed_peak

def generate_noisy_audio_prefix(text, noise_profile, noise_factor):
    """Synthesize ``text``, mix in noise, transcribe, and package an audio prefix.

    The synthesized speech is passed through ``apply_noise_profile`` before
    transcription. The input text and the transcribed text are printed so the
    two can be compared.

    Args:
        text: Sentence to synthesize.
        noise_profile: Noise sample forwarded to ``apply_noise_profile``.
        noise_factor: Noise gain forwarded to ``apply_noise_profile``.

    Returns:
        A dict with the keys ``"result_logprob"``, ``"tokens"`` and ``"np_arr"``,
        filled from the transcription result's ``logprob``, ``tokens`` and
        ``audio_arr`` attributes respectively.
    """
    noisy = apply_noise_profile(generate_speech_prefix(text), noise_profile, noise_factor)
    transcription = incremental_transcriber.transcribe(noisy, whisper_model, tokenizer)

    print("Said: ", text)
    print("Heard: ", transcription.text)

    return {
        "result_logprob": transcription.logprob,
        "tokens": transcription.tokens,
        "np_arr": transcription.audio_arr,
    }

In [None]:
# Get noise sample from recorded audio
# Take the first SAMPLE_RATE // 2 samples of the recorded prefix as the noise
# sample, then play it back.
# NOTE(review): this assumes the opening stretch of the recording contains
# only background noise and no speech -- confirm by listening.
noise_profile = recorded_audio_prefix['np_arr'][:SAMPLE_RATE // 2]
ipd.display(ipd.Audio(noise_profile, rate=SAMPLE_RATE))

In [None]:
# Build a prefix from synthesized speech mixed with the recorded noise sample;
# noise_factor=1 mixes the noise at the same normalized peak level as the speech.
noisy_audio_prefix = generate_noisy_audio_prefix("Hello, how can I help you today?", noise_profile, noise_factor=1)

In [None]:
# Play back the audio stored in the noisy prefix.
ipd.display(ipd.Audio(noisy_audio_prefix['np_arr'], rate=SAMPLE_RATE))

In [None]:
# Evaluate with the noisy synthesized prefix.
evaluate_prefix(audio_prefix=noisy_audio_prefix)