In [None]:
import IPython.display as ipd
import numpy as np
import json

# Sample rate in Hz handed to the notebook's audio player for playback.
SAMPLE_RATE = 16_000

In [None]:
import sys
sys.path.append('../')

import incremental_transcriber
from whisper_mlx.whisper_mlx import load_model as load_whisper_model
from whisper_mlx.tokenizer import get_tokenizer

# Load the Whisper model from ../models/whisper (relative to the notebook's
# working directory).
whisper_model = load_whisper_model("../models/whisper")

# Build an English transcription tokenizer; the multilingual flag and the
# language count are read from the loaded model so the two stay in agreement.
tokenizer = get_tokenizer(
    language="en",
    task="transcribe",
    multilingual=whisper_model.is_multilingual,
    num_languages=whisper_model.num_languages,
)

In [None]:
import pyaudio

# Print every audio device PortAudio reports, with its index and input channel
# count, which is useful when choosing an input device index for recording.
p = pyaudio.PyAudio()
try:
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        print(f"Device {i}: {info['name']} (Input Channels: {info['maxInputChannels']})")
finally:
    # Always release PortAudio, even if querying one of the devices raises.
    p.terminate()

In [None]:
from math import ceil
import pyaudio

# Index of the input device passed to audio.open() in record_audio().
# NOTE(review): machine-specific — presumably picked from the device listing
# printed by the previous cell; verify before recording on another machine.
DEVICE_IDX = 3

def audio_bytes_to_np_array(bytes_data):
    """Convert raw 16-bit PCM bytes into a float32 sample array.

    Args:
        bytes_data: Bytes-like buffer interpreted as little-endian signed
            16-bit integers.

    Returns:
        A 1-D float32 numpy array with each sample divided by 32768.0, so the
        values lie in [-1.0, 1.0).
    """
    pcm_samples = np.frombuffer(bytes_data, dtype='<i2')
    # Dividing by 2**15 maps the int16 range onto [-1.0, 1.0).
    return pcm_samples.astype('float32') / 32768.0

def record_audio(text, min_seconds=2, max_seconds=8):
    """Record audio from the input device selected by ``DEVICE_IDX``.

    The recording length is estimated from ``text`` at 0.4 seconds per
    whitespace-separated word, clamped to ``[min_seconds, max_seconds]`` and
    rounded up to a whole number of seconds. If ``min_seconds`` exceeds
    ``max_seconds``, ``min_seconds`` wins.

    Args:
        text: Text the speaker is expected to read; only its word count is used.
        min_seconds: Lower bound on the recording duration, in seconds.
        max_seconds: Upper bound on the recording duration, in seconds.

    Returns:
        bytes: The recorded frames concatenated into one byte string
        (paInt16 format, 1 channel, 16000 Hz).

    Raises:
        Whatever PyAudio raises when the stream cannot be opened or read. The
        stream and the PortAudio instance are released before the exception
        propagates.
    """
    num_words = len(text.split())
    num_seconds = num_words * 0.4  # 400ms per word
    num_seconds = ceil(max(min(num_seconds, max_seconds), min_seconds))

    # Capture parameters
    sample_format = pyaudio.paInt16  # Audio format
    channels = 1  # Number of audio channels
    rate = 16000  # Sample rate
    chunk = 160  # Frames read per call
    num_chunks = int(rate / chunk * num_seconds)

    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=sample_format, channels=channels,
                            rate=rate, input=True,
                            input_device_index=DEVICE_IDX,
                            frames_per_buffer=chunk)
        try:
            print(f"Recording for {num_seconds} seconds...")

            # Read fixed-size chunks until the requested duration is covered.
            frames = [stream.read(chunk) for _ in range(num_chunks)]

            print("Finished recording.")
        finally:
            # Stop and close the stream even if a read failed part-way.
            stream.stop_stream()
            stream.close()
    finally:
        # Terminate the PortAudio interface even if the stream never opened.
        audio.terminate()

    # Combine all the frames as a single byte string
    return b''.join(frames)

def record_audio_prefix(text, num_seconds=10):
    """Prompt the speaker with ``text``, record it, and build an audio prefix.

    The clip is transcribed and the transcription is printed so the recording
    can be checked; the transcription itself is not stored in the result.

    Args:
        text: Prompt to read aloud; it is also tokenized into the prefix.
        num_seconds: Recording duration. It is passed as both the minimum and
            the maximum to ``record_audio``, which rounds it up to a whole
            second.

    Returns:
        dict with keys ``"result_logprob"`` (always 0.0), ``"tokens"`` (the
        tokenizer's encoding of ``text``) and ``"np_arr"`` (the recorded
        samples as returned by ``audio_bytes_to_np_array``).
    """
    print("Say: ", text)

    # Equal bounds pin the duration instead of deriving it from the word count.
    raw_pcm = record_audio(text, min_seconds=num_seconds, max_seconds=num_seconds)
    samples = audio_bytes_to_np_array(raw_pcm)

    prompt_tokens = tokenizer.encode(text)

    # Echo what the model heard; used for feedback only.
    heard = incremental_transcriber.transcribe(samples, whisper_model, tokenizer)
    print("Heard: ", heard.text)

    return {
        "result_logprob": 0.0,
        "tokens": prompt_tokens,
        "np_arr": samples,
    }

In [None]:
# Record a 10-second clip of the prompt below; the same prompt text is
# tokenized into the resulting prefix.
recorded_audio_prefix = record_audio_prefix(
    "Hello, how can I help you today? Hi, what is the capital of France? The capital of France is Paris.",
    num_seconds=10,
)

In [None]:
# Render an audio player for the recorded samples, played back at SAMPLE_RATE.
ipd.display(ipd.Audio(recorded_audio_prefix["np_arr"], rate=SAMPLE_RATE))

In [None]:
file_path = "audio_prefix.json"

# Convert numpy array to list for JSON serialization. Work on a shallow copy so
# the in-memory prefix keeps its array.
audio_prefix_copy = recorded_audio_prefix.copy()
audio_prefix_copy['np_arr'] = recorded_audio_prefix['np_arr'].tolist()

# Serialize before opening the file: mode 'w' truncates immediately, so a
# serialization error would otherwise leave an empty or partial file behind.
audio_prefix_json = json.dumps(audio_prefix_copy)

with open(file_path, 'w') as file:
    file.write(audio_prefix_json)