In [None]:
import IPython.display as ipd
import numpy as np
import json

# Sample rate (Hz) handed to ipd.Audio for playback. record_audio captures at
# the same 16000 Hz via its local RATE, so the two values must stay in sync.
SAMPLE_RATE = 16000

In [None]:
import pyaudio

# List every audio device PortAudio reports, with its input-channel count, so
# an input device index can be chosen for recording.
p = pyaudio.PyAudio()

try:
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        print(f"Device {i}: {info['name']} (Input Channels: {info['maxInputChannels']})")
finally:
    # Always release PortAudio, even if querying a device raises.
    p.terminate()

In [None]:
from math import ceil
import pyaudio

# Index of the capture device that record_audio passes to PyAudio as
# input_device_index. Choose it from the "Device N: ..." list printed by the
# device-enumeration cell.
# NOTE(review): 3 is presumably specific to the author's machine — confirm
# against that list before recording.
DEVICE_IDX = 3

def audio_bytes_to_np_array(bytes_data):
    """Decode little-endian signed 16-bit PCM bytes into float32 samples.

    Each int16 sample is divided by 32768.0, so the result lies in
    [-1.0, 1.0).

    Args:
        bytes_data: Raw PCM buffer; its length must be a multiple of 2 bytes.

    Returns:
        A 1-D float32 numpy array with one entry per 16-bit sample.
    """
    pcm_samples = np.frombuffer(bytes_data, dtype='<i2')
    return pcm_samples.astype('float32') / 32768.0

def record_audio(text, min_seconds=2, max_seconds=8):
    """Record mono 16-bit PCM audio from the input device ``DEVICE_IDX``.

    The recording length is estimated from ``text`` at 0.4 seconds per
    whitespace-separated word. That estimate is capped at ``max_seconds``,
    raised to at least ``min_seconds``, and finally rounded up to a whole
    number of seconds.

    Args:
        text: Prompt the speaker will read; only its word count is used.
        min_seconds: Lower bound on the recording length, in seconds.
        max_seconds: Upper bound on the estimated length, in seconds.

    Returns:
        bytes: The captured chunks concatenated in capture order
        (paInt16 samples, 1 channel, 16000 Hz).

    Raises:
        Whatever PyAudio raises while opening or reading the stream. The
        stream and the PyAudio instance are released before the exception
        propagates.
    """
    # Estimate speaking time from the word count, then clamp and round up.
    num_words = len(text.split())
    num_seconds = num_words * 0.4  # 400ms per word
    num_seconds = ceil(max(min(num_seconds, max_seconds), min_seconds))

    # Parameters
    FORMAT = pyaudio.paInt16  # Audio format
    CHANNELS = 1  # Number of audio channels
    RATE = 16000  # Sample rate
    CHUNK = 160  # Frame size (160 frames = 10 ms at 16000 Hz)
    RECORD_SECONDS = num_seconds  # Duration to record

    frames = []

    # Initialize pyaudio; the nested try/finally blocks guarantee the stream
    # and the PortAudio interface are released even if open()/read() raises.
    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            input_device_index=DEVICE_IDX,
                            frames_per_buffer=CHUNK)
        try:
            print(f"Recording for {RECORD_SECONDS} seconds...")

            # Record for the set duration, one CHUNK at a time.
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(stream.read(CHUNK))

            print("Finished recording.")
        finally:
            # Stop and close the stream
            stream.stop_stream()
            stream.close()
    finally:
        # Terminate the PortAudio interface
        audio.terminate()

    # Combine all the frames as a single byte string
    return b''.join(frames)

In [None]:
# Prompts the speaker is asked to read aloud, one utterance per line. Each line
# is later used verbatim as the dictionary key for its recording.
short_utterances = """
Hello
How are you?
What is the weather?
What is the time?
Open the file
Where is the file?
Tell me a joke
Okay
Listen to me
Stop
Open the browser
Close the window
Scroll down
Scroll up
Copy this text
Paste here
Save the document
Undo last action
Redo last action
Zoom in
Zoom out
Switch to next tab
Go back
Go forward
Refresh the page
Search for...
Open new tab
Close current tab
Minimize window
Maximize window
What's on my schedule today?
Can you set a reminder?
How do I create a new folder?
What's the current CPU usage?
How much free disk space do I have?
Turn on the lights
Turn off the lights
Increase volume
Decrease volume
Play music
Pause music
Next song
Previous song
Set an alarm
What's the temperature?
How's the traffic?
Send an email
Read my messages
Take a screenshot
Lock the screen
Restart the computer
Update the system
Install this app
Uninstall that program
Check for updates
What's my battery level?
Connect to Wi-Fi
Turn on Bluetooth
Mute the microphone
Enable dark mode
Translate this sentence
Calculate this equation
Convert currency
Show me directions to the airport
What's trending now?
Open file explorer
Show desktop
Empty recycle bin
Open task manager
Run virus scan
Check network status
Adjust screen brightness
Enable screen reader
Show system information
Open control panel
Launch settings app
Show available updates
Check disk space
Open file properties
Show hidden files
Create new user account
Change wallpaper
Enable firewall
Show running processes
Open device manager
Check RAM usage
Show network connections
Enable VPN
Disable notifications
Show battery health
Open sound settings
Launch voice assistant
Show system logs
Enable night light
Open power options
""".strip().splitlines()

# Recordings keyed by prompt text; each value is the float array returned by
# audio_bytes_to_np_array. Re-running this cell starts from an empty dict.
utterance_to_audio_arr = {}

# Interactive step: for every prompt, show it, record from the input device,
# and store the decoded samples under the prompt text.
for utterance in short_utterances:
    print("Say: ", utterance)
    # Cap each take at 5 seconds (record_audio's own default cap is 8).
    audio_bytes = record_audio(utterance, max_seconds=5)
    speech_arr = audio_bytes_to_np_array(audio_bytes)
    utterance_to_audio_arr[utterance] = speech_arr

In [None]:
# Spot-check: render an inline audio player for the untrimmed "Okay" recording.
ipd.display(ipd.Audio(utterance_to_audio_arr["Okay"], rate=SAMPLE_RATE))

In [None]:
# Convert every array to a plain list before touching the output file, so a
# failure during conversion cannot leave behind a truncated/empty JSON file.
serializable_dict = {k: v.tolist() for k, v in utterance_to_audio_arr.items()}

# Persist the untrimmed recordings as {utterance: [sample, ...]}.
with open("short_utterances_audio.json", "w") as f:
    json.dump(serializable_dict, f)

In [None]:
import sys
sys.path.append('../')

from webrtcvad import Vad
# Voice-activity detector instance that is passed to trim_audio_arr.
vad = Vad()
# NOTE(review): webrtcvad documents modes 0-3, with higher values filtering
# non-speech more aggressively — confirm 2 is the intended trade-off.
vad.set_mode(2)

import vad_checker

def np_array_to_audio_bytes(np_arr):
    """Encode float samples as little-endian signed 16-bit PCM bytes.

    The input is cast to float32, scaled by 32768.0, and clipped to the int16
    range [-32768, 32767] before conversion, so +1.0 saturates at 32767.

    Args:
        np_arr: Numpy array of samples, nominally in [-1.0, 1.0].

    Returns:
        bytes: Two bytes per sample, little-endian.
    """
    clipped = np.clip(np_arr.astype('float32') * 32768.0, -32768, 32767)
    return clipped.astype('<i2').tobytes()

def trim_audio_arr(vad, audio_arr):
    """Keep only the segments of ``audio_arr`` that pass the VAD check.

    The samples are encoded to PCM bytes with np_array_to_audio_bytes, cut into
    consecutive segments of ``vad_checker.AUDIO_BYTES_VAD_CHECK_SIZE`` bytes,
    and each segment is passed to ``vad_checker.vad_check``. Segments for which
    the check is truthy are concatenated in their original order and decoded
    with ``vad_checker.audio_bytes_to_np_array``.

    NOTE(review): the final segment can be shorter than the check size when the
    byte length is not an exact multiple; whether vad_check accepts a short
    segment is not visible here — confirm.

    Args:
        vad: Detector object forwarded unchanged to ``vad_checker.vad_check``.
        audio_arr: Numpy array of audio samples.

    Returns:
        The array that ``vad_checker.audio_bytes_to_np_array`` produces for the
        kept bytes.
    """
    pcm_bytes = np_array_to_audio_bytes(audio_arr)
    step = vad_checker.AUDIO_BYTES_VAD_CHECK_SIZE

    # Lazily slice the buffer so each segment is checked in order, once.
    segments = (
        pcm_bytes[start:start + step]
        for start in range(0, len(pcm_bytes), step)
    )
    voiced = [seg for seg in segments if vad_checker.vad_check(vad, seg)]

    return vad_checker.audio_bytes_to_np_array(b''.join(voiced))

In [None]:
# Apply VAD-based trimming to every recording, keeping the same utterance keys
# and iteration order as utterance_to_audio_arr.
trimmed_utterance_to_audio_arr = {
    utterance: trim_audio_arr(vad, audio_arr)
    for utterance, audio_arr in utterance_to_audio_arr.items()
}

In [None]:
# Convert every trimmed array to a plain list before touching the output file,
# so a failure during conversion cannot leave behind a truncated/empty file.
serializable_dict = {k: v.tolist() for k, v in trimmed_utterance_to_audio_arr.items()}

# Persist the trimmed recordings as {utterance: [sample, ...]}.
with open("short_utterances_audio_trimmed.json", "w") as f:
    json.dump(serializable_dict, f)

In [None]:
# Reload the trimmed recordings from disk. The file is read and closed before
# any conversion, and the in-memory dict is only replaced once loading has
# succeeded, so a missing or corrupt file leaves earlier results intact.
with open("short_utterances_audio_trimmed.json", "r") as f:
    serializable_dict = json.load(f)

# JSON stores the samples as plain numbers, so np.array() picks its default
# dtype here rather than restoring whatever dtype was saved.
trimmed_utterance_to_audio_arr = {k: np.array(v) for k, v in serializable_dict.items()}

In [None]:
# Spot-check: render an inline audio player for the trimmed "Okay" recording.
ipd.display(ipd.Audio(trimmed_utterance_to_audio_arr["Okay"], rate=SAMPLE_RATE))