In [None]:
%pip install gradio

In [None]:
import gradio as gr
import json
import pandas as pd
import collections
import scipy.signal
import numpy as np
from functools import partial
import sys
import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op

In [None]:
def generate_features_for_clip(clip):
    """Compute micro-frontend features for a single audio clip.

    The clip is converted to a tensor and passed through TensorFlow's
    ``audio_microfrontend`` op (16 kHz sample rate, 40 channels, PCAN enabled,
    float32 output). The op's output is then multiplied by 0.0390625.

    Args:
        clip: Audio samples accepted by ``tf.convert_to_tensor``. The caller in
            this notebook passes a 1-D int16 array -- presumably what the op
            requires; confirm against the op's documentation.

    Returns:
        The scaled features as a NumPy array (presumably one row of 40 values
        per analysis window -- confirm).
    """
    frontend_options = dict(
        sample_rate=16000,
        window_size=30,
        window_step=20,
        num_channels=40,
        upper_band_limit=7500,
        lower_band_limit=125,
        enable_pcan=True,
        min_signal_remaining=0.05,
        out_scale=1,
        out_type=tf.float32,
    )
    raw_features = frontend_op.audio_microfrontend(
        tf.convert_to_tensor(clip), **frontend_options
    )
    # 0.0390625 == 10 / 256. NOTE(review): the reason for this particular scale
    # is not shown in this notebook -- presumably it matches how the model's
    # training features were scaled; confirm.
    scaled_features = tf.multiply(raw_features, 0.0390625)
    return scaled_features.numpy()

def features_generator(generator):
    """Lazily yield features for every clip in an iterable of clip batches.

    Args:
        generator: Iterable whose items are themselves iterables of clips.

    Yields:
        The result of ``generate_features_for_clip`` for each clip, in order.
    """
    # Flatten batches -> clips without materialising anything up front.
    clips = (clip for batch in generator for clip in batch)
    yield from map(generate_features_for_clip, clips)

# Load the TFLite model into a single-threaded interpreter.
infer_model = tf.lite.Interpreter(
    model_path="./stream_state_internal_quantize.tflite",
    num_threads=1,
)
# Fix input tensor 0 to shape [1, 1, 40] before buffers are allocated.
infer_model.resize_tensor_input(0, [1, 1, 40], strict=True)
infer_model.allocate_tensors()

# Tensor metadata reused by process_audio (tensor indices, dtypes, ...).
input_details = infer_model.get_input_details()
output_details = infer_model.get_output_details()

# Show the tensor metadata, each section preceded by a blank line.
print("\nInput details:")
print(input_details)
print("\nOutput details:")
print(output_details)
print()

In [None]:
# Scratch list; nothing in the visible part of this notebook reads or appends
# to it. NOTE(review): looks like a leftover -- confirm before relying on it.
test_vec = []

In [9]:

# Define function to process audio

# Key under which scores are stored in `state`; it is also the legend label in
# the plot. Named after the model file loaded into `infer_model`.
_MODEL_LABEL = "stream_state_internal_quantize"
# Number of most recent scores kept and plotted (same as the maxlen of the
# deques created by process_audio's default state).
_HISTORY_FRAMES = 60
# Sample rate handed to the feature frontend.
_TARGET_SAMPLE_RATE = 16000


def _quantize_input(frame, details):
    """Convert float features to the dtype (and quantized scale) of an input tensor."""
    dtype = np.dtype(details["dtype"])
    scale, zero_point = details.get("quantization", (0.0, 0))
    values = np.asarray(frame, dtype=np.float32)
    if scale:  # a zero scale means the tensor carries no quantization parameters
        values = values / scale + zero_point
    if np.issubdtype(dtype, np.integer):
        limits = np.iinfo(dtype)
        values = np.clip(values, limits.min, limits.max)  # avoid integer wrap-around
    return values.astype(dtype)


def _dequantize_output(raw, details):
    """Convert a raw output tensor to float32, undoing quantization when present."""
    scale, zero_point = details.get("quantization", (0.0, 0))
    values = np.asarray(raw, dtype=np.float32)
    if scale:
        values = (values - zero_point) * scale
    return values


def process_audio(audio, state=collections.defaultdict(partial(collections.deque, maxlen=60))):
    """Score one chunk of audio frame by frame and plot the recent score history.

    Args:
        audio: ``(sample_rate, samples)`` pair. ``samples`` is a NumPy array;
            if it is 2-D only the first column (channel) is used. Audio that is
            not already at 16 kHz is resampled.
        state: Mapping from model label to a deque of recent scores. ``None``
            is replaced by a fresh mapping. NOTE(review): the mutable default
            is kept because the signature is part of the interface --
            presumably Gradio uses it as the initial per-session state;
            confirm.

    Returns:
        ``(plot, state)``: the Gradio line-plot update dict (with its legend
        layout adjusted) and the updated state mapping.

    Notes:
        * Assumes the model's first output tensor holds a single score per
          invocation and that, once dequantized, it lies in the plotted range
          0..1 -- TODO confirm for the loaded model.
        * NOTE(review): ``gr.LinePlot().update(...)`` looks like the Gradio
          3.x API while this notebook's ``gr.Audio(sources=...)`` looks like
          the 4.x spelling -- confirm against the installed Gradio version.
    """
    if state is None:
        state = collections.defaultdict(partial(collections.deque, maxlen=_HISTORY_FRAMES))

    sample_rate = audio[0]
    data = np.asarray(audio[1])
    if data.ndim == 2:
        data = data[:, 0]  # just use one channel of audio

    # Resample audio to 16khz if needed; otherwise use the samples unchanged.
    if sample_rate != _TARGET_SAMPLE_RATE:
        target_len = int(float(data.shape[0]) / sample_rate * _TARGET_SAMPLE_RATE)
        data = scipy.signal.resample(data, target_len)
    data = data.astype(np.int16)

    features = generate_features_for_clip(data)

    # Works for both the defaultdict default and a plain dict supplied by a caller.
    history = state.setdefault(_MODEL_LABEL, collections.deque(maxlen=_HISTORY_FRAMES))
    if len(history) == 0:
        # Start from a flat zero line so the plot has a full window immediately.
        history.extend([0.0] * _HISTORY_FRAMES)

    # Get predictions: the model is fed one 40-value feature frame at a time.
    in_detail, out_detail = input_details[0], output_details[0]
    for row in features:
        frame = _quantize_input(row, in_detail).reshape([1, 1, 40])
        infer_model.set_tensor(in_detail['index'], frame)
        infer_model.invoke()
        raw_output = infer_model.get_tensor(out_detail['index'])
        score = _dequantize_output(raw_output, out_detail)
        history.append(float(score.ravel()[0]))

    # Make line plot
    frames = [
        pd.DataFrame({"x": np.arange(len(scores)), "y": list(scores), "Model": key})
        for key, scores in state.items()
    ]
    df = pd.concat(frames)
    plot = gr.LinePlot().update(value=df, x='x', y='y', color="Model", y_lim=(0, 1), tooltip="Model",
                                width=600, height=300, x_title="Time (frames)", y_title="Model Score",
                                color_legend_position="bottom")

    # Manually adjust how the legend is displayed
    spec = json.loads(plot["value"]["plot"])
    legend = spec["layer"][0]['encoding']['color']['legend']
    legend.update(direction="vertical", columns=4, labelFontSize=12, titleFontSize=14)
    plot["value"]['plot'] = json.dumps(spec)

    return plot, state

# Create Gradio interface and launch



In [None]:
# Description text shown by the interface (leading/trailing newlines preserved).
desc = "\nThis is a test of the pre-trained models\n"

# Live interface: microphone audio is streamed as NumPy data into
# process_audio together with a "state" value; the function's plot and updated
# state are the outputs.
# NOTE(review): `sources=[...]` appears to be the newer Gradio spelling (older
# releases used `source="microphone"`) -- confirm it matches the installed
# version and the plot API used in process_audio.
gr_int = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", streaming=True, show_label=False),
        "state",
    ],
    outputs=[
        gr.LinePlot(show_label=False),
        "state",
    ],
    live=True,
    title="openWakeWord Live Demo",
    description=desc,
    css=".flex {flex-direction: column} .gr-panel {width: 100%}",
)

# Start the app with sharing and debug mode enabled.
gr_int.launch(share=True, debug=True)