In [21]:
import pathlib, os
import torch

In [22]:
from jc.speech import tool
import datasets
from tqdm.autonotebook import tqdm
import itertools
from more_itertools import chunked
from typing import Dict

In [23]:
os.getcwd()

'c:\\Users\\justatom\\Project\\jcommand'

In [24]:
# Load the TorchScript encoder checkpoint that sits in the current working
# directory, mapping all of its tensors onto the CPU.
model = torch.jit.load(
    str(pathlib.Path.cwd().joinpath("silero_encoder_v6.pth")),
    map_location="cpu",
)

In [25]:
isinstance(model, torch.nn.Module)

True

In [26]:
# Build the decoder from the `labels` attribute of the loaded encoder.
# NOTE(review): presumably tool.Decoder turns one encoder output into text,
# as it is used that way in `wav_to_text` — confirm against jc.speech.tool.
decoder = tool.Decoder(labels=model.labels)

In [27]:
# Load configuration "v0.02" of the "speech_commands" dataset; later cells
# read its "train" split.
dataset_commands = datasets.load_dataset("speech_commands", "v0.02")

Reusing dataset speech_commands (C:\Users\justatom\.cache\huggingface\datasets\speech_commands\v0.02\0.2.0\ba3d9a6cf49aa1313c51abe16b59203451482ccb9fee6d23c94fecabf3e206da)
100%|██████████| 3/3 [00:00<00:00, 10.29it/s]


In [28]:
# (word, class id) pairs for the Speech Commands labels used in this notebook.
# Every id must be unique: the next cell inverts this list into an
# id -> word lookup, and a duplicated id silently drops one of the words.
# NOTE(review): the dataset may define further classes after these (e.g. a
# silence/background class); any id not listed here is reported as "unknown"
# by `wav_to_text` — confirm against the dataset card.
commands = [
    ("yes", 0),
    ("no", 1),
    ("up", 2),
    ("down", 3),
    ("left", 4),
    ("right", 5),
    ("on", 6),
    ("off", 7),
    ("stop", 8),
    ("go", 9),
    ("zero", 10),
    ("one", 11),
    ("two", 12),
    ("three", 13),
    ("four", 14),
    ("five", 15),
    ("six", 16),
    ("seven", 17),
    ("eight", 18),
    ("nine", 19),
    ("bed", 20),
    ("bird", 21),
    ("cat", 22),
    ("dog", 23),
    ("happy", 24),
    ("house", 25),
    ("marvin", 26),
    ("sheila", 27),
    ("tree", 28),
    ("wow", 29),
    ("backward", 30),
    ("forward", 31),
    ("follow", 32),
    ("learn", 33),
    # Was 33, which collided with "learn" and overwrote it in the reverse lookup.
    ("visual", 34),
]

In [29]:
# Reverse lookup built from `commands`: integer class id -> spoken word.
mapping = {class_id: word for word, class_id in commands}

In [45]:
def wav_to_text(batch, encoder, decoder, device = None, sr: int = 16_000, mapping: Dict = None):
    """Run speech recognition over a batch of audio examples.

    Args:
        batch: Iterable of examples. Each example must provide
            ``example["label"]`` and ``example["audio"]["array"]``; the array
            is handed to ``torch.from_numpy`` and so must be a NumPy array.
        encoder: Callable applied to the prepared model input. It must return
            one output per example (iterated along its first dimension).
        decoder: Callable that receives a single encoder output (moved to the
            CPU) and returns the prediction for that example.
        device: Device forwarded to ``tool.prepare_model_input``. Defaults to
            ``"cuda:0"`` when CUDA is available, otherwise ``"cpu"``.
        sr: Sample rate of the audio. Currently unused; kept so existing
            callers that pass it keep working.
        mapping: Optional dict translating a raw label into a display name.
            Labels missing from the dict are reported as ``"unknown"``. When
            ``None``, the raw label is returned unchanged.

    Returns:
        A list with one dict per example, holding ``"preds"`` (decoder
        output), ``"labels"`` (mapped label) and ``"array"`` (the input
        waveform, same object as in ``batch``).

    Raises:
        AssertionError: If the encoder does not return exactly one output per
            input example.
    """
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Materialise once so one-shot iterables (generators) are supported.
    examples = list(batch)
    if not examples:
        return []

    labels = [example["label"] for example in examples]
    waveforms = [example["audio"]["array"] for example in examples]

    x = tool.prepare_model_input(
        [torch.from_numpy(waveform) for waveform in waveforms],
        device=device,
    )
    # NOTE(review): the encoder is not moved to `device` here; presumably the
    # caller must keep it on the same device as the prepared input — confirm.
    # Use the `encoder` argument (not a notebook-level global) and skip
    # autograd bookkeeping, since this is inference only.
    with torch.no_grad():
        output = encoder(x)
    assert len(output) == len(waveforms), "encoder must return one output per example"

    response = []
    for waveform, label, encoded in zip(waveforms, labels, output):
        response.append(
            {
                "preds": decoder(encoded.cpu()),
                "labels": label if mapping is None else mapping.get(label, "unknown"),
                "array": waveform,
            }
        )
    return response

In [46]:
batch_size = 1

In [47]:
num_samples = 10

In [48]:
monitor = []

In [49]:
# Transcribe the first `num_samples` batches of the training split.
# `monitor` is rebuilt here so that re-running this cell does not append
# duplicate results to the list created in an earlier cell.
monitor = []
# Slice before handing the iterator to tqdm and pass `total`, so the progress
# bar knows its length and counts every processed batch.
for batch in tqdm(
    itertools.islice(chunked(dataset_commands["train"], n=batch_size), num_samples),
    total=num_samples,
):
    response = wav_to_text(batch=batch, encoder=model, decoder=decoder, device="cpu", mapping=mapping)
    monitor.extend(response)

9it [00:00, 18.87it/s]


In [50]:
monitor

[{'preds': 'by cor',
  'labels': 'backward',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00012207,
         -0.00015259, -0.00012207])},
 {'preds': 'backward',
  'labels': 'backward',
  'array': array([ 0.00024414,  0.00106812,  0.00115967, ..., -0.0005188 ,
         -0.00012207, -0.00042725])},
 {'preds': 'big w',
  'labels': 'backward',
  'array': array([0.00027466, 0.00027466, 0.00091553, ..., 0.00567627, 0.00708008,
         0.00860596])},
 {'preds': 'backward',
  'labels': 'backward',
  'array': array([ 0.00018311,  0.00048828,  0.00067139, ..., -0.00018311,
         -0.00042725, -0.00033569])},
 {'preds': 'm',
  'labels': 'backward',
  'array': array([ 0.01748657,  0.02087402,  0.01416016, ..., -0.00268555,
         -0.00338745, -0.00393677])},
 {'preds': 'backward',
  'labels': 'backward',
  'array': array([-9.15527344e-05, -1.52587891e-04, -2.44140625e-04, ...,
         -2.13623047e-04,  3.05175781e-05, -1.52587891e-04])},
 {'preds': 'backward',
  'labels':

In [51]:
import sounddevice as sd

In [52]:
random_index = 7

In [53]:
# Play the waveform stored for the selected `monitor` entry at 16 kHz.
sd.play(monitor[random_index]["array"], samplerate=16_000)