<a href="https://colab.research.google.com/github/anarlavrenov/Speech-Commands-Classifier/blob/main/inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

!pip install pydub

import torch
import torchaudio
from IPython import display as ipd
import sys
import warnings

# Keep UserWarning messages out of the cell output for the rest of the session.
warnings.filterwarnings(action="ignore", category=UserWarning)

# Use the GPU when the runtime exposes one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
  device = "cuda"

Mounted at /content/drive


In [None]:
# Class names in index order: predict() looks up the argmax of the model's
# logits in this list, so the order must not change.
labels_list = [
    "backward", "bed", "bird", "cat", "dog",
    "down", "eight", "five", "follow", "forward",
    "four", "go", "happy", "house", "learn",
    "left", "marvin", "nine", "no", "off",
    "on", "one", "right", "seven", "sheila",
    "six", "stop", "three", "tree", "two",
    "up", "visual", "wow", "yes", "zero",
]

In [None]:
def transform_to_spectrogram(waveform):
  """Turn a raw waveform into a per-frame normalized magnitude spectrogram.

  Args:
    waveform: Audio samples as a tensor (or any array-like accepted by
      torch.as_tensor), shaped (time,) or (channels, time). Multi-channel
      input is averaged down to a single channel.

  Returns:
    A float32 tensor shaped (frames, n_fft // 2 + 1) = (frames, 193) holding
    the square root of the STFT magnitude, standardized frame by frame.
  """
  # torch.tensor() on an existing tensor makes a fresh copy and emits a
  # UserWarning; as_tensor reuses it. detach() keeps the old "no autograd
  # history" behavior.
  waveform = torch.as_tensor(waveform, dtype=torch.float32).detach()

  if waveform.ndim > 1:
    # Averaging the channel axis is identical to the previous squeeze for mono
    # (1, time) input, and additionally downmixes stereo, which squeeze(dim=0)
    # silently left two-dimensional.
    waveform = torch.mean(waveform, dim=0)

  # power=None keeps the complex STFT; the result is (freq, frames), so
  # transpose to (frames, freq).
  spectrogram = torchaudio.transforms.Spectrogram(
      n_fft=384,
      win_length=256,
      hop_length=160,
      power=None
  )(waveform).transpose(1, 0)

  # Magnitude, then square-root compression.
  spectrogram = torch.abs(spectrogram)
  spectrogram = torch.pow(spectrogram, 0.5)

  # Standardize every frame over its frequency bins; the epsilon guards
  # against division by zero for constant frames.
  means = torch.mean(spectrogram, dim=1, keepdim=True)
  stddevs = torch.std(spectrogram, dim=1, keepdim=True)
  spectrogram = (spectrogram - means) / (stddevs + 1e-10)

  return spectrogram

def predict(model, waveform, sample_rate):
  """Classify one audio clip and return its label.

  Args:
    model: A torch.nn.Module mapping a (batch, frames, bins) spectrogram to
      per-class logits. It is switched to eval mode as a side effect.
    waveform: Audio samples at `sample_rate`, as accepted by
      transform_to_spectrogram.
    sample_rate: Sampling rate of `waveform` in Hz.

  Returns:
    The entry of labels_list at the index of the largest logit.
  """
  model.eval()

  # Feed the input on the device that actually holds the model's weights, so
  # a model loaded elsewhere than the notebook-level `device` still works.
  # Parameter-less models fall back to `device`.
  first_param = next(model.parameters(), None)
  target_device = device if first_param is None else first_param.device

  with torch.no_grad():
    # The spectrogram settings assume 16 kHz audio, so resample first.
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=sample_rate, new_freq=16000)
    spectrogram = transform_to_spectrogram(waveform).to(target_device)
    # unsqueeze(0) adds the batch dimension expected by the model.
    pred_logits = model(spectrogram.unsqueeze(0))
    pred_idx = torch.argmax(pred_logits, dim=1)

  return labels_list[pred_idx.item()]

In [None]:
class EmotionClassifierModel(torch.nn.Module):
  """Convolutional classifier over spectrogram frames.

  Two Conv2d + BatchNorm2d + ReLU blocks are followed by an average over the
  time axis and two linear layers that emit one logit per class.

  NOTE(review): the name says "Emotion", yet this notebook maps the outputs to
  spoken-word labels. The name is left unchanged; presumably saved checkpoints
  refer to the class by name -- confirm before renaming.

  Args:
    output_dim: Width of the hidden linear layer.
    n_mels: Number of frequency bins per input frame. The default 193 equals
      n_fft // 2 + 1 for the n_fft=384 used by transform_to_spectrogram.
    n_classes: Number of output logits.
  """

  def __init__(self, output_dim, n_mels=193, n_classes=35):
    super().__init__()

    self.conv_block1 = self.conv_block(
        in_channels=1,
        out_channels=32,
        kernel_size=(11, 41),
        stride=(2, 2),
        padding=(5, 20),
        dilation=1,
        bias=False
    )

    self.conv_block2 = self.conv_block(
        in_channels=32,
        out_channels=32,
        kernel_size=(11, 21),
        stride=(1, 2),
        padding=(5, 10),
        dilation=1,
        bias=False
    )

    # Both conv blocks use stride 2 on the frequency axis, so derive the
    # flattened feature size from n_mels instead of hard-coding 49 (which is
    # only right for n_mels=193).
    freq_bins = self._strided_size(self._strided_size(n_mels))

    self.linear_block1 = self.linear_block(
        in_features=32 * freq_bins,
        out_features=output_dim,
        activation=torch.nn.ReLU(),
        dropout=True
    )

    self.linear_block2 = self.linear_block(
        in_features=output_dim,
        out_features=n_classes,
        activation=None,
        dropout=False
    )

  @staticmethod
  def _strided_size(size, stride=2):
    """Output length of a conv with dilation 1 and padding (kernel - 1) // 2."""
    return (size - 1) // stride + 1

  def forward(self, src):
    """Compute class logits for a (batch, frames, bins) spectrogram batch.

    Returns:
      A tensor of shape (batch, n_classes).
    """
    # Add the single input channel: (batch, 1, frames, bins).
    src = torch.unsqueeze(src, 1)
    src = self.conv_block1(src)
    src = self.conv_block2(src)

    # (batch, channels, frames, bins) -> (batch, frames, channels, bins)
    src = src.permute(0, 2, 1, 3)

    # Average over time so clips of any length give a fixed-size vector.
    src = torch.mean(src, dim=1)
    src = src.reshape(src.shape[0], src.shape[1] * src.shape[2])

    src = self.linear_block1(src)
    output = self.linear_block2(src)

    return output

  def conv_block(self, in_channels, out_channels,
                 kernel_size, stride, padding, dilation, bias):
    """Build Conv2d -> BatchNorm2d -> ReLU with Xavier-initialized conv weights."""
    conv = torch.nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=bias
    )
    block = torch.nn.Sequential(
        conv,
        torch.nn.BatchNorm2d(out_channels),
        torch.nn.ReLU()
    )

    torch.nn.init.xavier_uniform_(
        conv.weight,
        gain=torch.nn.init.calculate_gain("relu"))

    return block

  def linear_block(self, in_features, out_features,
                   activation=None, dropout=False):
    """Build Linear [-> activation] [-> Dropout(0.5)] with Xavier-initialized weights.

    Args:
      in_features: Input width of the linear layer.
      out_features: Output width of the linear layer.
      activation: Optional module applied after the linear layer.
      dropout: When truthy, append Dropout(0.5) as the last layer.
    """
    linear = torch.nn.Linear(in_features, out_features)

    # Collect the layers first so the block is built in one go; this also
    # avoids Sequential.append, which older PyTorch releases lack.
    layers = [linear]
    if activation is not None:
      layers.append(activation)
    if dropout:
      layers.append(torch.nn.Dropout(0.5))

    torch.nn.init.xavier_uniform_(
        linear.weight,
        gain=torch.nn.init.calculate_gain("linear"))

    return torch.nn.Sequential(*layers)

In [None]:
def record(seconds=1):
    """Record from the browser microphone in Colab and load the clip.

    The recording is captured by JavaScript running in the Colab output frame,
    returned to Python as a base64 data URL, converted to WAV with pydub and
    written to `_audio.wav` in the working directory.

    Args:
        seconds: Recording duration in seconds.

    Returns:
        The result of torchaudio.load on the saved file, which the caller
        unpacks as (waveform, sample_rate).
    """
    # Colab-only / optional dependencies are imported here so that merely
    # defining this function does not require them.
    from google.colab import output as colab_output
    from base64 import b64decode
    from io import BytesIO
    from pydub import AudioSegment

    # JavaScript helper: record for `time` milliseconds and resolve with the
    # audio as a data URL. The tracks are stopped once recording ends so the
    # browser releases the microphone instead of keeping it open.
    recorder_js = (
        "const sleep  = time => new Promise(resolve => setTimeout(resolve, time))\n"
        "const b2text = blob => new Promise(resolve => {\n"
        "  const reader = new FileReader()\n"
        "  reader.onloadend = e => resolve(e.srcElement.result)\n"
        "  reader.readAsDataURL(blob)\n"
        "})\n"
        "var record = time => new Promise(async resolve => {\n"
        "  stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n"
        "  recorder = new MediaRecorder(stream)\n"
        "  chunks = []\n"
        "  recorder.ondataavailable = e => chunks.push(e.data)\n"
        "  recorder.start()\n"
        "  await sleep(time)\n"
        "  recorder.onstop = async ()=>{\n"
        "    stream.getTracks().forEach(track => track.stop())\n"
        "    blob = new Blob(chunks)\n"
        "    text = await b2text(blob)\n"
        "    resolve(text)\n"
        "  }\n"
        "  recorder.stop()\n"
        "})"
    )

    print(f"Recording started for {seconds} seconds.")
    ipd.display(ipd.Javascript(recorder_js))
    # The JS side expects milliseconds.
    data_url = colab_output.eval_js("record(%d)" % (seconds * 1000))
    print("Recording ended.")

    # A data URL looks like "data:<mime>;base64,<payload>"; keep the payload.
    audio_bytes = b64decode(data_url.split(",")[1])

    fileformat = "wav"
    filename = f"_audio.{fileformat}"
    # pydub detects the browser's container format and re-encodes it as WAV.
    AudioSegment.from_file(BytesIO(audio_bytes)).export(filename, format=fileformat)
    return torchaudio.load(filename)

In [None]:
# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints you trust.
# The file is used as a ready-to-call module below, so it has to be a fully
# pickled model; recent PyTorch releases default to weights_only=True and
# reject those, hence the explicit weights_only=False.
# map_location follows the `device` chosen at the top of the notebook, so
# loading also works on CPU-only runtimes (a hard-coded "cuda" fails there).
model = torch.load(
    "YOUR_PATH_TO_MODEL",
    map_location=device,
    weights_only=False
    )

  model = torch.load(


In [None]:
# record() drives the microphone through Colab's front end, so the demo is
# skipped when the notebook runs anywhere else.
if "google.colab" in sys.modules:
    waveform, sample_rate = record()
    label = predict(model, waveform=waveform, sample_rate=sample_rate)
    print(f"You said: {label}")

Recording started for 1 seconds.


<IPython.core.display.Javascript object>

Recording ended.
left


  waveform = torch.tensor(waveform, dtype=torch.float32)
