## Main part

In [None]:
!pip install onnx lazycon coremltools sounddevice

Collecting sounddevice
  Obtaining dependency information for sounddevice from https://files.pythonhosted.org/packages/39/ae/5e84220bfca4256e4ca2a62a174636089ab6ff671b5f9ddd7e8238587acd/sounddevice-0.4.6-py3-none-win_amd64.whl.metadata
  Downloading sounddevice-0.4.6-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading sounddevice-0.4.6-py3-none-win_amd64.whl (199 kB)
   ---------------------------------------- 0.0/199.7 kB ? eta -:--:--
   ------- ------------------------------- 41.0/199.7 kB 991.0 kB/s eta 0:00:01
   ---------------------------------- ----- 174.1/199.7 kB 2.1 MB/s eta 0:00:01
   ---------------------------------------- 199.7/199.7 kB 2.0 MB/s eta 0:00:00
Installing collected packages: sounddevice
Successfully installed sounddevice-0.4.6


In [None]:
import torch
import onnx
import toml
import librosa
import lazycon
import time
import os, sys

import coremltools as ct
import numpy as np
import soundfile as sf
import sounddevice as sd

sys.path.append('../experiments')
import core

In [None]:
def create_features(
    data: np.ndarray,
    hop_length_coef: float = 0.01,
    win_length_coef: float = 0.02,
    sample_rate: int = 16000,
    n_mels: int = 64,
) -> np.ndarray:
    """
    Compute a decibel-scaled Mel spectrogram for a raw audio signal.

    As an input all models use standard speech features:
    64 Mel-filterbank calculated from 20ms windows with a 10ms overlap.

    Args:
        data: audio samples; must contain at least one element.
        hop_length_coef: hop between consecutive frames, in seconds.
        win_length_coef: analysis window (used as the FFT size), in seconds.
        sample_rate: sampling rate of ``data`` in Hz.
        n_mels: number of Mel bands.

    Returns:
        The Mel power spectrogram converted to decibels, with the
        spectrogram's own maximum as the reference level.

    Raises:
        AttributeError: if ``data`` is empty.
    """

    hop_length = int(sample_rate * hop_length_coef)
    win_length = int(sample_rate * win_length_coef)

    # Guard clause: the exception type is kept for existing callers, but it
    # now says what went wrong instead of being raised bare.
    if len(data) == 0:
        raise AttributeError(
            "create_features: `data` is empty, there is nothing to extract features from"
        )

    spec = librosa.feature.melspectrogram(
        y=data,
        sr=sample_rate,
        hop_length=hop_length,
        n_fft=win_length,
        n_mels=n_mels,
    )
    return librosa.power_to_db(spec, ref=np.max)

In [None]:
def create_features_for_audio(
    wav_name: str,
    hop_length_coef: float = 0.01,
    win_length_coef: float = 0.02,
    sample_rate: int = 16000,
    n_mels: int = 64,
) -> np.ndarray:
    """
    Load an audio file and compute its decibel-scaled Mel spectrogram.

    As an input all models use standard speech features:
    64 Mel-filterbank calculated from 20ms windows with a 10ms overlap.

    The file is read with ``librosa.load(wav_name, sr=sample_rate)`` and the
    samples are handed to ``create_features``, so both entry points share one
    feature-extraction implementation.

    Args:
        wav_name: path of the audio file to load.
        hop_length_coef: hop between consecutive frames, in seconds.
        win_length_coef: analysis window (used as the FFT size), in seconds.
        sample_rate: sampling rate requested from ``librosa.load``, in Hz.
        n_mels: number of Mel bands.

    Returns:
        The Mel power spectrogram of the file, converted to decibels.

    Raises:
        AttributeError: if the loaded signal is empty.
    """

    data, rate = librosa.load(wav_name, sr=sample_rate)
    # Use the rate reported by the loader so that the frame sizes always
    # match the samples that were actually returned.
    return create_features(
        data,
        hop_length_coef=hop_length_coef,
        win_length_coef=win_length_coef,
        sample_rate=rate,
        n_mels=n_mels,
    )

In [None]:
def index2name(
    index: int
) -> str:
    """
    Translate a class index into its emotion label.

    Args:
        index: class index, one of 0 (angry), 1 (sad), 2 (neutral),
            3 (positive).

    Returns:
        The label that corresponds to ``index``.

    Raises:
        AttributeError: if ``index`` is not a known class index.
    """
    class_dict = {0:"angry", 1:"sad", 2:"neutral", 3:"positive"}

    # Membership test instead of a range comparison: a check of the form
    # `index > len(class_dict)` accepts index == len(class_dict), which then
    # fails with KeyError rather than the documented AttributeError.
    if index not in class_dict:
        raise AttributeError(f"Unknown class index: {index!r}")

    return class_dict[index]

In [None]:
# Directory holding train.config and the checkpoint, plus the checkpoint's file name.
dir_path, model_name = './model/', 'podcasts_finetune_old_w_lr_1e-3_try1'
# Device the model and its inputs are moved to.
device = 'cpu'

In [None]:
from torchvision.models.mobilenetv2 import Conv2dNormActivation as ConvBNReLU

In [None]:
config_path = os.path.join(dir_path, "train.config")
assert os.path.exists(config_path), f"No train.config in {dir_path}"

model_path = os.path.join(dir_path, model_name)
# Fail fast with a clear message: without the checkpoint nothing below can
# work, and continuing would only fail later inside torch.load.
if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"There is no saved model {model_path}. Nothing to inference"
    )

# Build the architecture described by the training config.
cfg = lazycon.load(config_path)
model = cfg.model

model.to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source. Weights are mapped onto the configured device.
model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))
# Double precision; presumably chosen so the model accepts float64 features
# (the recording cell captures float64 audio) - confirm.
model = model.double()
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

ConvSelfAttentionMobileNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=4, bias=False)
          (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(3

In [None]:
# List the audio devices sounddevice can see. In the table printed below,
# `>` marks the device used for input and `<` the one used for output.
sd.query_devices()

   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
>  1 Набор микрофонов (Технология In, MME (2 in, 0 out)
   2 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  3 Динамики (Realtek(R) Audio), MME (0 in, 2 out)
   4 Первичный драйвер записи звука, Windows DirectSound (2 in, 0 out)
   5 Набор микрофонов (Технология Intel® Smart Sound), Windows DirectSound (2 in, 0 out)
   6 Первичный звуковой драйвер, Windows DirectSound (0 in, 2 out)
   7 Динамики (Realtek(R) Audio), Windows DirectSound (0 in, 2 out)
   8 Динамики (Realtek(R) Audio), Windows WASAPI (0 in, 2 out)
   9 Набор микрофонов (Технология Intel® Smart Sound), Windows WASAPI (2 in, 0 out)
  10 Стерео микшер (Realtek HD Audio Stereo input), Windows WDM-KS (2 in, 0 out)
  11 Headphones 1 (Realtek HD Audio 2nd output with SST), Windows WDM-KS (0 in, 2 out)
  12 Headphones 2 (Realtek HD Audio 2nd output with SST), Windows WDM-KS (0 in, 2 out)
  13 Динамик ПК (Realtek HD Audio 2nd output with SST), Windows WDM-KS (2 in, 0 out)

In [None]:
# Record a short clip from the microphone and play it back as a sanity check.
# fs is the sampling rate in Hz; it equals the default sample_rate of create_features.
fs=16000
duration = 5 # seconds
# Start capturing duration * fs mono frames as float64.
myrecording = sd.rec(duration * fs, samplerate=fs, channels=1, dtype='float64')
print ("Recording Audio")
# Wait here until the capture has finished before touching the buffer.
sd.wait()
print ("Audio recording complete , Play Audio")
# NOTE(review): no sd.wait() follows, so playback presumably continues after
# the cell returns - confirm this is intended.
sd.play(myrecording, fs)

Recording Audio
Audio recording complete , Play Audio


In [None]:
myrecording.shape

(80000, 1)

In [None]:
myrecording

array([[ 0.00000000e+00],
       [ 0.00000000e+00],
       [-3.05175781e-05],
       ...,
       [-9.06372070e-03],
       [-2.72827148e-02],
       [-2.60009766e-02]])

In [None]:
# The recording is a single-channel (frames, 1) column; take that one channel
# as a flat signal and turn it into Mel features.
feat = create_features(myrecording[:, 0])

In [None]:
#feat = create_features_for_audio('wavs/c9780b567a8de31862971aa5412bf834.wav')

In [None]:
print("Calculating predicts")
# Indexing with [None, None] prepends two singleton axes to the 2-D feature
# map (presumably batch and channel - confirm against the model definition).
inputs = torch.from_numpy(feat).to(device)[None, None]

# Gradients are not needed for inference.
with torch.no_grad():
    probs = model(inputs)

Calculating predicts


In [None]:
# Pick the highest-scoring class in each row of the model output, then show
# the label of the first (and only) item in the batch.
pred_class = probs.cpu().numpy().argmax(axis=1)
index2name(pred_class[0])

'positive'

In [None]:
probs

tensor([[-2.3097,  1.7922,  1.0512, -1.7895]], dtype=torch.float64)

## Added part (testing)

### Downloads & imports & dataset

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


In [None]:
import datasets
from datasets import load_dataset

# Fetch the Aniemore/resd emotional speech dataset from the Hugging Face Hub;
# it is used below to evaluate the model loaded earlier.
dataset = load_dataset("Aniemore/resd")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/391M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/94.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1116 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/280 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'path', 'emotion', 'speech'],
        num_rows: 1116
    })
    test: Dataset({
        features: ['name', 'path', 'emotion', 'speech'],
        num_rows: 280
    })
})

In [None]:
dataset["train"][0]

{'name': '32_happiness_enthusiasm_h_120',
 'path': 'happiness_enthusiasm_32/32_happiness_enthusiasm_h_120.wav',
 'emotion': 'happiness',
 'speech': {'path': '32_happiness_enthusiasm_h_120.wav',
  'array': array([-0.00018311, -0.00061035, -0.00076294, ...,  0.00085449,
          0.00048828,  0.00030518]),
  'sampling_rate': 16000}}

### Testing

In [None]:
# Read only the label column: slicing the whole split first (`[:]`) would
# materialise every column, including the audio, just to look at the labels.
answers = dataset["train"]["emotion"]
set(answers)

{'anger', 'disgust', 'enthusiasm', 'fear', 'happiness', 'neutral', 'sadness'}

In [None]:
def perform_test(arr):
    """
    Predict the emotion label for one raw audio signal.

    Args:
        arr: audio samples as a 1-D sequence/array, or an (N, 1) column.

    Returns:
        The label produced by ``index2name`` for the highest-scoring class.

    Raises:
        ValueError: if ``arr`` cannot be viewed as a single channel of N samples.
        AttributeError: if the signal is empty (raised by ``create_features``).
    """
    # asarray also accepts plain lists; float64 matches the model, which the
    # loading cell converts with `.double()`.
    signal = np.asarray(arr, dtype=np.float64)
    # Collapse (N,) or (N, 1) to a flat (N,) signal; any layout with more
    # than one value per sample makes reshape raise ValueError.
    signal = signal.reshape(signal.shape[0])

    feat = create_features(signal)
    # Prepend two singleton axes in front of the 2-D feature map.
    inputs = torch.from_numpy(feat).to(device).unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        probs = model(inputs)
    pred_class = np.argmax(probs.cpu().numpy(), axis=1)
    return index2name(pred_class[0])

In [None]:
# Maps the model's class names (keys) to the dataset's emotion labels (values).
dictionary = dict(
    angry="anger",
    sad="sadness",
    neutral="neutral",
    positive="happiness",
)

In [None]:
from tqdm import tqdm

#### Train part

In [None]:
# Accuracy bookkeeping for the train split: samples evaluated / predicted correctly.
all_cnt = correct_cnt = 0
for i in tqdm(range(len(dataset["train"]))):
    sample = dataset["train"][i]
    correct = sample["emotion"] # anger / sadness / neutral / happiness
    # Skip emotions that none of the model's classes maps to.
    if correct not in dictionary.values():
        continue
    all_cnt += 1
    predicted = perform_test(sample["speech"]["array"]) # angry / sad / neutral / positive
    # A bool adds as 0 or 1, so this counts the matching predictions.
    correct_cnt += dictionary[predicted] == correct

100%|██████████████████████████████████████████████████████████████████████████████| 1116/1116 [05:29<00:00,  3.38it/s]


In [None]:
# Accuracy: share of evaluated samples whose predicted label matched the
# dataset label (counters come from the evaluation loop run just before).
correct_cnt / all_cnt

0.35443037974683544

In [None]:
all_cnt

632

#### Test part

In [None]:
# Accuracy bookkeeping for the test split: samples evaluated / predicted correctly.
all_cnt = correct_cnt = 0
for i in tqdm(range(len(dataset["test"]))):
    sample = dataset["test"][i]
    correct = sample["emotion"] # anger / sadness / neutral / happiness
    # Skip emotions that none of the model's classes maps to.
    if correct not in dictionary.values():
        continue
    all_cnt += 1
    predicted = perform_test(sample["speech"]["array"]) # angry / sad / neutral / positive
    # A bool adds as 0 or 1, so this counts the matching predictions.
    correct_cnt += dictionary[predicted] == correct

100%|████████████████████████████████████████████████████████████████████████████████| 280/280 [00:41<00:00,  6.68it/s]


In [None]:
# Accuracy: share of evaluated samples whose predicted label matched the
# dataset label (counters come from the evaluation loop run just before).
correct_cnt / all_cnt

0.379746835443038