# Emotion classification

## Downloads & imports

In [None]:
!pip install aniemore

Collecting aniemore
  Downloading aniemore-1.2.3-py3-none-any.whl (26 kB)
Collecting datasets<3.0.0,>=2.9.0 (from aniemore)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python<0.3.0,>=0.2.0 (from aniemore)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting openai-whisper<20230125,>=20230124 (from aniemore)
  Downloading openai-whisper-20230124.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sox<2.0.0,>=1.4.1 (from aniemore)
  Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)
Collecting transformers==4.26.1 (from aniemore)
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m 

In [None]:
import torch
import aniemore
import os
import numpy as np
import warnings
import scipy.io.wavfile as wavf
from tqdm import tqdm
from aniemore.recognizers.voice import VoiceRecognizer
from aniemore.models import HuggingFaceModel
from zipfile import ZipFile as zlib
warnings.filterwarnings('ignore')

## RESD

In [None]:
from datasets import load_dataset

# Load the "Aniemore/resd" dataset; the result is a DatasetDict with "train" and
# "test" splits.
dataset = load_dataset("Aniemore/resd")

Downloading readme:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/391M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/94.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1116 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/280 [00:00<?, ? examples/s]

In [None]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'path', 'emotion', 'speech'],
        num_rows: 1116
    })
    test: Dataset({
        features: ['name', 'path', 'emotion', 'speech'],
        num_rows: 280
    })
})

In [None]:
# Inspect one train record: name, path, emotion label, and the decoded speech
# (sample array plus its sampling rate).
dataset["train"][0]

{'name': '32_happiness_enthusiasm_h_120',
 'path': 'happiness_enthusiasm_32/32_happiness_enthusiasm_h_120.wav',
 'emotion': 'happiness',
 'speech': {'path': '32_happiness_enthusiasm_h_120.wav',
  'array': array([-0.00018311, -0.00061035, -0.00076294, ...,  0.00085449,
          0.00048828,  0.00030518]),
  'sampling_rate': 16000}}

In [None]:
# Ground-truth emotion labels in dataset order, so position i corresponds to
# the i-th record of the split.
answers = dataset["train"][:]["emotion"]
answers_test = dataset["test"][:]["emotion"]

In [None]:
# Output folder for the exported train clips; exist_ok keeps the cell re-runnable
# instead of failing with FileExistsError on a second execution.
os.makedirs("resd", exist_ok=True)

In [None]:
# Write every train clip to resd/<index>.wav so it can later be addressed by path.
for i, arr in enumerate(dataset["train"]):
    out_f = "resd/" + str(i) + ".wav"
    speech = arr["speech"]
    # Use the rate stored with the clip instead of assuming 16 kHz.
    wavf.write(out_f, speech["sampling_rate"], speech["array"])

In [None]:
# Output folder for the exported test clips; exist_ok keeps the cell re-runnable
# instead of failing with FileExistsError on a second execution.
os.makedirs("resd_test", exist_ok=True)

In [None]:
# Write every test clip to resd_test/<index>.wav.
for i, arr in enumerate(dataset["test"]):
    out_f = "resd_test/" + str(i) + ".wav"
    speech = arr["speech"]
    # Use the rate stored with the clip instead of assuming 16 kHz.
    wavf.write(out_f, speech["sampling_rate"], speech["array"])

## Models

In [None]:
# List the entries of HuggingFaceModel.Voice (model class and checkpoint id).
[e.value for e in HuggingFaceModel.Voice]

[Model(model_cls=<class 'transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification'>, model_url='aniemore/wav2vec2-emotion-russian-resd'),
 Model(model_cls=<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, model_url='aniemore/wav2vec2-xlsr-53-russian-emotion-recognition'),
 Model(model_cls=<class 'transformers.models.wavlm.modeling_wavlm.WavLMForSequenceClassification'>, model_url='aniemore/wavlm-emotion-russian-resd'),
 Model(model_cls=<class 'transformers.models.hubert.modeling_hubert.HubertForSequenceClassification'>, model_url='aniemore/hubert-emotion-russian-resd'),
 Model(model_cls=<class 'transformers.models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForSequenceClassification'>, model_url='aniemore/unispeech-sat-emotion-russian-resd')]

In [None]:
def get_acc(model, labels=None, audio_dir="resd", skip_labels=("fear",)):
    """Count how many exported clips a recognizer labels correctly.

    Clip ``i`` is read from ``{audio_dir}/{i}.wav`` and the recognizer's single
    predicted label is compared with ``labels[i]``.

    Args:
        model: Object exposing ``recognize(path, return_single_label=True)``.
        labels: Ground-truth emotion per clip, in file-index order. When
            omitted, the notebook-level ``answers`` (train split) is used, so
            the original ``get_acc(model)`` call behaves as before.
        audio_dir: Directory holding the ``<index>.wav`` files. Pass
            ``"resd_test"`` together with ``answers_test`` to score the test
            split.
        skip_labels: Collection of ground-truth labels that are neither scored
            nor counted in the returned size. ``"fear"`` was marked "extra" in
            the original code. NOTE(review): the reason is not visible here;
            confirm before changing.

    Returns:
        ``(acc, size)``: the number of correct predictions and the number of
        clips actually scored. Accuracy is ``acc / size``.
    """
    if labels is None:
        labels = answers  # notebook-level default: train-split labels
    acc = 0
    size = 0
    for i, expected in enumerate(tqdm(labels)):
        if expected in skip_labels:
            continue
        size += 1
        predicted = model.recognize(f"{audio_dir}/{i}.wav", return_single_label=True)
        acc += predicted == expected  # bool adds as 0/1
    return acc, size

### WavLM

In [None]:
# Build a recognizer backed by the WavLM entry of HuggingFaceModel.Voice.
wavlm = VoiceRecognizer(model=HuggingFaceModel.Voice.WavLM)

config.json:   0%|          | 0.00/2.91k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

In [None]:
# Count WavLM's correct predictions on the exported RESD train clips
# (recorded run: about 3 h 08 min).
acc_wavlm, size = get_acc(wavlm)

100%|██████████| 1116/1116 [3:07:38<00:00, 10.09s/it]


In [None]:
# Number of clips that were scored, then the fraction WavLM labelled correctly.
print(size)
acc_wavlm / size

938


0.5479744136460555

### Hubert

In [None]:
# Build a recognizer backed by the Hubert entry of HuggingFaceModel.Voice.
hubert = VoiceRecognizer(model=HuggingFaceModel.Voice.Hubert)

In [None]:
# Count Hubert's correct predictions on the exported RESD train clips
# (recorded run: about 1 h 30 min).
acc_hubert, size = get_acc(hubert)

100%|████████████████████████████████████████████████████████████████████████████| 1116/1116 [1:29:40<00:00,  4.82s/it]


In [None]:
# Number of clips that were scored, then the fraction Hubert labelled correctly.
print(size)
acc_hubert / size

938


0.5597014925373134

### Wav2Vec2

In [None]:
# Build a recognizer backed by the Wav2Vec2 entry of HuggingFaceModel.Voice.
wav2vec2 = VoiceRecognizer(model=HuggingFaceModel.Voice.Wav2Vec2)

In [None]:
# Count Wav2Vec2's correct predictions on the exported RESD train clips
# (recorded run: about 1 h 35 min).
acc_wav2vec2, size = get_acc(wav2vec2)

100%|████████████████████████████████████████████████████████████████████████████| 1116/1116 [1:35:15<00:00,  5.12s/it]


In [None]:
# Number of clips that were scored, then the fraction Wav2Vec2 labelled correctly.
print(size)
acc_wav2vec2 / size

938


0.5660980810234542

### UniSpeech

In [None]:
# Build a recognizer backed by the UniSpeech entry of HuggingFaceModel.Voice.
uni_speech = VoiceRecognizer(model=HuggingFaceModel.Voice.UniSpeech)

In [None]:
# Count UniSpeech's correct predictions on the exported RESD train clips
# (recorded run: about 1 h 37 min).
acc_uni_speech, size = get_acc(uni_speech)

100%|████████████████████████████████████████████████████████████████████████████| 1116/1116 [1:36:35<00:00,  5.19s/it]


In [None]:
# Number of clips that were scored, then the fraction UniSpeech labelled correctly.
print(size)
acc_uni_speech / size

938


0.5543710021321961

## RAVDESS (English)

Радость - 03 - happiness

Грусть - 04 - sadness

Злость - 05 - anger

Удивление - 08 - surprise

Нейтрально - 01 (02 - спокойствие) - neutral

Отвращение - 07 - disgust

In [None]:
# Unpack ravdess.zip into the current working directory.
# `zlib` here is the notebook's import alias for zipfile.ZipFile.
with zlib("ravdess.zip") as archive:
    archive.extractall()
    print("Done")

Done


In [None]:
# Recognizer used for the RAVDESS cells: the WavLM entry of HuggingFaceModel.Voice.
model = HuggingFaceModel.Voice.WavLM
# Explicit device selection was left disabled by the author:
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
vr = VoiceRecognizer(model=model)

Downloading model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at aniemore/wavlm-emotion-russian-resd were not used when initializing WavLMForSequenceClassification: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Downloading (…)rocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

Some weights of the model checkpoint at aniemore/wavlm-emotion-russian-resd were not used when initializing WavLMForSequenceClassification: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at aniemore/wavlm-emotion-russian-resd and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.param

In [None]:
# Try the recognizer on a single RAVDESS file; its third dash-separated field is
# 03, which this notebook maps to "happiness".
res = vr.recognize('ravdess/03-01-03-02-01-01-01.wav')



In [None]:
# Display the per-label scores stored in `res`.
res

{'anger': 0.041953977197408676,
 'disgust': 0.00017660057346802205,
 'enthusiasm': 0.000249645730946213,
 'fear': 0.0046153864823281765,
 'happiness': 0.9528968334197998,
 'neutral': 6.682628009002656e-05,
 'sadness': 4.0788618207443506e-05}

In [None]:
# Maps a RAVDESS emotion code (third dash-separated filename field, as an int)
# to the label string the recognizer's prediction is compared against.
# Code 2 (calm) has no entry; 8 is mapped here but is not part of `use`.
ravdess_emotions = {
    1: "neutral",
    3: "happiness",
    4: "sadness",
    5: "anger",
    7: "disgust",
    8: "surprise"
}

In [None]:
# Zeroed score table with one entry per label that appears in the recognizer's
# output. NOTE(review): this rebinds `res`, and no later visible cell reads it.
res = dict.fromkeys(
    ("anger", "disgust", "enthusiasm", "fear", "happiness", "neutral", "sadness"),
    0.0,
)

In [None]:
# Emotion codes (as they appear in the filenames) that are evaluated; files with
# any other code are skipped by the evaluation loop.
use = {"01", "03", "04", "05", "07"}

In [None]:
# Number of entries in the ravdess/ folder.
len(os.listdir('ravdess'))

1440

In [None]:
cnt = 0  # files whose emotion code is in `use`
acc = 0  # of those, how many the recognizer labelled correctly
for file in os.listdir('ravdess'):
    fields = file.split('-')
    # The emotion code is the third dash-separated field. Entries not named
    # that way are skipped rather than raising IndexError.
    if len(fields) < 3 or fields[2] not in use:
        continue
    cnt += 1
    expected = ravdess_emotions[int(fields[2])]
    acc += vr.recognize('ravdess/' + file, return_single_label=True) == expected
    if cnt % 100 == 0:
        print(cnt)  # coarse progress indicator

100
200
300
400
500
600
700
800


In [None]:
# Number of correct predictions, then accuracy over the evaluated RAVDESS files.
print(acc)
print(acc / cnt)

263
0.30439814814814814
