In [1]:
import os
os.chdir(os.path.abspath(".."))

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits



# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name)

# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y


print(process_func(signal, sampling_rate))
#  Arousal    dominance valence
# [[0.5460754  0.6062266  0.40431657]]

print(process_func(signal, sampling_rate, embeddings=True))
# Pooled hidden states of last transformer layer
# [[-0.00752167  0.0065819  -0.00746342 ...  0.00663632  0.00848748
#    0.00599211]]

Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[[0.54607517 0.60622644 0.40431607]]
[[-0.00752167  0.0065819  -0.00746342 ...  0.00663632  0.00848749
   0.00599211]]


In [4]:
from src.dataload import load_audio_data, audio_select_mean_vote
from src.emotion_translation import ekman_emotion

In [5]:
df_annotations = pd.read_excel("data/annotations_2.xlsx")
df_training_audios = pd.read_excel('data/training_audios.xlsx')
df_features_complete = pd.read_csv('data/classic_features_complete.csv')

In [6]:
part_num = 1
audio_name = 'MSP-Conversation_0002.wav'

In [7]:
# Input data
data, timestamps, sr = load_audio_data(df_annotations, part_num = part_num, audio_name = audio_name)
df_input = pd.DataFrame(data = np.stack([timestamps, data]).T, columns = ['Time','Data'])

# Labels
df_label = audio_select_mean_vote(df_annotations,  part_num = part_num, audio_name = audio_name)
df_label['Emotion'] = df_label.apply(lambda row : ekman_emotion(row['Valence'], row['Arousal'], row['Dominance']), axis = 1)

# Merge
time_index = pd.concat([df_label['Time'],df_input['Time']], ignore_index = True).drop_duplicates().sort_values().reset_index(drop = True)
df_train = pd.merge(time_index, df_input, how = 'left', on = 'Time')
df_train = pd.merge(df_train, df_label, how = 'left', on = 'Time')
df_train = df_train.fillna(method='ffill').fillna(method='bfill')
df_train = df_train.set_index('Time').drop_duplicates().reset_index()

  time = pd.concat([time, temp_df], ignore_index = True)
  df_pivot = df_pivot.fillna(method='ffill')
  votation_means = pd.concat([votation_means, df_pivot.reset_index()])
  time = pd.concat([time, temp_df], ignore_index = True)
  df_pivot = df_pivot.fillna(method='ffill')
  time = pd.concat([time, temp_df], ignore_index = True)
  df_pivot = df_pivot.fillna(method='ffill')
  df_emotions_vote = df_emotions_vote.fillna(method='ffill')
  df_train = df_train.fillna(method='ffill').fillna(method='bfill')


In [9]:
end = df_train.iloc[-1]['Time']
frame_duration = 2.5
step = 0.5
start = 0
X, Y = [], [] 

while start + frame_duration < end:
    
    df_frame = df_train[(df_train['Time'] >= start) & (df_train['Time'] <= start + frame_duration)]
    
    print(start, '/', end, len(df_frame['Data'].values))
    
    emotion = df_frame.groupby('Emotion').count().sort_values(by = 'Time', ascending = False).reset_index().loc[0,'Emotion']
    a, d, v = df_frame[['Arousal','Dominance','Valence']].mean()
    
    X.append(process_func(df_frame['Data'].values, sr)[0])
    Y.append([a,v,d,emotion])
     
    start += step

0 / 284.1103 38079
0.5 / 284.1103 38773
1.0 / 284.1103 38900
1.5 / 284.1103 38158
2.0 / 284.1103 38448
2.5 / 284.1103 38505
3.0 / 284.1103 37701
3.5 / 284.1103 37746
4.0 / 284.1103 37711
4.5 / 284.1103 37588
5.0 / 284.1103 37382
5.5 / 284.1103 37601
6.0 / 284.1103 37640
6.5 / 284.1103 38093
7.0 / 284.1103 38107
7.5 / 284.1103 38275
8.0 / 284.1103 38798
8.5 / 284.1103 37003
9.0 / 284.1103 36906
9.5 / 284.1103 36955
10.0 / 284.1103 36881
10.5 / 284.1103 36589
11.0 / 284.1103 38241
11.5 / 284.1103 38731
12.0 / 284.1103 36960
12.5 / 284.1103 36966
13.0 / 284.1103 37244
13.5 / 284.1103 36912
14.0 / 284.1103 36529
14.5 / 284.1103 37983
15.0 / 284.1103 37878
15.5 / 284.1103 37046
16.0 / 284.1103 37434
16.5 / 284.1103 37709
17.0 / 284.1103 37972
17.5 / 284.1103 38158
18.0 / 284.1103 38143
18.5 / 284.1103 37969
19.0 / 284.1103 37838
19.5 / 284.1103 37545
20.0 / 284.1103 36926
20.5 / 284.1103 37745
21.0 / 284.1103 37847
21.5 / 284.1103 37340
22.0 / 284.1103 37525
22.5 / 284.1103 38101
23.0 / 284

183.0 / 284.1103 38798
183.5 / 284.1103 38575
184.0 / 284.1103 38385
184.5 / 284.1103 37992
185.0 / 284.1103 38169
185.5 / 284.1103 38215
186.0 / 284.1103 38319
186.5 / 284.1103 38639
187.0 / 284.1103 38987
187.5 / 284.1103 38954
188.0 / 284.1103 38934
188.5 / 284.1103 38970
189.0 / 284.1103 38499
189.5 / 284.1103 38706
190.0 / 284.1103 37647
190.5 / 284.1103 37669
191.0 / 284.1103 37663
191.5 / 284.1103 37893
192.0 / 284.1103 37909
192.5 / 284.1103 38762
193.0 / 284.1103 38102
193.5 / 284.1103 37963
194.0 / 284.1103 38127
194.5 / 284.1103 37845
195.0 / 284.1103 37921
195.5 / 284.1103 38088
196.0 / 284.1103 37850
196.5 / 284.1103 37350
197.0 / 284.1103 36762
197.5 / 284.1103 36299
198.0 / 284.1103 35283
198.5 / 284.1103 35337
199.0 / 284.1103 35862
199.5 / 284.1103 35444
200.0 / 284.1103 35869
200.5 / 284.1103 37153
201.0 / 284.1103 37248
201.5 / 284.1103 37380
202.0 / 284.1103 37322
202.5 / 284.1103 36787
203.0 / 284.1103 36685
203.5 / 284.1103 36756
204.0 / 284.1103 36578
204.5 / 284

In [35]:
df_features = pd.DataFrame(X, columns = ['Arousal','Dominance','Valence'])

In [39]:
df_features = (df_features - 0.5) * 100

In [40]:
df_features_real = pd.DataFrame(Y, columns = ['Arousal_Real','Dominance_Real','Valence_Real','Emotion'])

In [42]:
df_final = pd.concat([df_features, df_features_real],axis = 1)

In [43]:
from src.metricas import ccc

In [44]:
df_final

Unnamed: 0,Arousal,Dominance,Valence,Arousal_Real,Dominance_Real,Valence_Real,Emotion
0,15.479374,16.677416,-3.381297,23.030792,19.139518,18.282451,surprise
1,8.824801,12.015867,-0.924268,29.637498,23.773327,24.622130,joy
2,5.294519,7.849634,2.317530,32.978009,26.895799,29.594931,joy
3,9.405905,10.914696,1.555467,35.268256,28.493258,32.581644,joy
4,12.042046,14.230526,17.722923,34.513128,29.270606,33.464845,joy
...,...,...,...,...,...,...,...
559,-10.457813,-1.522979,-7.543582,33.889990,9.368666,37.532240,anger
560,-9.809235,-1.416820,3.928220,34.801995,8.064776,36.970464,anger
561,-16.543251,-8.957895,-0.761151,34.677230,7.128171,36.024064,anger
562,-17.047548,-11.321795,-1.300740,33.882036,7.074809,35.892542,anger


In [47]:
ccc(df_final['Arousal_Real'].values, df_final['Arousal'].values)

array([0.01561464])

In [48]:
ccc(df_final['Dominance_Real'].values, df_final['Dominance'].values)

array([0.11946536])

In [49]:
ccc(df_final['Valence_Real'].values, df_final['Valence'].values)

array([0.0089326])