In [2]:
import numpy as np
import os
import sys

import keras
from keras.models import Sequential, Model, load_model
from keras.layers.core import Dense, Activation
from keras.layers import LSTM, Input, Flatten, Concatenate, Embedding, Convolution1D,Dropout, Conv2D, Conv1D, Bidirectional
from keras.layers.wrappers import TimeDistributed

from keras.optimizers import SGD, Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from sklearn.preprocessing import label_binarize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import to_categorical

import tensorflow as tf
from keras import backend as K
from keras import regularizers, constraints, initializers, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec
from keras.callbacks import EarlyStopping,TensorBoard, ModelCheckpoint
from keras_self_attention import SeqSelfAttention
from keras_multi_head import MultiHeadAttention

import pickle as plk

from utilz import *
# Let TensorFlow allocate GPU memory on demand (allow_growth) and register
# the resulting session as the one Keras uses.
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)
K.set_session(sess)

In [None]:
# Directory holding the pickled train/test splits.
SPLIT_DIR = '../data_clean/split/'


def _load_split(name):
    """Unpickle and return the object stored in SPLIT_DIR + name.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    split files produced by this project.
    """
    with open(SPLIT_DIR + name, 'rb') as handle:
        return plk.load(handle)


# Training utterances: the "ori" set and the "EA" set.
data_utt_ori_tra = _load_split('data_utt_ori_tra')
data_utt_EA_tra = _load_split('data_utt_EA_tra')

# Matching "pre" utterances for both training sets.
data_utt_ori_pre_tra = _load_split('data_utt_ori_pre_tra')
data_utt_EA_pre_tra = _load_split('data_utt_EA_pre_tra')

# Test utterances and their "pre" counterparts.
data_utt_pre_tes = _load_split('data_utt_pre_tes')
data_utt_tes = _load_split('data_utt_tes')

# Emotion / gender labels for the training and test splits.
# NOTE(review): 'data_emos_tra_' and 'data_gens_tes_' end with an underscore
# while their siblings do not -- confirm these are the intended files.
data_emos_tra = _load_split('data_emos_tra_')
data_gens_tra = _load_split('data_gens_tra')
data_emos_tes = _load_split('data_emos_tes')
data_gens_tes = _load_split('data_gens_tes_')

In [None]:
# Oversampling factor per emotion class id: every training sample of class
# `emo` is repeated loop_n[emo] times.
loop_n = {0:3,1:2,2:1,3:2,4:2,5:1}


def _oversample(utts, utts_pre, emos, gens):
    """Build [utt, utt_pre, emo, gen] records, repeating each loop_n[emo] times.

    The four sequences are walked in lockstep with zip(), so the result is
    silently truncated to the shortest of them.
    """
    return [
        [utt, utt_pre, emo, gen]
        for (utt, utt_pre, emo, gen) in zip(utts, utts_pre, emos, gens)
        for _ in range(loop_n[emo])
    ]


data = _oversample(data_utt_ori_tra, data_utt_ori_pre_tra,
                   data_emos_tra, data_gens_tra)
# FIX: the EA training utterances were previously zipped with the *test*
# labels (data_emos_tes / data_gens_tes), which attaches test labels to
# training samples and truncates the EA set to the test-set length.
# NOTE(review): assumes the EA set is aligned one-to-one with the training
# labels -- confirm against the code that produced the split files.
data += _oversample(data_utt_EA_tra, data_utt_EA_pre_tra,
                    data_emos_tra, data_gens_tra)

# Shuffle whole records so features and labels stay paired.
np.random.shuffle(data)

# Split the shuffled records back into one list per field.
tra_utt = [record[0] for record in data]
tra_utt_pre = [record[1] for record in data]
tra_emos = [record[2] for record in data]
tra_gens = [record[3] for record in data]

# The test split is used as loaded (no oversampling, no shuffling).
tes_utt_pre, tes_utt = data_utt_pre_tes, data_utt_tes
tes_emos, tes_gens = data_emos_tes, data_gens_tes

# Convert features to float32 arrays and labels to one-hot float32 arrays
# (6 emotion classes, 2 gender classes).
data_type = 'float32'
tra_utt = np.asarray(tra_utt, data_type)
tra_utt_pre = np.asarray(tra_utt_pre, data_type)
tra_emos = to_categorical(tra_emos, 6, dtype=data_type)
tra_gens = to_categorical(tra_gens, 2, dtype=data_type)

tes_utt = np.asarray(tes_utt, data_type)
tes_utt_pre = np.asarray(tes_utt_pre, data_type)
tes_emos = to_categorical(tes_emos, 6, dtype=data_type)
tes_gens = to_categorical(tes_gens, 2, dtype=data_type)

In [None]:
# --- Input geometry ---------------------------------------------------------
features_number = 384
max_len = 256

# --- Network ----------------------------------------------------------------
lstm_cells = 128        # units of the LSTM / EmoEncDec layers
hidden_unit = 512       # not referenced by the cells visible in this notebook
dropout_rate = 0.35
classes = 6             # size of the 'emo' softmax output

# --- Training ---------------------------------------------------------------
epochs = 5000           # upper bound; EarlyStopping normally ends training sooner
batch = 32

In [None]:
# Two inputs with the same shape, taken from the configuration constants
# instead of hard-coded literals (still (384, 256)).
# NOTE(review): the LSTM treats the first axis as the time axis, i.e.
# `features_number` steps of `max_len` values each -- confirm this matches
# the layout of the stored utterance arrays.
pre_utt = Input((features_number, max_len))
utt = Input((features_number, max_len))

# Shared encoder: the same Sequential instance (and therefore the same
# weights) is applied to both inputs below.
Audio_processing = Sequential()
Audio_processing.add(Bidirectional(LSTM(lstm_cells, return_sequences=True, recurrent_dropout = 0.2)))
Audio_processing.add(MultiHeadAttention(head_num=8))
Audio_processing.add(Dropout(dropout_rate))
Audio_processing.add(EmoEncDec(lstm_cells,lstm_cells, name='EmoEncDec'))
Audio_processing.add(Dropout(dropout_rate))
Audio_processing.add(Flatten())
Audio_processing.add(Dense(256))

pre_utt_feature = Audio_processing(pre_utt)
utt_feature = Audio_processing(utt)

# Concatenate both encodings and feed a shared dense layer that drives the
# two softmax heads: 'emo' (`classes` outputs) and 'gen' (2 outputs).
merge = Concatenate(axis=-1)([pre_utt_feature, utt_feature])

R = Dense(64)(merge)
emo = Dense(classes, name='emo', activation='softmax')(R)
gen = Dense(2, name='gen', activation='softmax')(R)

model = Model(inputs=[pre_utt, utt],outputs=[emo, gen])

model.summary()

In [None]:
# Output directory for TensorBoard logs, the model checkpoint and the
# evaluation artefacts written by later cells.
file_path_root = './audio/'
# Create it up front so the checkpoint / report / figure writes cannot fail
# on a missing directory.
os.makedirs(file_path_root, exist_ok=True)

model_file = file_path_root+'audio_model.h5'

# Both EarlyStopping and ModelCheckpoint track the validation accuracy of
# the 'emo' output.
callback_list = [
    TensorBoard(log_dir=file_path_root),
    EarlyStopping(
        monitor='val_emo_acc',
        patience=100,
        verbose=1,
        mode='auto'
    ),
    ModelCheckpoint(
        filepath=model_file,
        monitor='val_emo_acc',
        # FIX: was the string 'True', which only behaved correctly because
        # any non-empty string is truthy.
        save_best_only=True,
        verbose=1,
        mode='auto',
        period=1
    ),
]

In [None]:
# Both heads are trained with categorical cross-entropy and contribute
# equally to the total loss; accuracy is reported for each output.
output_losses = {
    'emo': 'categorical_crossentropy',
    'gen': 'categorical_crossentropy',
}
output_loss_weights = {'emo': 1., 'gen': 1.}

model.compile(
    optimizer='adam',
    loss=output_losses,
    loss_weights=output_loss_weights,
    metrics=['acc'],
)

In [None]:
# Inputs are ordered [previous utterance, current utterance] and targets
# [emotion, gender], matching the model's inputs and outputs.
train_inputs = [tra_utt_pre, tra_utt]
train_targets = [tra_emos, tra_gens]

# NOTE(review): the test split is also the validation set here, so early
# stopping and checkpoint selection are driven by test data.
validation_inputs = [tes_utt_pre, tes_utt]
validation_targets = [tes_emos, tes_gens]

training = model.fit(
    train_inputs,
    train_targets,
    batch_size=batch,
    epochs=epochs,
    callbacks=callback_list,
    validation_data=(validation_inputs, validation_targets),
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd


def _save_normalized_heatmap(matrix, class_names, out_path, row_label, col_label):
    """Row-normalise `matrix`, draw it as an annotated heatmap and save it.

    Each row is divided by its own sum; a row whose sum is 0 is left as all
    zeros instead of producing NaN through a division by zero. The heatmap
    is drawn on its own new figure so consecutive calls do not paint over
    each other.

    Returns the normalised matrix as a DataFrame indexed and labelled by
    `class_names`.
    """
    counts = np.asarray(matrix, dtype=float)
    row_sums = counts.sum(axis=1, keepdims=True)
    normalized = np.divide(counts, row_sums,
                           out=np.zeros_like(counts), where=row_sums != 0)
    frame = pd.DataFrame(normalized, index=class_names, columns=class_names)

    fig, ax = plt.subplots()
    sn.heatmap(frame, annot=True, ax=ax)
    ax.set(xlabel=col_label, ylabel=row_label)
    fig.savefig(out_path)
    return frame


# Reload the checkpoint written by ModelCheckpoint; the custom layers have
# to be passed explicitly so Keras can deserialise them.
model = load_model(model_file, custom_objects={'MultiHeadAttention':MultiHeadAttention, 'EmoEncDec': EmoEncDec})

labels = ['hap', 'sad', 'neu', 'ang', 'exc', 'fru']
# Explicit class ids keep the report and the matrix at len(labels) classes
# even when a class never occurs in the test labels or the predictions.
label_ids = list(range(len(labels)))

# Output 0 of the model is the 'emo' head; argmax turns the softmax scores
# and the one-hot targets into class ids.
predicted_test_labels = model.predict([tes_utt_pre, tes_utt])[0].argmax(axis=1)
numeric_test_labels = np.array(tes_emos).argmax(axis=1)

report_filename = file_path_root+'Results_4digits.txt'

# Compute the report once, then write it to disk and echo it.
report = classification_report(numeric_test_labels, predicted_test_labels,
                               labels=label_ids, target_names=labels, digits=4)
with open(report_filename, 'w', encoding='utf-8') as f:
    print(report, file=f)
print(report)

print('   '+' '.join(labels))
cm = confusion_matrix(y_true=numeric_test_labels.tolist(),
                      y_pred=predicted_test_labels.tolist(),
                      labels=label_ids)
print(cm)

# Rows are true classes: row-normalising gives the per-class recall view.
df_cm = _save_normalized_heatmap(cm, labels, file_path_root+'cm.jpg',
                                 row_label='true', col_label='predicted')

# After transposing, rows are predicted classes: row-normalising gives the
# per-class precision view.
cm = np.transpose(cm)
df_cm = _save_normalized_heatmap(cm, labels, file_path_root+'cm_precision.jpg',
                                 row_label='predicted', col_label='true')
# Kept for any later cell that still reads the list-of-lists form.
nor_cm = df_cm.values.tolist()