<a href="https://colab.research.google.com/github/arjasc5231/moodots/blob/ACRNN/SER/CNN/ACRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
learning_rate = 0.001
training_epochs = 60
batch_size = 100

In [4]:
mel_X_train, mel_X_test, mel_Y_train, mel_Y_test = np.load("/content/drive/MyDrive/team_runner/colab/dataset/emoDB/emo_mel_more.npy", allow_pickle=True)

print(mel_X_train.shape)
print(mel_Y_train.shape)
print(mel_X_test.shape)
print(mel_Y_test.shape)

(683, 128, 128)
(683,)
(228, 128, 128)
(228,)


In [5]:
mel_X_train = np.expand_dims(mel_X_train, axis=-1)
mel_X_test = np.expand_dims(mel_X_test, axis=-1)

mel_Y_train = to_categorical(mel_Y_train, 7)
mel_Y_test = to_categorical(mel_Y_test, 7)    

mel_train_dataset = tf.data.Dataset.from_tensor_slices((mel_X_train, mel_Y_train)).batch(batch_size)
mel_test_dataset = tf.data.Dataset.from_tensor_slices((mel_X_test, mel_Y_test)).batch(batch_size)

In [22]:
# 일단 매우매우 얕게 만들었음

"""
attention 이론은 https://wikidocs.net/22893 참고
Bahdanau방식 https://github.com/philipperemy/keras-attention-mechanism 참고
ACRNN 논문에서는 LSTM의 마지막 출력값도 사용하지 않은 특이한 attention을 사용...
  논문 코드 https://github.com/xuanjihe/speech-emotion-recognition/blob/master/crnn.py 에서 u_omega의 역할을 이해 못했음 일단 빼고 구현
https://stackoverflow.com/questions/42918446/how-to-add-an-attention-mechanism-in-keras?answertab=votes#tab-top repeatvector를 이용하기도 하네
https://github.com/keras-team/keras/issues/1472 여기도 참고
"""


def create_model():
    inputs = keras.Input(shape=(128, 128, 1))
    conv1 = keras.layers.Conv2D(filters=32, kernel_size=[3, 3], activation=tf.nn.relu)(inputs)
    pool1 = keras.layers.MaxPool2D(padding='SAME')(conv1)
    conv2 = keras.layers.Conv2D(filters=64, kernel_size=[3, 3], activation=tf.nn.relu)(pool1)
    pool2 = keras.layers.MaxPool2D(padding='SAME')(conv2)
    conv3 = keras.layers.Conv2D(filters=128, kernel_size=[3, 3], activation=tf.nn.relu)(pool2)
    pool3 = keras.layers.MaxPool2D(padding='SAME')(conv3)

    trans = keras.layers.Permute((2,1,3))(pool3)
    reshape = keras.layers.Reshape((-1, 15*128))(trans) # 열 개수(freq축)*ch
    lstm = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True, dropout=0.3))(reshape) # (time=15,hidden=256)

    attention_score1 = keras.layers.Dense(1, activation='tanh')(lstm) # lstm(time,hidden)*W(hidden,1)=score(time,1)
    attention_score2 = keras.layers.Softmax()(attention_score1)
    attention = keras.layers.Dot(axes=(1,1))([lstm, attention_score2]) # (time=15, hidden=256) * (time,) => (hidden=256)
    flatten = keras.layers.Flatten()(attention)
    
    fc = keras.layers.Dense(7)(flatten)
    return keras.Model(inputs=inputs, outputs=fc)

In [23]:
model = create_model()
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 128, 128, 1) 0                                            
__________________________________________________________________________________________________
conv2d_6 (Conv2D)               (None, 126, 126, 32) 320         input_3[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_6 (MaxPooling2D)  (None, 63, 63, 32)   0           conv2d_6[0][0]                   
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 61, 61, 64)   18496       max_pooling2d_6[0][0]            
____________________________________________________________________________________________

In [16]:
def loss_fn(model, images, labels):
    logits = model(images, training=True)
    loss = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(y_pred=logits, y_true=labels, from_logits=True))    
    return loss

In [17]:
def grad(model, images, labels):
    with tf.GradientTape() as tape:
        loss = loss_fn(model, images, labels)
    return tape.gradient(loss, model.variables)

In [18]:
def evaluate(model, images, labels):
    logits = model(images, training=False)
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

In [19]:
optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

In [25]:
print('Learning started. It takes sometime.')
print('Epoch\tloss\t\ttrain acc\ttest acc')
for epoch in range(training_epochs):
    loss = 0.
    train_acc = 0.
    test_acc = 0.
    
    for images, labels in mel_train_dataset:
        grads = grad(model, images, labels)                
        optimizer.apply_gradients(zip(grads, model.variables)) 
        loss += loss_fn(model, images, labels)
        train_acc += evaluate(model, images, labels)
    loss = loss / len(mel_train_dataset)
    train_acc = train_acc / len(mel_train_dataset)
    
    for images, labels in mel_test_dataset:
        test_acc += evaluate(model, images, labels)
    test_acc = test_acc / len(mel_test_dataset)  

    print('{}  \t{:.4f}  \t{:.4f}  \t{:.4f}'.format(epoch+1, loss, train_acc, test_acc))

print('Learning Finished!')
# 3분 소요. 초반 학습 속도(수렴 속도)가 빠르다는 느낌.
# dropout=0.5: 38epoch에서 71%달성. 87epoch에서 73.67%달성
# dropout=0.3: 상대적으로 학습을 너무 못하는데? 설정을 잘못했나?
# dropout=0: 70%달성 못함


Learning started. It takes sometime.
Epoch	loss		train acc	test acc
1  	0.9038  	0.6652  	0.5076
2  	0.9191  	0.6512  	0.5386
3  	0.9130  	0.6667  	0.5162
4  	0.9107  	0.6541  	0.5067
5  	0.8855  	0.6967  	0.5295
6  	0.8921  	0.6764  	0.5329
7  	0.8876  	0.6707  	0.5229
8  	0.8363  	0.6876  	0.5495
9  	0.8553  	0.6996  	0.5043
10  	0.8370  	0.7050  	0.5276
11  	0.8645  	0.6827  	0.5429
12  	0.8368  	0.6787  	0.5429
13  	0.8941  	0.6847  	0.5333
14  	0.8154  	0.7018  	0.5029
15  	0.8196  	0.6890  	0.5181
16  	0.8241  	0.7053  	0.5481
17  	0.8244  	0.7096  	0.5686
18  	0.7826  	0.6993  	0.5543
19  	0.8002  	0.7121  	0.5819
20  	0.8163  	0.7021  	0.5567
21  	0.7899  	0.7087  	0.5062
22  	0.8154  	0.7236  	0.5348
23  	0.7909  	0.7261  	0.4990
24  	0.7740  	0.7253  	0.5581
25  	0.7715  	0.7076  	0.4910
26  	0.7752  	0.7376  	0.5448
27  	0.7776  	0.7299  	0.5800
28  	0.8014  	0.7264  	0.5481
29  	0.7844  	0.7281  	0.5667
30  	0.7700  	0.7347  	0.5486
31  	0.7282  	0.7541  	0.5705
32  	0.7401

In [None]:
conf_mat = [[0]*7 for i in range(7)] #mat[real_label]=predicted_label list

for images, labels in mel_test_dataset:
  logits = model(images, training=False)
  logits_max = tf.math.argmax(logits,1)
  labels_max = tf.math.argmax(labels,1)
  for i in range(len(logits)): conf_mat[labels_max[i]][logits_max[i]]+=1

for i in range(7): print(conf_mat[i])

[46, 0, 0, 0, 2, 0, 0]
[0, 24, 2, 0, 1, 5, 4]
[5, 0, 18, 1, 0, 1, 4]
[7, 2, 0, 7, 3, 2, 5]
[7, 0, 7, 1, 6, 0, 3]
[0, 4, 1, 1, 0, 37, 0]
[1, 4, 1, 0, 0, 0, 16]


In [None]:
conf_mat_normal = []
for i in range(7):
  s = sum(conf_mat[i])
  conf_mat_normal.append(list(map(lambda x:(x/s)*100, conf_mat[i])))

label = ['anger','boredom','disgust','fear','happy','sad','neutral']
print('\t'+'\t'.join(label))
for i in range(7):
  print(label[i], end='')
  for j in range(7): print('\t%2.0f'%conf_mat_normal[i][j], end=' ')
  print()

	anger	boredom	disgust	fear	happy	sad	neutral
anger	96 	 0 	 0 	 0 	 4 	 0 	 0 
boredom	 0 	67 	 6 	 0 	 3 	14 	11 
disgust	17 	 0 	62 	 3 	 0 	 3 	14 
fear	27 	 8 	 0 	27 	12 	 8 	19 
happy	29 	 0 	29 	 4 	25 	 0 	12 
sad	 0 	 9 	 2 	 2 	 0 	86 	 0 
neutral	 5 	18 	 5 	 0 	 0 	 0 	73 
