In [1]:
import os
# Configure CUDA through environment variables; these assignments sit in the
# first cell, ahead of the Keras imports in the following cell.
#   CUDA_DEVICE_ORDER    -> "PCI_BUS_ID" (see issue #152)
#   CUDA_VISIBLE_DEVICES -> ""  (empty device list; presumably forces a
#                                CPU-only run -- confirm this is intended)
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",
})

In [156]:
import numpy as np
import os
import sys

import wave
import copy
import math

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation
from keras.layers import LSTM, Input, Flatten, Add, concatenate, Embedding, Convolution1D, Dropout, Dense, merge, Bidirectional
from keras.layers.wrappers import TimeDistributed
from keras.layers.convolutional import Conv2D
from keras.optimizers import SGD, Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from sklearn.preprocessing import label_binarize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras import optimizers

from features import *
from helper import *
from attention_helper import *

In [3]:
# --- Notebook-wide configuration -------------------------------------------

# Parent directory of the (symlink-resolved) current working directory.
code_path = os.path.split(os.path.realpath(os.getcwd()))[0]

# The four emotion labels used as classification targets.
emotions_used = np.array([
    'ang',
    'exc',
    'neu',
    'sad',
])

# NOTE(review): machine-specific absolute path; it must be edited before the
# notebook can run anywhere else.
data_path = '/media/bagus/data01/dataset/IEMOCAP_full_release/'

sessions = [
    'Session1',
    'Session2',
    'Session3',
    'Session4',
    'Session5',
]

# Presumably the audio sampling rate in Hz -- not used in the visible cells.
framerate = 16000

In [4]:
import pickle

# Read the pre-collected dataset from `data_collected.pickle` under data_path.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by a trusted source.
pickle_file = data_path +'data_collected.pickle'
with open(pickle_file, 'rb') as pickle_in:
    data2 = pickle.load(pickle_in)

In [5]:
# One transcription per entry of `data2`, in the same order as `data2`.
text = [ses_mod['transcription'] for ses_mod in data2]

# Fixed length of every padded word-id sequence fed to the text branch.
MAX_SEQUENCE_LENGTH = 500

# Fit the vocabulary on all transcriptions, then map each one to a list of
# integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

token_tr_X = tokenizer.texts_to_sequences(text)

# Pad (or truncate) every id sequence to exactly MAX_SEQUENCE_LENGTH entries.
# The former `x_train_text = []` placeholder was a dead store and is gone.
x_train_text = sequence.pad_sequences(token_tr_X, maxlen=MAX_SEQUENCE_LENGTH)

In [6]:
import codecs

# Dimensionality of the pre-trained word vectors loaded below.
EMBEDDING_DIM = 300

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Alternative vector file: data_path + '../glove.42B.300d.txt'
file_loc = '../../data/glove.840B.300d.txt'
print (file_loc)

# Parse the embedding file into {token: float32 vector}. Each line is the
# token followed by its vector components, separated by single spaces.
# NOTE(review): split(' ') rather than split() -- presumably so that tokens
# containing other whitespace characters stay intact; confirm before changing.
gembeddings_index = {}
with codecs.open(file_loc, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        gembedding = np.asarray(values[1:], dtype='float32')
        gembeddings_index[word] = gembedding
# The `with` block has already closed the file; no explicit close() is needed.
print('G Word embeddings:', len(gembeddings_index))

# One row per word id plus one extra row; rows stay all-zero for ids that have
# no pre-trained vector.
nb_words = len(word_index) +1
g_word_embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    gembedding_vector = gembeddings_index.get(word)
    if gembedding_vector is not None:
        g_word_embedding_matrix[i] = gembedding_vector

# Count rows that are entirely zero. Testing `sum(row) == 0` would also count a
# non-zero vector whose components happen to cancel out.
print('G Null word embeddings: %d' % np.sum(~g_word_embedding_matrix.any(axis=1)))

Found 2736 unique tokens
../../data/glove.840B.300d.txt
G Word embeddings: 2196018
G Null word embeddings: 100


In [10]:
# Emotion label of every entry in `data2`, in the same order as `data2`.
labels = [ses_mod['emotion'] for ses_mod in data2]

# One indicator column per class, in the order given by `emotions_used`.
# `classes` is passed by keyword: current scikit-learn releases reject it as a
# positional argument, and older releases accept the keyword form as well.
Y = label_binarize(labels, classes=emotions_used)

Y.shape

(4936, 4)

In [11]:
# Load the pre-computed speech feature array from the working directory and
# show its shape as the cell output.
speech_feature_file = 'voiced_feat_file_001_001.npy'
x_train_speech = np.load(speech_feature_file)
x_train_speech.shape

(4936, 100, 34)

In [55]:
# Model 1 (Keras functional API): a 1-D convolutional text branch and a dense
# speech branch, concatenated and classified into 4 emotion classes.

# --- text branch: GloVe-initialised embedding -> 4 x Conv1D -> Dense(256) ---
text_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
glove_embedding = Embedding(
    nb_words,
    EMBEDDING_DIM,
    weights=[g_word_embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
)
model_text = glove_embedding(text_input)

# Every convolution uses kernel size 3, 'same' padding and ReLU.
conv_opts = dict(padding='same', activation='relu')
conv1 = Convolution1D(256, 3, **conv_opts)(model_text)
conv2 = Convolution1D(128, 3, **conv_opts)(conv1)
conv3 = Convolution1D(64, 3, **conv_opts)(conv2)
conv4 = Convolution1D(128, 3, **conv_opts)(conv3)
flat = Flatten()(conv4)
# No activation argument: the text branch ends in a linear projection.
out_text = Dense(256)(flat)

# --- speech branch: flatten the (100, 34) input -> three ReLU dense layers ---
speech_input = Input(shape=(100, 34))
model_speech1 = Flatten()(speech_input)
model_speech2 = Dense(1024, activation='relu')(model_speech1)
model_speech3 = Dense(512, activation='relu')(model_speech2)
out_speech = Dense(256, activation='relu')(model_speech3)

# --- fusion: concatenate both 256-d outputs -> Dense(256) -> softmax over 4 ---
model_combined1 = concatenate([out_text, out_speech])
model_combined2 = Dense(256, activation='relu')(model_combined1)
model_combined3 = Dense(4, activation='softmax')(model_combined2)

# Assemble and compile the two-input model.
model_combined = Model(inputs=[text_input, speech_input], outputs=model_combined3)
model_combined.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [56]:
# Train model 1 on both modalities; validation_split=0.2 makes Keras hold out
# 20% of the samples for validation.
hist = model_combined.fit(
    x=[x_train_text, x_train_speech],
    y=Y,
    batch_size=16,
    epochs=25,
    verbose=1,
    validation_split=0.2,
)

Train on 3948 samples, validate on 988 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [57]:
# Per-epoch validation accuracy of model 1; print its mean over the epochs and
# its best value.
acc1 = hist.history['val_acc']
mean_val_acc, best_val_acc = np.mean(acc1), max(acc1)
print(mean_val_acc, best_val_acc)

0.6407287450840599 0.6882591090704265


In [68]:
# Model 2: fully-connected text branch on a 128-d embedding learned from
# scratch, plus a fully-connected speech branch.

# text model
text_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
# Vocabulary size and sequence length come from the shared constants rather
# than the literals 2737 / 500, so they cannot drift from the tokenizer and
# padding settings (nb_words is 2737 for the 2736-token vocabulary).
model_text1 = Embedding(nb_words, 128, input_length=MAX_SEQUENCE_LENGTH)(text_input)
model_text2 = Flatten()(model_text1)
model_text3 = Dense(1024, activation='relu')(model_text2)
model_text4 = Dropout(0.2)(model_text3)
model_text5 = Dense(512, activation='relu')(model_text4)
model_text6 = Dropout(0.2)(model_text5)
# No activation argument: linear 256-d output of the text branch.
model_text = Dense(256)(model_text6)

# speech model
speech_input = Input(shape=(100,34))
model_speech1 = Flatten()(speech_input)
model_speech2 = Dense(1024, activation='relu')(model_speech1)
model_speech3 = Dropout(0.2)(model_speech2)
model_speech4 = Dense(512, activation='relu')(model_speech3)
model_speech5 = Dropout(0.2)(model_speech4)
# No activation argument: linear 256-d output of the speech branch.
model_speech = Dense(256)(model_speech5)

# combined model: concatenate both branches -> Dense(256) -> softmax over 4
model_combined1 = concatenate([model_text, model_speech])
model_combined2 = Dense(256, activation='relu')(model_combined1)
model_combined3 = Dense(4, activation='softmax')(model_combined2)

model_combined =  Model([text_input, speech_input], model_combined3)
model_combined.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model_combined.summary()

print("Model2 Built")

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_47 (InputLayer)           (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_32 (Embedding)        (None, 500, 128)     350336      input_47[0][0]                   
__________________________________________________________________________________________________
input_48 (InputLayer)           (None, 100, 34)      0                                            
__________________________________________________________________________________________________
flatten_50 (Flatten)            (None, 64000)        0           embedding_32[0][0]               
__________________________________________________________________________________________________
flatten_51

In [69]:
# Train model 2 on both modalities; validation_split=0.2 makes Keras hold out
# 20% of the samples for validation.
hist = model_combined.fit(
    x=[x_train_text, x_train_speech],
    y=Y,
    batch_size=32,
    epochs=30,
    verbose=1,
    validation_split=0.2,
)

Train on 3948 samples, validate on 988 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [70]:
# Best validation accuracy reached by model 2 over all epochs. Unlike acc1 and
# acc3, acc2 is a single number, not the per-epoch list.
val_acc_per_epoch = hist.history['val_acc']
acc2 = max(val_acc_per_epoch)
print(acc2)

0.638663966887393


In [83]:
# Model 3: stacked-LSTM text branch on a 128-d embedding learned from scratch,
# plus a fully-connected speech branch.

text_input = Input(shape=(MAX_SEQUENCE_LENGTH, ))
# Vocabulary size and sequence length come from the shared constants rather
# than the literals 2737 / 500, so they cannot drift from the tokenizer and
# padding settings (nb_words is 2737 for the 2736-token vocabulary).
model_text1 = Embedding(nb_words, 128, input_length=MAX_SEQUENCE_LENGTH)(text_input)

# The first LSTM returns the full sequence so that the second one can consume
# it; the second returns only its final output.
model_text2 = LSTM(256, return_sequences=True)(model_text1)
model_text3 = LSTM(256, return_sequences=False)(model_text2)
# No activation argument: linear 256-d output of the text branch.
model_text = Dense(256)(model_text3)

speech_input = Input(shape=(100, 34))
model_speech1 = Flatten()(speech_input)
model_speech2 = Dense(1024, activation='relu')(model_speech1)
model_speech3 = Dropout(0.2)(model_speech2)
# No activation argument: linear 256-d output of the speech branch.
model_speech = Dense(256)(model_speech3)

# combination of text and speech: concatenate -> Dense(256) -> softmax over 4
model_combined1 = concatenate([model_text, model_speech])
model_combined2 = Dense(256, activation='relu')(model_combined1)
model_combined3 = Dense(4, activation='softmax')(model_combined2)

model_combined = Model([text_input, speech_input], model_combined3)
model_combined.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model_combined.summary()

print("Model3 Built")

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_59 (InputLayer)           (None, 500)          0                                            
__________________________________________________________________________________________________
input_60 (InputLayer)           (None, 100, 34)      0                                            
__________________________________________________________________________________________________
embedding_38 (Embedding)        (None, 500, 128)     350336      input_59[0][0]                   
__________________________________________________________________________________________________
flatten_57 (Flatten)            (None, 3400)         0           input_60[0][0]                   
__________________________________________________________________________________________________
lstm_11 (L

In [93]:
# Train model 3 on both modalities.
# NOTE(review): this run holds out 30% of the samples for validation, whereas
# the fits of the other two models hold out 20%, so its val_acc is not
# measured on the same validation set as theirs.
hist = model_combined.fit(
    x=[x_train_text, x_train_speech],
    y=Y,
    batch_size=64,
    epochs=25,
    verbose=1,
    validation_split=0.3,
)

Train on 3455 samples, validate on 1481 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [96]:
# Per-epoch validation accuracy of model 3; print its mean over the epochs and
# its best value.
acc3 = hist.history['val_acc']
mean_val_acc, best_val_acc = np.mean(acc3), max(acc3)
print(mean_val_acc, best_val_acc)

0.7459284267386902 0.7548953409858203
