In [None]:
import numpy as np
import os
import sys

import wave
import copy
import math

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation
from keras.layers import LSTM, Input, Flatten, Merge, Embedding, Convolution1D,Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers.convolutional import Conv2D
from keras.optimizers import SGD, Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from sklearn.preprocessing import label_binarize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence


from features import *
from helper import *

Using TensorFlow backend.


In [None]:
# Frame rate handed to calculate_features for every utterance signal.
framerate = 16000

# Names of the five dataset sessions.
sessions = ['Session1', 'Session2', 'Session3', 'Session4', 'Session5']

# Emotion labels used as the class list when the targets are binarized.
emotions_used = np.array(['ang', 'exc', 'neu', 'sad', 'frus', 'surp'])

# Paths are built relative to the parent of the current working directory.
working_dir = os.path.realpath(os.getcwd())
code_path = os.path.dirname(working_dir)
data_path = code_path + "/../data/sessions/"

In [None]:
import pickle

# Load the pre-collected dataset; the loops in later cells iterate over it and
# index each entry by string key.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only point this at a pickle you produced yourself or otherwise trust.
collected_path = data_path + '/../' + 'data_collected.pickle'
with open(collected_path, 'rb') as handle:
    data2 = pickle.load(handle)

In [None]:
# Every tokenized transcription is padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 500

# Transcription of every utterance, in dataset order.
text = [ses_mod['transcription'] for ses_mod in data2]

# Fit the vocabulary on the transcriptions, then map each one to a sequence
# of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
token_tr_X = tokenizer.texts_to_sequences(text)

# Fixed-length integer matrix fed to the text branch of the models.
x_train_text = sequence.pad_sequences(token_tr_X, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Per-utterance speech features: extract them from the raw signal, pad them
# to a fixed length with maxlen=100, and store the transposed matrix.
speech_samples = []

for counter, ses_mod in enumerate(data2, start=1):
    signal = ses_mod['signal']
    st_features = calculate_features(signal, framerate, None)
    st_features, _ = pad_sequence_into_array(st_features, maxlen=100)
    speech_samples.append(st_features.T)
    # Progress report every 100 utterances.
    if counter % 100 == 0:
        print(counter)

# Stack the per-utterance matrices into one array (the recorded run produced
# shape (4936, 100, 34)).
x_train_speech = np.array(speech_samples)
x_train_speech.shape

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


(4936, 100, 34)

In [None]:
# (dict key, expected feature width) of each motion-capture stream, in the
# order the streams are laid out side by side.
mocap_streams = (('mocap_head', 18), ('mocap_hand', 6), ('mocap_rot', 165))

mocap_samples = []
for ses_mod in data2:
    streams = []
    for key, width in mocap_streams:
        stream = ses_mod[key]
        # A stream with an unexpected shape is replaced by an all-zero block.
        if stream.shape != (200, width):
            stream = np.zeros((200, width))
        # NaNs are zeroed in place, which also modifies the array held in data2.
        stream[np.isnan(stream)] = 0
        streams.append(stream)
    # Head, hand and rotation features side by side: 18 + 6 + 165 = 189 columns.
    mocap_samples.append(np.concatenate(streams, axis=1))

# Add a trailing channel axis so each sample is a (200, 189, 1) array, the
# input shape the Conv2D mocap branch declares.
x_train_mocap = np.array(mocap_samples).reshape(-1, 200, 189, 1)
x_train_mocap.shape

(4936, 200, 189, 1)

In [None]:
# Emotion label of every utterance, in dataset order.
emotion_labels = [ses_mod['emotion'] for ses_mod in data2]

# One-hot encode the labels against emotions_used. `classes` is passed by
# keyword because recent scikit-learn releases make it keyword-only; the
# keyword form is accepted by older releases as well.
# NOTE(review): the recorded output of this cell shows 4 columns although
# emotions_used lists six classes — presumably left over from an earlier run;
# confirm after a Restart & Run All.
Y = label_binarize(emotion_labels, classes=emotions_used)

Y.shape

(4936, 4)

 # TEXT-SPEECH MODEL 1

In [None]:
# Text-speech model 1: a dense text branch and a dense speech branch whose
# 256-unit outputs are concatenated and classified by a softmax layer.

# The Keras Tokenizer numbers words from 1 (index 0 is the padding value), so
# the embedding table needs len(word_index) + 1 rows. A hard-coded size equal
# to the vocabulary size is one row short and leaves the highest word index
# out of range.
vocab_size = len(tokenizer.word_index) + 1
# Size the output layer from the one-hot targets so it always matches Y.
n_classes = Y.shape[1]

# Text branch: trainable 128-d embedding, flattened, then a small MLP.
model_text = Sequential()
model_text.add(Embedding(vocab_size, 128, input_length=MAX_SEQUENCE_LENGTH))
model_text.add(Flatten())
for units in (1024, 512):
    model_text.add(Dense(units))
    model_text.add(Activation('relu'))
    model_text.add(Dropout(0.2))
model_text.add(Dense(256))

# Speech branch: the (100, 34) feature matrix flattened into the same MLP shape.
model_speech = Sequential()
model_speech.add(Flatten(input_shape=(100, 34)))
for units in (1024, 512):
    model_speech.add(Dense(units))
    model_speech.add(Activation('relu'))
    model_speech.add(Dropout(0.2))
model_speech.add(Dense(256))

# Fusion head: concatenate both branch outputs and classify.
# NOTE(review): Merge is the legacy Keras merge layer and is not available in
# newer Keras releases — confirm the pinned Keras version.
model_combined = Sequential()
model_combined.add(Merge([model_text, model_speech], mode='concat'))
model_combined.add(Activation('relu'))
model_combined.add(Dense(256))
model_combined.add(Activation('relu'))
model_combined.add(Dense(n_classes))
model_combined.add(Activation('softmax'))

model_combined.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model_speech.summary()
model_text.summary()
model_combined.summary()

print("Model1 Built")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_21 (Flatten)         (None, 3400)              0         
_________________________________________________________________
dense_59 (Dense)             (None, 1024)              3482624   
_________________________________________________________________
activation_65 (Activation)   (None, 1024)              0         
_________________________________________________________________
dropout_49 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_60 (Dense)             (None, 512)               524800    
_________________________________________________________________
activation_66 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_50 (Dropout)         (None, 512)               0         
__________



In [None]:
# Train text-speech model 1. validation_split holds out the last 20% of the
# samples for validation. `epochs` is the Keras 2 name of the argument; the
# Keras 1 spelling `nb_epoch` is only accepted through a deprecation shim.
hist = model_combined.fit([x_train_text, x_train_speech], Y,
                          batch_size=64, epochs=30, verbose=1,
                          validation_split=0.2)



Train on 3948 samples, validate on 988 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# **TEXT - SPEECH MODEL 2**

In [None]:
# Text-speech model 2: a two-layer LSTM text branch and a dense speech branch
# whose 256-unit outputs are concatenated and classified by a softmax layer.

# The Keras Tokenizer numbers words from 1 (index 0 is the padding value), so
# the embedding table needs len(word_index) + 1 rows. A hard-coded size equal
# to the vocabulary size is one row short and leaves the highest word index
# out of range.
vocab_size = len(tokenizer.word_index) + 1
# Size the output layer from the one-hot targets so it always matches Y.
n_classes = Y.shape[1]

# Text branch: trainable 128-d embedding followed by two stacked LSTMs. The
# first LSTM takes its input shape from the embedding output; the stray
# input_shape=(100, 34) (the speech feature shape) it used to carry is removed.
model_text = Sequential()
model_text.add(Embedding(vocab_size, 128, input_length=MAX_SEQUENCE_LENGTH))
model_text.add(LSTM(256, return_sequences=True))
model_text.add(LSTM(256, return_sequences=False))
model_text.add(Dense(256))

# Speech branch: the (100, 34) feature matrix flattened into a one-hidden-layer MLP.
model_speech = Sequential()
model_speech.add(Flatten(input_shape=(100, 34)))
model_speech.add(Dense(1024))
model_speech.add(Activation('relu'))
model_speech.add(Dropout(0.2))
model_speech.add(Dense(256))

# Fusion head: concatenate both branch outputs and classify.
# NOTE(review): Merge is the legacy Keras merge layer and is not available in
# newer Keras releases — confirm the pinned Keras version.
model_combined = Sequential()
model_combined.add(Merge([model_text, model_speech], mode='concat'))
model_combined.add(Activation('relu'))
model_combined.add(Dense(256))
model_combined.add(Activation('relu'))
model_combined.add(Dense(n_classes))
model_combined.add(Activation('softmax'))

model_combined.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model_speech.summary()
model_text.summary()
model_combined.summary()

print("Model2 Built")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_23 (Flatten)         (None, 3400)              0         
_________________________________________________________________
dense_70 (Dense)             (None, 1024)              3482624   
_________________________________________________________________
activation_73 (Activation)   (None, 1024)              0         
_________________________________________________________________
dropout_52 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_71 (Dense)             (None, 256)               262400    
Total params: 3,745,024
Trainable params: 3,745,024
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   




In [None]:
# Train text-speech model 2. validation_split holds out the last 20% of the
# samples for validation. `epochs` is the Keras 2 name of the argument; the
# Keras 1 spelling `nb_epoch` is only accepted through a deprecation shim.
hist = model_combined.fit([x_train_text, x_train_speech], Y,
                          batch_size=64, epochs=10, verbose=1,
                          validation_split=0.2)



Train on 3948 samples, validate on 988 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# TEXT - SPEECH MODEL 3

In [None]:
# Text-speech model 3: like model 2, but the embedding is initialised from
# g_word_embedding_matrix (and kept trainable) instead of starting from scratch.

# NOTE(review): nb_words, EMBEDDING_DIM and g_word_embedding_matrix are not
# defined in any cell shown in this notebook — presumably they come from a
# removed cell or from the star imports; confirm before a Restart & Run All.

# Size the output layer from the one-hot targets. The former hard-coded
# Dense(4) disagreed with the six classes in emotions_used (and with the other
# models), which makes fit() fail on a target-shape mismatch.
n_classes = Y.shape[1]

# Text branch: embedding followed by two stacked LSTMs.
model_text = Sequential()
model_text.add(Embedding(nb_words,
                         EMBEDDING_DIM,
                         weights=[g_word_embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=True))
model_text.add(LSTM(256, return_sequences=True))
model_text.add(LSTM(256, return_sequences=False))
model_text.add(Dense(256))

# Speech branch: the (100, 34) feature matrix flattened into a one-hidden-layer MLP.
model_speech = Sequential()
model_speech.add(Flatten(input_shape=(100, 34)))
model_speech.add(Dense(1024))
model_speech.add(Activation('relu'))
model_speech.add(Dropout(0.2))
model_speech.add(Dense(256))

# Fusion head: concatenate both branch outputs and classify.
# NOTE(review): Merge is the legacy Keras merge layer and is not available in
# newer Keras releases — confirm the pinned Keras version.
model_combined = Sequential()
model_combined.add(Merge([model_text, model_speech], mode='concat'))
model_combined.add(Activation('relu'))
model_combined.add(Dense(256))
model_combined.add(Activation('relu'))
model_combined.add(Dense(n_classes))
model_combined.add(Activation('softmax'))

model_combined.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model_speech.summary()
model_text.summary()
model_combined.summary()

print("Model3 Built")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_24 (Flatten)         (None, 3400)              0         
_________________________________________________________________
dense_75 (Dense)             (None, 1024)              3482624   
_________________________________________________________________
activation_77 (Activation)   (None, 1024)              0         
_________________________________________________________________
dropout_53 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_76 (Dense)             (None, 256)               262400    
Total params: 3,745,024
Trainable params: 3,745,024
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   




In [None]:
# Train text-speech model 3. validation_split holds out the last 20% of the
# samples for validation. `epochs` is the Keras 2 name of the argument; the
# Keras 1 spelling `nb_epoch` is only accepted through a deprecation shim.
hist = model_combined.fit([x_train_text, x_train_speech], Y,
                          batch_size=64, epochs=10, verbose=1,
                          validation_split=0.2)



Train on 3948 samples, validate on 988 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# **TRI MODEL 1**

In [None]:
# Tri-modal model 1: LSTM text branch, dense speech branch and a strided-CNN
# motion-capture branch; the three 256-unit outputs are concatenated and
# classified by a softmax layer.

# NOTE(review): nb_words, EMBEDDING_DIM and g_word_embedding_matrix are not
# defined in any cell shown in this notebook — presumably they come from a
# removed cell or from the star imports; confirm before a Restart & Run All.

# Size the output layer from the one-hot targets so it always matches Y.
n_classes = Y.shape[1]

# Text branch: embedding initialised from g_word_embedding_matrix (trainable),
# followed by two stacked LSTMs.
model_text = Sequential()
model_text.add(Embedding(nb_words,
                         EMBEDDING_DIM,
                         weights=[g_word_embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=True))
model_text.add(LSTM(256, return_sequences=True))
model_text.add(LSTM(256, return_sequences=False))
model_text.add(Dense(256))

# Speech branch: the (100, 34) feature matrix flattened into a one-hidden-layer MLP.
model_speech = Sequential()
model_speech.add(Flatten(input_shape=(100, 34)))
model_speech.add(Dense(1024))
model_speech.add(Activation('relu'))
model_speech.add(Dropout(0.2))
model_speech.add(Dense(256))

# Mocap branch: five stride-2 3x3 convolutions, each followed by dropout and
# ReLU. `padding` is the Keras 2 name of the argument formerly spelled
# `border_mode`, matching the Keras 2 `strides` argument already in use.
model_mocap = Sequential()
model_mocap.add(Conv2D(32, 3, strides=(2, 2), padding='same', input_shape=(200, 189, 1)))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
for n_filters in (64, 64, 128, 128):
    model_mocap.add(Conv2D(n_filters, 3, strides=(2, 2), padding='same'))
    model_mocap.add(Dropout(0.2))
    model_mocap.add(Activation('relu'))
model_mocap.add(Flatten())
model_mocap.add(Dense(256))

# Fusion head: concatenate the three branch outputs and classify.
# NOTE(review): Merge is the legacy Keras merge layer and is not available in
# newer Keras releases — confirm the pinned Keras version.
model_combined = Sequential()
model_combined.add(Merge([model_text, model_speech, model_mocap], mode='concat'))
model_combined.add(Activation('relu'))
model_combined.add(Dense(256))
model_combined.add(Activation('relu'))
model_combined.add(Dense(n_classes))
model_combined.add(Activation('softmax'))

model_combined.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

model_speech.summary()
model_text.summary()
model_mocap.summary()
model_combined.summary()

print("Model1 Built")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_29 (Flatten)         (None, 3400)              0         
_________________________________________________________________
dense_93 (Dense)             (None, 1024)              3482624   
_________________________________________________________________
activation_100 (Activation)  (None, 1024)              0         
_________________________________________________________________
dropout_67 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_94 (Dense)             (None, 256)               262400    
Total params: 3,745,024
Trainable params: 3,745,024
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   




In [None]:
# Train tri-modal model 1 on text, speech and mocap inputs. validation_split
# holds out the last 20% of the samples for validation. `epochs` is the
# Keras 2 name of the argument; the Keras 1 spelling `nb_epoch` is only
# accepted through a deprecation shim.
hist = model_combined.fit([x_train_text, x_train_speech, x_train_mocap], Y,
                          batch_size=64, epochs=20, verbose=1,
                          validation_split=0.2)



Train on 3948 samples, validate on 988 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# TRI MODEL 2

In [None]:
# Tri-modal model 2: 1-D CNN text branch, dense speech branch and a
# strided-CNN motion-capture branch; the three 256-unit outputs are
# concatenated and classified by a softmax layer.

# NOTE(review): nb_words, EMBEDDING_DIM and g_word_embedding_matrix are not
# defined in any cell shown in this notebook — presumably they come from a
# removed cell or from the star imports; confirm before a Restart & Run All.

# Size the output layer from the one-hot targets so it always matches Y.
n_classes = Y.shape[1]

# Text branch: embedding initialised from g_word_embedding_matrix (trainable),
# then four width-3 convolutions with shrinking filter counts, each followed
# by dropout and ReLU. `padding` is the Keras 2 name of `border_mode`.
model_text = Sequential()
model_text.add(Embedding(nb_words,
                         EMBEDDING_DIM,
                         weights=[g_word_embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=True))
for n_filters in (256, 128, 64, 32):
    model_text.add(Convolution1D(n_filters, 3, padding='same'))
    model_text.add(Dropout(0.2))
    model_text.add(Activation('relu'))
model_text.add(Flatten())
model_text.add(Dropout(0.2))
model_text.add(Dense(256))

# Speech branch: the (100, 34) feature matrix flattened into a one-hidden-layer MLP.
model_speech = Sequential()
model_speech.add(Flatten(input_shape=(100, 34)))
model_speech.add(Dense(1024))
model_speech.add(Activation('relu'))
model_speech.add(Dropout(0.2))
model_speech.add(Dense(256))

# Mocap branch: five stride-2 3x3 convolutions, each followed by dropout and
# ReLU. `padding` matches the Keras 2 `strides` argument already in use.
model_mocap = Sequential()
model_mocap.add(Conv2D(32, 3, strides=(2, 2), padding='same', input_shape=(200, 189, 1)))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
for n_filters in (64, 64, 128, 128):
    model_mocap.add(Conv2D(n_filters, 3, strides=(2, 2), padding='same'))
    model_mocap.add(Dropout(0.2))
    model_mocap.add(Activation('relu'))
model_mocap.add(Flatten())
model_mocap.add(Dense(256))

# Fusion head: concatenate the three branch outputs and classify.
# NOTE(review): Merge is the legacy Keras merge layer and is not available in
# newer Keras releases — confirm the pinned Keras version.
model_combined = Sequential()
model_combined.add(Merge([model_text, model_speech, model_mocap], mode='concat'))
model_combined.add(Activation('relu'))
model_combined.add(Dense(256))
model_combined.add(Activation('relu'))
model_combined.add(Dense(n_classes))
model_combined.add(Activation('softmax'))

model_combined.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

model_speech.summary()
model_text.summary()
model_mocap.summary()
model_combined.summary()

print("Model2 Built")


  
  # This is added back by InteractiveShellApp.init_path()
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_35 (Flatten)         (None, 3400)              0         
_________________________________________________________________
dense_106 (Dense)            (None, 1024)              3482624   
_________________________________________________________________
activation_126 (Activation)  (None, 1024)              0         
_________________________________________________________________
dropout_90 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_107 (Dense)            (None, 256)               262400    
Total params: 3,745,024
Trainable params: 3,745,024
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   




In [None]:
# Train tri-modal model 2 on text, speech and mocap inputs. validation_split
# holds out the last 20% of the samples for validation. `epochs` is the
# Keras 2 name of the argument; the Keras 1 spelling `nb_epoch` is only
# accepted through a deprecation shim.
hist = model_combined.fit([x_train_text, x_train_speech, x_train_mocap], Y,
                          batch_size=64, epochs=60, verbose=1,
                          validation_split=0.2)



Train on 3948 samples, validate on 988 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


# TRI MODEL 3

In [None]:
# Tri-modal model 3: LSTM text branch, LSTM + attention speech branch and a
# strided-CNN motion-capture branch; the three 256-unit outputs are
# concatenated and classified by a softmax layer.

# NOTE(review): nb_words, EMBEDDING_DIM and g_word_embedding_matrix are not
# defined in any cell shown in this notebook — presumably they come from a
# removed cell or from the star imports; confirm before a Restart & Run All.

# Size the output layer from the one-hot targets so it always matches Y.
n_classes = Y.shape[1]

# Text branch: embedding initialised from g_word_embedding_matrix (trainable),
# followed by two stacked LSTMs.
model_text = Sequential()
model_text.add(Embedding(nb_words,
                         EMBEDDING_DIM,
                         weights=[g_word_embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=True))
model_text.add(LSTM(256, return_sequences=True))
model_text.add(LSTM(256, return_sequences=False))
model_text.add(Dense(256))

# Speech branch: sequence-returning LSTM over the (100, 34) features, then an
# attention decoder whose per-step outputs are flattened.
# NOTE(review): AttentionDecoder is not imported explicitly — presumably it is
# provided by one of the star imports; confirm.
model_speech = Sequential()
model_speech.add(LSTM(128, return_sequences=True, input_shape=(100, 34)))
model_speech.add(AttentionDecoder(128, 128))
model_speech.add(Flatten())
model_speech.add(Dense(256))

# Mocap branch: five stride-2 3x3 convolutions, each followed by dropout and
# ReLU. `padding` is the Keras 2 name of the argument formerly spelled
# `border_mode`, matching the Keras 2 `strides` argument already in use.
model_mocap = Sequential()
model_mocap.add(Conv2D(32, 3, strides=(2, 2), padding='same', input_shape=(200, 189, 1)))
model_mocap.add(Dropout(0.2))
model_mocap.add(Activation('relu'))
for n_filters in (64, 64, 128, 128):
    model_mocap.add(Conv2D(n_filters, 3, strides=(2, 2), padding='same'))
    model_mocap.add(Dropout(0.2))
    model_mocap.add(Activation('relu'))
model_mocap.add(Flatten())
model_mocap.add(Dense(256))

# Fusion head: concatenate the three branch outputs and classify.
# NOTE(review): Merge is the legacy Keras merge layer and is not available in
# newer Keras releases — confirm the pinned Keras version.
model_combined = Sequential()
model_combined.add(Merge([model_text, model_speech, model_mocap], mode='concat'))
model_combined.add(Activation('relu'))
model_combined.add(Dense(256))
model_combined.add(Activation('relu'))
model_combined.add(Dense(n_classes))
model_combined.add(Activation('softmax'))

model_combined.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

model_speech.summary()
model_text.summary()
model_mocap.summary()
model_combined.summary()

print("Model3 Built")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_23 (LSTM)               (None, 100, 128)          83456     
_________________________________________________________________
AttentionDecoder (AttentionD (None, 100, 128)          246528    
_________________________________________________________________
flatten_43 (Flatten)         (None, 12800)             0         
_________________________________________________________________
dense_120 (Dense)            (None, 256)               3277056   
Total params: 3,607,040
Trainable params: 3,607,040
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 500, 300)          821100    
_________________________________________________________________




In [None]:
# Train tri-modal model 3 on text, speech and mocap inputs. validation_split
# holds out the last 20% of the samples for validation. `epochs` is the
# Keras 2 name of the argument; the Keras 1 spelling `nb_epoch` is only
# accepted through a deprecation shim.
hist = model_combined.fit([x_train_text, x_train_speech, x_train_mocap], Y,
                          batch_size=64, epochs=30, verbose=1,
                          validation_split=0.2)



Train on 3948 samples, validate on 988 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
