In [1]:
import tensorflow as tf


def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """
    Attention mechanism layer which reduces RNN/Bi-RNN outputs with Attention vector.

    The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
     for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
    Variables notation is also inherited from the article
    
    Args:
        inputs: The Attention inputs.
            Matches outputs of RNN/Bi-RNN layer (not final state):
                In case of RNN, this must be RNN outputs `Tensor`:
                    If time_major == False (default), this must be a tensor of shape:
                        `[batch_size, max_time, cell.output_size]`.
                    If time_major == True, this must be a tensor of shape:
                        `[max_time, batch_size, cell.output_size]`.
                In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing the forward and
                the backward RNN outputs `Tensor`.
                    If time_major == False (default),
                        outputs_fw is a `Tensor` shaped:
                        `[batch_size, max_time, cell_fw.output_size]`
                        and outputs_bw is a `Tensor` shaped:
                        `[batch_size, max_time, cell_bw.output_size]`.
                    If time_major == True,
                        outputs_fw is a `Tensor` shaped:
                        `[max_time, batch_size, cell_fw.output_size]`
                        and outputs_bw is a `Tensor` shaped:
                        `[max_time, batch_size, cell_bw.output_size]`.
        attention_size: Linear size of the Attention weights.
        time_major: The shape format of the `inputs` Tensors.
            If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
            If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
            Using `time_major = True` is a bit more efficient because it avoids
            transposes at the beginning and end of the RNN calculation.  However,
            most TensorFlow data is batch-major, so by default this function
            accepts input and emits output in batch-major form.
        return_alphas: Whether to return attention coefficients variable along with layer's output.
            Used for visualization purpose.
    Returns:
        The Attention output `Tensor`.
        In case of RNN, this will be a `Tensor` shaped:
            `[batch_size, cell.output_size]`.
        In case of Bidirectional RNN, this will be a `Tensor` shaped:
            `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
    """

    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.array_ops.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    alphas = tf.nn.softmax(vu)              # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas


In [2]:
import tensorflow as tf
from tensorflow.python.training import moving_averages
# Importer and Exporting
# ========

# tf.app.flags.DEFINE_string  ('data_path',  'IEMOCAP1.pkl',   'total dataset includes training set, valid set and test set')
# tf.app.flags.DEFINE_string  ('checkpoint', 'checkpoint/',   'the checkpoint dir')
# tf.app.flags.DEFINE_string  ('model_name', 'model.ckpt',      'model name')
# tf.app.flags.DEFINE_string  ('pred_name',  'pred0.pkl',        'the test output dir')
# tf.app.flags.DEFINE_integer ('checkpoint_secs',  60,         'checkpoint saving interval in seconds')
# # Global Constants
# # ================

# tf.app.flags.DEFINE_float   ('dropout_conv',     1,        'dropout rate for covvolutional layers')
# tf.app.flags.DEFINE_float   ('dropout_linear',   1,        'dropout rate for linear layer')
# tf.app.flags.DEFINE_float   ('dropout_lstm',     1,        'dropout rate for lstm')
# tf.app.flags.DEFINE_float   ('dropout_fully1',   1,        'dropout rate for fully connected layer1')
# tf.app.flags.DEFINE_float   ('dropout_fully2',   1,        'dropout rate for fully connected layer1')

# tf.app.flags.DEFINE_float('decay_rate', 0.99, 'the lr decay rate')
# tf.app.flags.DEFINE_float('beta1', 0.9, 'parameter of adam optimizer beta1')
# tf.app.flags.DEFINE_float('beta2', 0.999, 'adam parameter beta2')


# tf.app.flags.DEFINE_integer('decay_steps', 570, 'the lr decay_step for optimizer')
# tf.app.flags.DEFINE_float('momentum', 0.99, 'the momentum')
# tf.app.flags.DEFINE_integer('num_epochs', 30000, 'maximum epochs')
# tf.app.flags.DEFINE_float   ('relu_clip',        20.0,        'ReLU clipping value for non-recurrant layers')

# tf.app.flags.DEFINE_float   ('adam_beta1',            0.9,         'beta 1 parameter of Adam optimizer')
# tf.app.flags.DEFINE_float   ('adam_beta2',            0.999,       'beta 2 parameter of Adam optimizer')
# tf.app.flags.DEFINE_float   ('epsilon',          1e-8,        'epsilon parameter of Adam optimizer')
# tf.app.flags.DEFINE_float   ('learning_rate',    0.0001,       'learning rate of Adam optimizer')


# tf.app.flags.DEFINE_integer ('train_batch_size', 40,           'number of elements in a training batch')
# tf.app.flags.DEFINE_integer ('valid_batch_size',   40,           'number of elements in a validation batch')
# tf.app.flags.DEFINE_integer ('test_batch_size',  40,           'number of elements in a test batch')

# tf.app.flags.DEFINE_integer('save_steps', 10, 'the step to save checkpoint')

# tf.app.flags.DEFINE_integer('image_height', 300, 'image height')
# tf.app.flags.DEFINE_integer('image_width', 40, 'image width')
# tf.app.flags.DEFINE_integer('image_channel', 3, 'image channels as input')
# tf.app.flags.DEFINE_integer('linear_num', 786, 'hidden number of linear layer')
# tf.app.flags.DEFINE_integer('seq_len', 150, 'sequence length of lstm')
# tf.app.flags.DEFINE_integer('cell_num', 128, 'cell units of the lstm')
# tf.app.flags.DEFINE_integer('hidden1', 64, 'number of hidden units of fully connected layer')
# tf.app.flags.DEFINE_integer('hidden2', 4, 'number of softmax layer')
# tf.app.flags.DEFINE_integer('attention_size', 1, 'attention_size')
# tf.app.flags.DEFINE_boolean('attention', False, 'whether to use attention, False mean use max-pooling')

data_path = '/IEMOCAP1.pkl'
checkpoint = '/checkpoint/'
model_name = '/model.ckpt'
pred_name = '/pred0.pkl'
checkpoint_secs = 60

dropout_conv = 1
dropout_linear = 1
dropout_lstm = 1
dropout_fully1 = 1
dropout_fully2 = 1

#decayed_learning rate
decay_rate = 0.99
beta1 = 0.9
beta2 = 0.999

#Moving Average
decay_steps = 570
momentum = 0.99
num_epochs = 30000
relu_clip =  20.0

# Adam optimizer (http://arxiv.org/abs/1412.6980) parameters

adam_beta1 = 0.9
adam_beta2 = 0.999
epsilon =  1e-8
learning_rate =   0.0001

# Batch sizes


train_batch_size = 40
valid_batch_size = 40
test_batch_size =  40

save_steps =   10

image_height =   300
image_width = 40
image_channel = 3

linear_num =  786
seq_len =   150
cell_num = 128
hidden1 = 64
hidden2 =  4
attention_size =   1
attention = False

In [3]:
class CRNN(object):
    
    def __init__(self, mode):
        self.mode = mode
        # log Mel-spectrogram
        self.attention = attention
        self.inputs = tf.placeholder(tf.float32, [None, image_height, image_width, image_channel])
        # emotion label
        self.labels = tf.placeholder(tf.int32, shape=[None, 4])
        # lstm time step
        #self.seq_len = tf.placeholder(tf.int32, [None])
        # l2
        self._extra_train_ops = []

    def _conv2d(self, x, name, filter_size, in_channels, out_channels, strides):
        with tf.variable_scope(name):
            kernel = tf.get_variable(name='DW',
                                     shape=[filter_size[0], filter_size[1], in_channels, out_channels],
                                     dtype=tf.float32,
                                     initializer=tf.contrib.layers.xavier_initializer())

            b = tf.get_variable(name='bais',
                                shape=[out_channels],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer())

            con2d_op = tf.nn.conv2d(x, kernel, [1, strides[0], strides[1], 1], padding='SAME')

        return tf.nn.bias_add(con2d_op, b) 
    
    def _max_pool(self, x, ksize, strides):
        return tf.nn.max_pool(x,
                              ksize=[1, ksize[0], ksize[1], 1],
                              strides=[1, strides[0], strides[1], 1],
                              padding='VALID',
                              name='max_pool')
    

    def _linear(self,x,names,shapes):
        with tf.variable_scope(names):
            weights = tf.get_variable(name='weights',
                                      shape=shapes,
                                      initializer=tf.truncated_normal_initializer(stddev=0.1))
            bias = tf.get_variable(name='bias',
                                   shape=shapes[1],
                                   initializer=tf.constant_initializer(0.0))
        return tf.matmul(x,weights) + bias
    

    def _leaky_relu(self, x, leakiness=0.0):
        return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
    
    def _batch_norm(self, name, x):
        """Batch normalization."""
        with tf.variable_scope(name):
            params_shape = [x.get_shape()[-1]]

            beta = tf.get_variable(
                'beta', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32))
            gamma = tf.get_variable(
                'gamma', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32))

            if self.mode == 'train':
                mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

                moving_mean = tf.get_variable(
                    'moving_mean', params_shape, tf.float32,
                    initializer=tf.constant_initializer(0.0, tf.float32),
                    trainable=False)
                moving_variance = tf.get_variable(
                    'moving_variance', params_shape, tf.float32,
                    initializer=tf.constant_initializer(1.0, tf.float32),
                    trainable=False)

                self._extra_train_ops.append(moving_averages.assign_moving_average(
                    moving_mean, mean, 0.9))
                self._extra_train_ops.append(moving_averages.assign_moving_average(
                    moving_variance, variance, 0.9))
            else:
                mean = tf.get_variable(
                    'moving_mean', params_shape, tf.float32,
                    initializer=tf.constant_initializer(0.0, tf.float32),
                    trainable=False)
                variance = tf.get_variable(
                    'moving_variance', params_shape, tf.float32,
                    initializer=tf.constant_initializer(1.0, tf.float32),
                    trainable=False)

#                tf.summary.histogram(mean.op.name, mean)
#                tf.summary.histogram(variance.op.name, variance)
            # elipson used to be 1e-5. Maybe 0.001 solves NaN problem in deeper net.
            x_bn = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
            x_bn.set_shape(x.get_shape())

            return x_bn
        
        
    def _batch_norm_wrapper(self, name, inputs, decay = 0.999):
        #batch normalization for fully connected layer
        with tf.variable_scope(name):
            scale = tf.Variable(tf.ones([inputs.get_shape()[-1]]))
            beta = tf.Variable(tf.zeros([inputs.get_shape()[-1]]))
            pop_mean = tf.Variable(tf.zeros([inputs.get_shape()[-1]]), trainable=False)
            pop_var = tf.Variable(tf.ones([inputs.get_shape()[-1]]), trainable=False)

            if self.mode == 'train':
                batch_mean, batch_var = tf.nn.moments(inputs,[0])
                train_mean = tf.assign(pop_mean,
                                       pop_mean * decay + batch_mean * (1 - decay))
                train_var = tf.assign(pop_var,
                                      pop_var * decay + batch_var * (1 - decay))
                with tf.control_dependencies([train_mean, train_var]):
                    return tf.nn.batch_normalization(inputs,
                                                     batch_mean, batch_var, beta, scale, epsilon)
            else:
                return tf.nn.batch_normalization(inputs,
                                                 pop_mean, pop_var, beta, scale, epsilon)
            
            
    def _attention(self,inputs, attention_size, time_major=False, return_alphas=False):
        
        if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
            inputs = tf.concat(inputs, 2)

        if time_major:
        # (T,B,D) => (B,T,D)
            inputs = tf.array_ops.transpose(inputs, [1, 0, 2])

        hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

        # Trainable parameters
        W_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
        b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
        u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
        v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
        # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
        vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
        alphas = tf.nn.softmax(vu)              # (B,T) shape also
        
        # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

        if not return_alphas:
            return output
        else:
            return output, alphas
        
        
    def _build_model(self):
        filters = [128, 512]
        filter_size = [5, 3]
        filter_strides = [1, 1]
        pool1_size = [2, 4]
        pool2_size = [1, 2]
        p = 5
        with tf.variable_scope('cnn'):
            with tf.variable_scope('unit-1'):
                x = self._conv2d(self.inputs, 'cnn-1', filter_size, image_channel, filters[0], filter_strides)
                x = self._batch_norm('bn1', x)
                x = self._leaky_relu(x, 0.01)
                x = self._max_pool(x, pool1_size, pool1_size)
#                print x.get_shape()
            with tf.variable_scope('unit-2'):
                x = self._conv2d(x, 'cnn-2',  filter_size, filters[0], filters[1], filter_strides)
                x = self._batch_norm('bn2', x)
                x = self._leaky_relu(x, 0.01)
                x = self._max_pool(x, pool2_size, pool2_size)
#                print x.get_shape()
        with tf.variable_scope('linear'):
            # linear layer for dim reduction
            x = tf.reshape(x,[-1,p*filters[1]])
            x = self._linear(x,'linear1',[p*filters[1],linear_num])
#            print x.get_shape()
        with tf.variable_scope('lstm'):
            x = tf.reshape(x,[-1,seq_len,linear_num])
            
            cell_fw = tf.contrib.rnn.BasicLSTMCell(cell_num, forget_bias=1.0)
            if self.mode == 'train':
                cell_fw = tf.contrib.rnn.DropoutWrapper(cell=cell_fw, output_keep_prob=dropout_lstm)

            cell_bw = tf.contrib.rnn.BasicLSTMCell(cell_num, forget_bias=1.0)
            if self.mode == 'train':
                cell_bw = tf.contrib.rnn.DropoutWrapper(cell=cell_bw, output_keep_prob=dropout_lstm)
            
            # Now we feed `linear` into the LSTM BRNN cell and obtain the LSTM BRNN output.
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw,
                                                                       cell_bw=cell_bw,
                                                                       inputs= x,
                                                                       dtype=tf.float32,
                                                                       time_major=False,
                                                                       scope='LSTM1')
        with tf.variable_scope('time_pooling'):
            if self.attention is not None:
                outputs, alphas = self._attention(outputs, attention_size, return_alphas=True)
            else:
                outputs = tf.concat(outputs,2)
                outputs = tf.reshape(outputs, [-1, seq_len,2*cell_num, 1])
                outputs = self._max_pool(outputs,[seq_len,1],[seq_len,1])
                outputs = tf.reshape(outputs, [-1,2*cell_num])
#            print outputs.get_shape()
        
        with tf.variable_scope('dense'):
            y = self._linear(outputs,'dense-matmul',[2*cell_num,hidden1])
            y = self._batch_norm_wrapper('dense-bn', y)
            y = self._leaky_relu(y, 0.01)
        
        self.logits = self._linear(y,'softmax',[hidden1,hidden2])

In [4]:
epsilon = 1e-3

def leaky_relu(x, leakiness=0.0):
    return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

def batch_norm_wrapper(inputs, is_training, decay = 0.999):

    scale = tf.Variable(tf.ones([inputs.get_shape()[-1]]))
    beta = tf.Variable(tf.zeros([inputs.get_shape()[-1]]))
    pop_mean = tf.Variable(tf.zeros([inputs.get_shape()[-1]]), trainable=False)
    pop_var = tf.Variable(tf.ones([inputs.get_shape()[-1]]), trainable=False)

    if is_training is not None:
        batch_mean, batch_var = tf.nn.moments(inputs,[0])
        train_mean = tf.assign(pop_mean,
                               pop_mean * decay + batch_mean * (1 - decay))
        train_var = tf.assign(pop_var,
                              pop_var * decay + batch_var * (1 - decay))
        with tf.control_dependencies([train_mean, train_var]):
            return tf.nn.batch_normalization(inputs,
                batch_mean, batch_var, beta, scale, epsilon)
    else:
        return tf.nn.batch_normalization(inputs,
            pop_mean, pop_var, beta, scale, epsilon)

def acrnn(inputs, num_classes=4,
                  is_training=True,
                  L1=128,
                  L2=256,
                  cell_units=128,
                  num_linear=768,
                  p=10,
                  time_step=150,
                  F1=64,
                  dropout_keep_prob=1):
    
    global ndims
    layer1_filter = tf.compat.v1.get_variable('layer1_filter', shape=[5, 3, 3, L1], dtype=tf.float32, 
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    layer1_bias = tf.compat.v1.get_variable('layer1_bias', shape=[L1], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    layer1_stride = [1, 1, 1, 1]
    layer2_filter = tf.compat.v1.get_variable('layer2_filter', shape=[5, 3, L1, L2], dtype=tf.float32, 
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    layer2_bias = tf.compat.v1.get_variable('layer2_bias', shape=[L2], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    layer2_stride = [1, 1, 1, 1]
    layer3_filter = tf.compat.v1.get_variable('layer3_filter', shape=[5, 3, L2, L2], dtype=tf.float32, 
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    layer3_bias = tf.compat.v1.get_variable('layer3_bias', shape=[L2], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    layer3_stride = [1, 1, 1, 1]
    layer4_filter = tf.compat.v1.get_variable('layer4_filter', shape=[5, 3, L2, L2], dtype=tf.float32, 
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    layer4_bias = tf.compat.v1.get_variable('layer4_bias', shape=[L2], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    layer4_stride = [1, 1, 1, 1]
    layer5_filter = tf.compat.v1.get_variable('layer5_filter', shape=[5, 3, L2, L2], dtype=tf.float32, 
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    layer5_bias = tf.compat.v1.get_variable('layer5_bias', shape=[L2], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    layer5_stride = [1, 1, 1, 1]
    layer6_filter = tf.compat.v1.get_variable('layer6_filter', shape=[5, 3, L2, L2], dtype=tf.float32, 
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    layer6_bias = tf.compat.v1.get_variable('layer6_bias', shape=[L2], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    layer6_stride = [1, 1, 1, 1]
    
    linear1_weight = tf.compat.v1.get_variable('linear1_weight', shape=[p*L2,num_linear], dtype=tf.float32,
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    linear1_bias = tf.compat.v1.get_variable('linear1_bias', shape=[num_linear], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
 
    fully1_weight = tf.compat.v1.get_variable('fully1_weight', shape=[2*cell_units,F1], dtype=tf.float32,
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    fully1_bias = tf.compat.v1.get_variable('fully1_bias', shape=[F1], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    fully2_weight = tf.compat.v1.get_variable('fully2_weight', shape=[F1,num_classes], dtype=tf.float32,
                                    initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1))
    fully2_bias = tf.compat.v1.get_variable('fully2_bias', shape=[num_classes], dtype=tf.float32,
                                  initializer=tf.compat.v1.constant_initializer(0.1))
    
    layer1 = tf.nn.conv2d(inputs, layer1_filter, layer1_stride, padding='SAME')
    layer1 = tf.nn.bias_add(layer1,layer1_bias)
    layer1 = leaky_relu(layer1, 0.01)
    layer1 = tf.nn.max_pool(layer1,ksize=[1, 2, 4, 1], strides=[1, 2, 4, 1], padding='VALID', name='max_pool')
    layer1 = tf.keras.layers.Dropout(layer1)
    
    layer2 = tf.nn.conv2d(layer1, layer2_filter, layer2_stride, padding='SAME')
    layer2 = tf.nn.bias_add(layer2,layer2_bias)
    layer2 = leaky_relu(layer2, 0.01)
    layer2 = tf.keras.layers.Dropout(layer2)
    
    layer3 = tf.nn.conv2d(layer2, layer3_filter, layer3_stride, padding='SAME')
    layer3 = tf.nn.bias_add(layer3,layer3_bias)
    layer3 = leaky_relu(layer3, 0.01)
    layer3 = tf.keras.layers.Dropout(layer3)
    
    layer4 = tf.nn.conv2d(layer3, layer4_filter, layer4_stride, padding='SAME')
    layer4 = tf.nn.bias_add(layer4,layer4_bias)
    layer4 = leaky_relu(layer4, 0.01)
    layer4 = tf.keras.layers.Dropout(layer4)
    
    layer5 = tf.nn.conv2d(layer4, layer5_filter, layer5_stride, padding='SAME')
    layer5 = tf.nn.bias_add(layer5,layer5_bias)
    layer5 = leaky_relu(layer5, 0.01)    
    layer5 = tf.keras.layers.Dropout(layer5)

    layer6 = tf.nn.conv2d(layer5, layer6_filter, layer6_stride, padding='SAME')
    layer6 = tf.nn.bias_add(layer6,layer6_bias)
    layer6 = leaky_relu(layer6, 0.01)    
    layer6 = tf.keras.layers.Dropout(layer6)
    
    layer6 = tf.reshape(layer6,[-1,time_step,L2*p])
    layer6 = tf.reshape(layer6, [-1,p*L2])
    
    linear1 = tf.matmul(layer6,linear1_weight) + linear1_bias
    linear1 = batch_norm_wrapper(linear1,is_training)
    linear1 = leaky_relu(linear1, 0.01)
    #linear1 = batch_norm_wrapper(linear1,is_training)
    linear1 = tf.reshape(linear1, [-1, time_step, num_linear])
    
    
    
    # Define lstm cells with tensorflow
    # Forward direction cell
    gru_fw_cell1 = tf.contrib.rnn.BasicLSTMCell(cell_units, forget_bias=1.0)
    # Backward direction cell
    gru_bw_cell1 = tf.contrib.rnn.BasicLSTMCell(cell_units, forget_bias=1.0)
    
    # Now we feed `layer_3` into the LSTM BRNN cell and obtain the LSTM BRNN output.
    outputs1, output_states1 = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell1,
                                                             cell_bw=gru_bw_cell1,
                                                             inputs= linear1,
                                                             dtype=tf.float32,
                                                             time_major=False,
                                                             scope='LSTM1')

    # Attention layer
    gru, alphas = attention(outputs1, 1, return_alphas=True)
    
    
    fully1 = tf.matmul(gru,fully1_weight) + fully1_bias
    fully1 = leaky_relu(fully1, 0.01)
    fully1 = tf.nn.dropout(fully1, dropout_keep_prob)
    
    
    Ylogits = tf.matmul(fully1, fully2_weight) + fully2_bias
    #Ylogits = tf.nn.softmax(Ylogits)
    return Ylogits


In [5]:
import numpy as np
import pickle
from sklearn.metrics import recall_score as recall
from sklearn.metrics import confusion_matrix as confusion
import os

In [6]:
# tf.app.flags.DEFINE_integer('num_epoch', 5000, 'The number of epoches for training.')
# tf.app.flags.DEFINE_integer('num_classes', 4, 'The number of emotion classes.')
# tf.app.flags.DEFINE_integer('batch_size', 60, 'The number of samples in each batch.')
# tf.app.flags.DEFINE_boolean('is_adam', True,'whether to use adam optimizer.')
# tf.app.flags.DEFINE_float('learning_rate', 0.00001, 'learning rate of Adam optimizer')
# tf.app.flags.DEFINE_float   ('dropout_keep_prob',     1,        'the prob of every unit keep in dropout layer')
# tf.app.flags.DEFINE_integer('image_height', 300, 'image height')
# tf.app.flags.DEFINE_integer('image_width', 40, 'image width')
# tf.app.flags.DEFINE_integer('image_channel', 3, 'image channels as input')

# tf.app.flags.DEFINE_string  ('traindata_path', './IEMOCAP.pkl', 'total dataset includes training set')
# tf.app.flags.DEFINE_string  ('validdata_path', 'inputs/valid.pkl', 'total dataset includes valid set')
# tf.app.flags.DEFINE_string  ('checkpoint', './checkpoint/', 'the checkpoint dir')
# tf.app.flags.DEFINE_string  ('model_name', 'model4.ckpt', 'model name')

def load_data(in_dir):
    f = open(in_dir,'rb')
    train_data,train_label,test_data,test_label,valid_data,valid_label,Valid_label,Test_label,pernums_test,pernums_valid = pickle.load(f)
    return train_data,train_label,test_data,test_label,valid_data,valid_label,Valid_label,Test_label,pernums_test,pernums_valid

def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot

In [7]:
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior() 
tf.disable_eager_execution()

num_classes = 4
is_adam = True
dropout_keep_prob = 1
data_path = 'IEMOCAP.pkl'
checkpoint = 'checkpoint/'

train_data,train_label,test_data,test_label,valid_data,valid_label,Valid_label,Test_label,pernums_test,pernums_valid = load_data(data_path)



train_label = dense_to_one_hot(train_label,num_classes)
valid_label = dense_to_one_hot(valid_label,num_classes)
Valid_label = dense_to_one_hot(Valid_label,num_classes)

valid_size = valid_data.shape[0]
dataset_size = train_data.shape[0]
vnum = pernums_valid.shape[0]
best_valid_uw = 0



X = tf.compat.v1.placeholder(tf.float32, shape=[None, image_height,image_width,image_channel])
Y = tf.compat.v1.placeholder(tf.int32, shape=[None, num_classes])

is_training = tf.compat.v1.placeholder(tf.bool)
lr = tf.compat.v1.placeholder(tf.float32)
keep_prob = tf.compat.v1.placeholder(tf.float32)

Ylogits = acrnn(X, is_training=is_training, dropout_keep_prob=keep_prob)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels =  Y, logits =  Ylogits)
cost = tf.reduce_mean(cross_entropy)
var_trainable_op = tf.trainable_variables()
if is_adam:
    # not apply gradient clipping
    train_op = tf.train.AdamOptimizer(lr).minimize(cost)            
else:
    # apply gradient clipping
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, var_trainable_op), 5)
    opti = tf.train.AdamOptimizer(lr)
    train_op = opti.apply_gradients(zip(grads, var_trainable_op))
    
correct_pred = tf.equal(tf.argmax(Ylogits, 1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
saver=tf.train.Saver(tf.global_variables())
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_epoch):
        #learning_rate = FLAGS.learning_rate            
        start = (i * batch_size) % dataset_size
        end = min(start+batch_size, dataset_size)
        [_,tcost,tracc] = sess.run([train_op,cost,accuracy], feed_dict={X:train_data[start:end,:,:,:], Y:train_label[start:end,:],
                                        is_training:True, keep_prob:dropout_keep_prob, lr:learning_rate})
        if i % 5 == 0:
            #for valid data
            valid_iter = divmod((valid_size),batch_size)[0]
            y_pred_valid = np.empty((valid_size,num_classes),dtype=np.float32)
            y_valid = np.empty((vnum,4),dtype=np.float32)
            index = 0
            cost_valid = 0
            if(valid_size < batch_size):
                loss, y_pred_valid = sess.run([cross_entropy,Ylogits],feed_dict = {X:valid_data, Y:Valid_label,is_training:False, keep_prob:1})
                cost_valid = cost_valid + np.sum(loss)
            for v in range(valid_iter):
                v_begin = v*batch_size
                v_end = (v+1)*batch_size
                if(v == valid_iter-1):
                    if(v_end < valid_size):
                        v_end = valid_size
                loss, y_pred_valid[v_begin:v_end,:] = sess.run([cross_entropy,Ylogits],feed_dict = {X:valid_data[v_begin:v_end],Y:Valid_label[v_begin:v_end],is_training:False, keep_prob:1})
                cost_valid = cost_valid + np.sum(loss)
            cost_valid = cost_valid/valid_size
            
            for s in range(vnum):
                y_valid[s,:] = np.max(y_pred_valid[index:index+pernums_valid[s],:],0)
                index = index + pernums_valid[s]

            valid_acc_uw = recall(np.argmax(valid_label,1),np.argmax(y_valid,1),average='macro')
            valid_conf = confusion(np.argmax(valid_label, 1),np.argmax(y_valid,1))
            
            if valid_acc_uw > best_valid_uw:
                best_valid_uw = valid_acc_uw
                best_valid_conf = valid_conf
                saver.save(sess, os.path.join(checkpoint, model_name), global_step = i+1)
            
            print ("*****************************************************************")
            print ("Epoch: %05d" %(i+1))
            print ("Training cost: %2.3g" %tcost)   
            print ("Training accuracy: %3.4g" %tracc) 
            print ("Valid cost: %2.3g" %cost_valid)
            print ("Valid_UA: %3.4g" %valid_acc_uw)    
            print ("Best valid_UA: %3.4g" %best_valid_uw) 
            print ('Valid Confusion Matrix:["ang","sad","hap","neu"]')
            print (valid_conf)
            print ('Best Valid Confusion Matrix:["ang","sad","hap","neu"]')
            print (best_valid_conf)
            print ("*****************************************************************" )
                 

UnboundLocalError: local variable 'ndims' referenced before assignment

In [None]:
epsilon = 1e-3

def Batch_Normalization(x, training, scope):
    with arg_scope([batch_norm],
                   scope=scope,
                   updates_collections=None,
                   decay=0.9,
                   center=True,
                   scale=True,
                   zero_debias_moving_mean=True) :
        return tf.cond(training,
                       lambda : batch_norm(inputs=x, is_training=training, reuse=None),
                       lambda : batch_norm(inputs=x, is_training=training, reuse=True))

def leaky_relu(x, leakiness=0.0):
    return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
def load_data():
    f = open('./CASIA_40_delta.pkl','rb')
    train_data,train_label,test_data,test_label,valid_data,valid_label,Valid_label,Test_label,pernums_test,pernums_valid = cPickle.load(f)
    #train_data,train_label,test_data,test_label,valid_data,valid_label = cPickle.load(f)
    return train_data,train_label,test_data,test_label,valid_data,valid_label
def batch_norm_wrapper(inputs, is_training, decay = 0.999):

    scale = tf.Variable(tf.ones([inputs.get_shape()[-1]]))
    beta = tf.Variable(tf.zeros([inputs.get_shape()[-1]]))
    pop_mean = tf.Variable(tf.zeros([inputs.get_shape()[-1]]), trainable=False)
    pop_var = tf.Variable(tf.ones([inputs.get_shape()[-1]]), trainable=False)

    if is_training is not None:
        batch_mean, batch_var = tf.nn.moments(inputs,[0])
        train_mean = tf.assign(pop_mean,
                               pop_mean * decay + batch_mean * (1 - decay))
        train_var = tf.assign(pop_var,
                              pop_var * decay + batch_var * (1 - decay))
        with tf.control_dependencies([train_mean, train_var]):
            return tf.nn.batch_normalization(inputs,
                batch_mean, batch_var, beta, scale, epsilon)
    else:
        return tf.nn.batch_normalization(inputs,
            pop_mean, pop_var, beta, scale, epsilon)

def batchnorm(Ylogits, is_test, iteration, offset, convolutional=False):
    exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, iteration) # adding the iteration prevents from averaging across non-existing iterations
    bnepsilon = 1e-5
    if convolutional:
        mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
    else:
        mean, variance = tf.nn.moments(Ylogits, [0])
    update_moving_averages = exp_moving_avg.apply([mean, variance])
    m = tf.cond(is_test, lambda: exp_moving_avg.average(mean), lambda: mean)
    v = tf.cond(is_test, lambda: exp_moving_avg.average(variance), lambda: variance)
    Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
    return Ybn, update_moving_averages
def dense_to_one_hot(labels_dense, num_classes):
  """Convert class labels from scalars to one-hot vectors."""
  num_labels = labels_dense.shape[0]
  index_offset = np.arange(num_labels) * num_classes
  labels_one_hot = np.zeros((num_labels, num_classes))
  labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
  return labels_one_hot
def build_model(inputX, is_training,keep_prob):
    # 3 2-D convolution layers
    L1 = 256
    L2 = 512
    L3 = 512
    Li1 = 768
    F1 = 64
    F2 = 6
    p = 5
    cell_units1 = 128
    timesteps = 200
    ATTENTION_SIZE = 1
    layer1_filter = tf.get_variable('layer1_filter', shape=[5, 3, 3, L1], dtype=tf.float32, 
                                    initializer=tf.truncated_normal_initializer(stddev=0.1))
    layer1_bias = tf.get_variable('layer1_bias', shape=[L1], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0.1))
    layer1_stride = [1, 1, 1, 1]
    layer2_filter = tf.get_variable('layer2_filter', shape=[5, 3, L1, L2], dtype=tf.float32, 
                                    initializer=tf.truncated_normal_initializer(stddev=0.1))
    layer2_bias = tf.get_variable('layer2_bias', shape=[L2], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0.1))
    layer2_stride = [1, 1, 1, 1]
    layer3_filter = tf.get_variable('layer3_filter', shape=[5, 3, L2, L3], dtype=tf.float32, 
                                    initializer=tf.truncated_normal_initializer(stddev=0.1))
    layer3_bias = tf.get_variable('layer3_bias', shape=[L3], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0.1))
    layer3_stride = [1, 1, 1, 1]
    
    linear1_weight = tf.get_variable('linear1_weight', shape=[p*L2,Li1], dtype=tf.float32,
                                    initializer=tf.truncated_normal_initializer(stddev=0.1))
    linear1_bias = tf.get_variable('linear1_bias', shape=[Li1], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0.1))
 
    fully1_weight = tf.get_variable('fully1_weight', shape=[2*cell_units1,F1], dtype=tf.float32,
                                    initializer=tf.truncated_normal_initializer(stddev=0.1))
    fully1_bias = tf.get_variable('fully1_bias', shape=[F1], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0.1))
    fully2_weight = tf.get_variable('fully2_weight', shape=[F1,F2], dtype=tf.float32,
                                    initializer=tf.truncated_normal_initializer(stddev=0.1))
    fully2_bias = tf.get_variable('fully2_bias', shape=[F2], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0.1))
    layer1 = tf.nn.conv2d(inputX, layer1_filter, layer1_stride, padding='SAME')
    layer1 = tf.nn.bias_add(layer1,layer1_bias)
    #layer1 = tf.layers.batch_normalization(layer1, training=is_training)
    #layer1 = Batch_Normalization(layer1, training=is_training, scope='layer1_batch')
    layer1 = leaky_relu(layer1, 0.01)
    #layer1 = Batch_Normalization(layer1, training=is_training, scope='layer1_batch')
    #print layer1.get_shape()
    layer1 = tf.nn.max_pool(layer1,ksize=[1, 1, 4, 1], strides=[1, 1, 4, 1], padding='VALID', name='max_pool')
    #print layer1.get_shape()
    layer1 = tf.contrib.layers.dropout(layer1, keep_prob=keep_prob, is_training=is_training)
    #layer1 = tf.reshape(layer1,[-1,timesteps,L1*p])
    
    layer2 = tf.nn.conv2d(layer1, layer2_filter, layer2_stride, padding='SAME')
    layer2 = tf.nn.bias_add(layer2,layer2_bias)
    #layer1 = tf.layers.batch_normalization(layer1, training=is_training)
    
    layer2 = leaky_relu(layer2, 0.01)
    #print layer2.get_shape()
    #layer2 = Batch_Normalization(layer2, training=is_training, scope='layer1_batch')
    layer2 = tf.nn.max_pool(layer2,ksize=[1, 1, 2, 1], strides=[1, 1, 2, 1], padding='VALID', name='max_pool')
    #print layer2.get_shape()
    layer2 = tf.contrib.layers.dropout(layer2, keep_prob=keep_prob, is_training=is_training)
    layer2 = tf.reshape(layer2,[-1,timesteps,L2*p])
    
    
    layer2 = tf.reshape(layer2, [-1,p*L2])
    
    #layer1 = tf.reshape(layer1,[-1,p*L1])
    linear1 = tf.matmul(layer2,linear1_weight) + linear1_bias
    linear1 = batch_norm_wrapper(linear1,is_training)
    linear1 = leaky_relu(linear1, 0.01)
    #linear1 = batch_norm_wrapper(linear1,is_training)
    linear1 = tf.reshape(linear1, [-1, timesteps, Li1])
    
    
    '''
    #adding gru cell
    gru_bw_cell1 = tf.nn.rnn_cell.GRUCell(cell_units)
    #if is_training is not None:
    #    gru_bw_cell1 = tf.contrib.rnn.DropoutWrapper(cell=gru_bw_cell1, output_keep_prob=keep_prob)
    # Forward direction cell: (if else required for TF 1.0 and 1.1 compat)
    gru_fw_cell1 = tf.nn.rnn_cell.GRUCell(cell_units)
    #if is_training is not None:
    #    gru_fw_cell1 = tf.contrib.rnn.DropoutWrapper(cell=gru_fw_cell1, output_keep_prob=keep_prob)
    
    '''
    # Define lstm cells with tensorflow
    # Forward direction cell
    gru_fw_cell1 = tf.contrib.rnn.BasicLSTMCell(cell_units1, forget_bias=1.0)
    # Backward direction cell
    gru_bw_cell1 = tf.contrib.rnn.BasicLSTMCell(cell_units1, forget_bias=1.0)
    
    '''
    # Define lstm cells with tensorflow
    # Forward direction cell
    gru_fw_cell1 = tf.contrib.rnn.BasicLSTMCell(cell_units, forget_bias=1.0)
    if is_training is not None:
        gru_fw_cell1 = tf.contrib.rnn.DropoutWrapper(cell=gru_fw_cell1, output_keep_prob=keep_prob)
    # Backward direction cell
    gru_bw_cell1 = tf.contrib.rnn.BasicLSTMCell(cell_units, forget_bias=1.0)
    if is_training is not None:
        gru_bw_cell1 = tf.contrib.rnn.DropoutWrapper(cell=gru_bw_cell1, output_keep_prob=keep_prob)
    '''
    # Now we feed `layer_3` into the LSTM BRNN cell and obtain the LSTM BRNN output.
    outputs1, output_states1 = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell1,
                                                             cell_bw=gru_bw_cell1,
                                                             inputs= linear1,
                                                             dtype=tf.float32,
                                                             time_major=False,
                                                             scope='LSTM1')
    '''
    outputs1 = tf.concat(outputs1,2)
     # Forward direction cell
    gru_fw_cell2 = tf.contrib.rnn.BasicLSTMCell(cell_units2, forget_bias=1.0)
    # Backward direction cell
    gru_bw_cell2 = tf.contrib.rnn.BasicLSTMCell(cell_units2, forget_bias=1.0)
    # Now we feed `layer_3` into the LSTM BRNN cell and obtain the LSTM BRNN output.
    outputs, output_states2 = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell2,
                                                             cell_bw=gru_bw_cell2,
                                                             inputs= outputs1,
                                                             dtype=tf.float32,
                                                             time_major=False,
                                                             scope='LSTM2')
    '''
    #time_major=false,tensor的shape为[batch_size, max_time, depth]。实验中使用tf.concat(outputs, 2)将其拼接
    
    outputs = tf.concat(outputs1,2)
    outputs = tf.reshape(outputs, [-1, timesteps,2*cell_units1, 1])
    gru = tf.nn.max_pool(outputs,ksize=[1,timesteps,1,1], strides=[1,timesteps,1,1], padding='VALID', name='max_pool')
    gru = tf.reshape(gru, [-1,2*cell_units1])    
    '''
    # Attention layer
    gru, alphas = attention(outputs1, ATTENTION_SIZE, return_alphas=True)
    ''' 
    
    fully1 = tf.matmul(gru,fully1_weight) + fully1_bias
    #fully1 = batch_norm_wrapper(fully1,is_training)
    fully1 = leaky_relu(fully1, 0.01)
    #fully1 = batch_norm_wrapper(fully1,is_training) 
    fully1 = tf.nn.dropout(fully1, keep_prob)
    
    
    Ylogits = tf.matmul(fully1, fully2_weight) + fully2_bias
    #Ylogits = tf.nn.softmax(Ylogits)
    '''
    fully2 = tf.matmul(fully1,fully2_weight) + fully2_bias  
    fully2 = leaky_relu(fully2, 0.01)
    #fully2 = batch_norm_wrapper(fully2,is_training) 
    Ylogits = tf.matmul(fully2, fully3_weight) + fully3_bias
    #Ylogits = tf.nn.softmax(Ylogits)
    '''
    return Ylogits
    
def train_op(norm):
    STEPS = 50000
    batch_size = 60
    grad_clip = 5
    MODEL_SAVE_PATH = "./model2/"
    MODEL_NAME = "model.ckpt"
    X = tf.placeholder(tf.float32, shape=[None, 300,40,3])
    Y = tf.placeholder(tf.int32, shape=[None, 4])
    is_training = tf.placeholder(tf.bool)
    # variable learning rate
    lr = tf.placeholder(tf.float32)
    keep_prob = tf.placeholder(tf.float32)
    Ylogits = build_model(X, is_training, keep_prob)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels =  Y, logits =  Ylogits)
    cost = tf.reduce_mean(cross_entropy)
    #train_op = tf.train.AdamOptimizer(lr).minimize(cost)
    var_trainable_op = tf.trainable_variables()
    if norm == -1:
        # not apply gradient clipping
        train_op = tf.train.AdamOptimizer(lr).minimize(cost)            
    else:
        # apply gradient clipping
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, var_trainable_op), grad_clip)
        opti = tf.train.AdamOptimizer(lr)
        train_op = opti.apply_gradients(zip(grads, var_trainable_op))
    correct_pred = tf.equal(tf.argmax(Ylogits, 1), tf.argmax(Y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))   
    saver=tf.train.Saver(tf.global_variables())
    
    train_data,train_label,test_data,test_label,valid_data,valid_label = load_data()
    train_label = dense_to_one_hot(train_label,len(np.unique(train_label)))
    test_label = dense_to_one_hot(test_label,len(np.unique(test_label)))
    valid_label = dense_to_one_hot(valid_label,len(np.unique(valid_label)))
    max_learning_rate = 0.0001
    min_learning_rate = 0.000001
    decay_speed = 1600
    dataset_size = train_data.shape[0]
    # init
    init = tf.global_variables_initializer()
    best_acc = 0
    with tf.Session() as sess:
        sess.run(init)
        for i in range(STEPS):
            learning_rate = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-i/decay_speed)
            start = (i * batch_size) % dataset_size
            end = min(start+batch_size, dataset_size)
            if i % 5 == 0:
                loss, train_acc = sess.run([cost,accuracy],feed_dict = {X:valid_data, Y:valid_label,is_training:False, keep_prob:1})
                test_acc = sess.run(accuracy, feed_dict = {X:test_data, Y:test_label, is_training:False, keep_prob:1})
                if test_acc > best_acc:
                    best_acc = test_acc
                print "After %5d trainging step(s), validation cross entropy is %2.2g, validation accuracy is %3.2g, test accuracy is %3.2g, the best accuracy is %3.2g" %(i, loss, train_acc, test_acc, best_acc)
                saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME),global_step = i)
            sess.run(train_op, feed_dict={X:train_data[start:end,:,:,:], Y:train_label[start:end,:],
                                            is_training:True, keep_prob:1, lr:learning_rate})
                                    
if __name__=='__main__':
    train_op(1)
