In [1]:
# -*- coding: utf-8 -*-
import tensorflow as tf
from SpeechData import SpeechData
from tensorflow.python.ops import ctc_ops


In [2]:
# idx = tf.where(tf.not_equal(input_label, 0))
# # Use tf.shape(a_t, out_type=tf.int64) instead of a_t.get_shape() if tensor shape is dynamic
# sparse_labels = tf.SparseTensor(idx, tf.gather_nd(input_label, idx), tf.shape(input_label, out_type=tf.int64))



# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
# 
# Unfortunately we can't just use tf.gather_nd because it does not have gradients
# implemented yet, so we need this workaround.
#
def gather_nd(params, indices, shape):
    """Gather elements of `params` addressed by N-dimensional `indices`.

    Differentiable stand-in for `tf.gather_nd`: `params` is flattened and
    every coordinate tuple is converted to a row-major flat offset, so the
    lookup can be done with `tf.gather`.

    Args:
        params: Tensor to read from; its shape must be described by `shape`.
        indices: Integer tensor whose LAST axis holds one coordinate per
            dimension of `params` (i.e. its last dimension is `len(shape)`).
            Any number of leading axes is accepted.
        shape: Python list/tuple with the size of every dimension of
            `params`. Entries may be Python ints or scalar tensors.

    Returns:
        Tensor shaped like `indices` without its last axis, holding the
        selected elements of `params`.
    """
    rank = len(shape)
    flat_params = tf.reshape(params, [-1])

    # Row-major stride of an axis = product of the sizes of all later axes.
    # A plain loop is used instead of the Python-2-only builtin `reduce`, and
    # `*` keeps working when an entry of `shape` is a tensor.
    multipliers = []
    for axis in range(rank):
        stride = 1
        for dim in shape[axis + 1:]:
            stride = stride * dim
        multipliers.append(stride)

    # One tensor per coordinate. Unstacking along the last axis works for an
    # index tensor of any rank; the former transpose-with-a-rank-long
    # permutation only worked when `indices` had exactly `rank` axes.
    indices_unpacked = tf.unstack(indices, num=rank, axis=-1)

    flat_indices = sum([a * b for a, b in zip(multipliers, indices_unpacked)])
    return tf.gather(flat_params, flat_indices)

# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
#
# The CTC implementation in TensorFlow needs labels in a sparse representation,
# but sparse data and queues don't mix well, so we store padded tensors in the
# queue and convert to a sparse representation after dequeuing a batch.

def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
    """Convert a padded dense label batch into a `tf.SparseTensor`.

    Args:
        labels: 2-D tensor of padded labels. Its second dimension must equal
            the largest value in `label_lengths` (enforced by an assert op).
        label_lengths: 1-D tensor with the number of valid labels per row.
        batch_size: Size of the first dimension of `labels`; only used to
            describe the dense shape handed to `gather_nd`.

    Returns:
        A `tf.SparseTensor` with int64 indices and dense shape that keeps
        only the first `label_lengths[i]` entries of row `i`.
    """
    # Fail at run time when the padding width and the longest label disagree.
    width_matches = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
    with tf.control_dependencies([width_matches]):
        labels = tf.identity(labels)

    dense_shape = tf.shape(labels)
    row_count_1d = tf.stack([dense_shape[0]])
    col_count_1d = tf.stack([dense_shape[1]])

    def _positions_below(_, length):
        # Row vector that is True at every position smaller than `length`.
        positions = tf.expand_dims(tf.range(dense_shape[1]), 0)
        return positions < length

    # All-False seed of shape [1, width]; tf.scan only needs it for its
    # shape and dtype because the step function ignores the previous state.
    seed_mask = tf.expand_dims(tf.cast(tf.fill(col_count_1d, 0), tf.bool), 0)
    valid = tf.scan(_positions_below, label_lengths,
                    initializer=seed_mask, parallel_iterations=1)
    valid = valid[:, 0, :]

    # Column coordinate of every cell, reduced to the valid cells.
    col_grid = tf.reshape(tf.tile(tf.range(0, dense_shape[1]), row_count_1d),
                          dense_shape)
    col_ind = tf.boolean_mask(col_grid, valid)

    # Row coordinate of every cell, reduced to the valid cells.
    tiled_rows = tf.tile(tf.range(0, dense_shape[0]), col_count_1d)
    row_grid = tf.transpose(tf.reshape(tiled_rows, tf.reverse(dense_shape, [0])))
    row_ind = tf.boolean_mask(row_grid, valid)

    # Pair the coordinates up as [num_valid, 2] = (row, column).
    stacked = tf.concat([row_ind, col_ind], 0)
    indices = tf.transpose(tf.reshape(stacked, [2, -1]))

    values = gather_nd(labels, indices, [batch_size, tf.reduce_max(label_lengths)])

    return tf.SparseTensor(tf.to_int64(indices), values, tf.to_int64(dense_shape))


In [3]:
# set log level to debug
#tf.sg_verbosity(10)

#
# hyper parameters
#
batch_size = 16    # number of utterances per training batch
num_blocks = 3     # dilated blocks (not referenced by the code shown in this cell)
num_dim = 128      # latent dimension (not referenced by the code shown in this cell)
initial_learning_rate = 0.001   # learning rate (same value the training graph uses)
feature_height = 81   # height of the feature map fed to the convolution
#
# inputs
#

# SpeechData is a project-local loader; the attributes read below are the
# only parts of its interface this cell relies on.
data = SpeechData(batch_size=batch_size, data_path='SpeechData/')

# vocabulary size
voca_size = data.voca_size

# spectrum feature of audio
input_features = data.features
input_features_length = data.features_length

# target sentence label
input_labels = data.labels
input_labels_length = data.labels_length

# number of batches that make up one epoch
num_batch = data.num_batch


# Single %-formatted argument: prints "num_batch <n>" on Python 2 and 3 alike.
print('num_batch %s' % (num_batch,))

# Reshape input speech data  [Batch, Height, Width, Channel]
input_features = tf.reshape(input_features, shape=[batch_size, feature_height, -1, 1])
#input_feature.get_shape()


# Store layers weight & bias   --- Weights are called filter in convolution
# 9*5 filter  but stride of 2,2  this will help to find overlap regions
convolution_neuron_out_channels = 32
weights = {
    # filter layout: [filter_height, filter_width, in_channels, out_channels]
    'wc1': tf.Variable(tf.random_normal([9, 5, 1, convolution_neuron_out_channels]))
}

biases = {
    # one bias per output channel
    'bc1': tf.Variable(tf.random_normal([convolution_neuron_out_channels]))
}


# Create some wrappers for simplicity
def conv2d(x, W, b, strides):
    """Apply a SAME-padded NHWC 2-D convolution, add the bias, then ReLU.

    Args:
        x: Input tensor in NHWC layout.
        W: Convolution filter.
        b: Bias added to the convolution output.
        strides: Stride used for both spatial dimensions.

    Returns:
        The rectified, biased convolution output.
    """
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME', data_format="NHWC")
    return tf.nn.relu(tf.nn.bias_add(convolved, b))


def maxpool2d(x, shift):
    """Max-pool `x` with a square `shift` x `shift` window and equal stride.

    SAME padding is used, and the batch and channel axes are left untouched
    (window and stride of 1 on both).
    """
    window = [1, shift, shift, 1]
    step = [1, shift, shift, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')


# x = tf.reshape(x, shape=[-1, 81, None, 1])
# Convolution Layer
# Stride-2 convolution followed by 2x2 max pooling, both SAME-padded, so the
# 81-row input height shrinks to 41 and then to 21.
conv1 = conv2d(input_features, weights['wc1'], biases['bc1'], strides=2)
conv1 = maxpool2d(conv1, shift=2)

conv1 = tf.transpose(conv1, perm=[0, 1, 3, 2])
#Above shape is TensorShape([Dimension(16), Dimension(21), Dimension(32), Dimension(None)]): 9*5 filter 2*2 stride
# NOTE(review): the reshape below is a plain row-major reinterpretation of a
# [batch, 21, 32, width] tensor as [batch, 32, -1]; the 32 rows of the result
# therefore mix height rows and channels instead of holding one channel each.
# Presumably not intended -- confirm before relying on the feature layout.
conv1 = tf.reshape(conv1, shape=[batch_size, convolution_neuron_out_channels, -1])
#Above shape is TensorShape([Dimension(16), Dimension(32), Dimension(None)])


# RNN Layers

num_neurons = 200
num_layers = 2
# Passed as output_keep_prob below, i.e. the probability of KEEPING a unit.
dropout = 0.8
# The number of characters in the target language plus one ===   (a-z + space + one extra)
# NOTE(review): voca_size read from SpeechData is not used here; confirm that
# it agrees with this hard-coded value.
out_size = 28
relu_clip = 20


weight = tf.Variable(tf.truncated_normal([num_neurons, out_size], stddev=0.1))
bias = tf.Variable(tf.constant(0.1, shape=[out_size]))


def _make_rnn_cell():
    """Build one GRU layer wrapped with output dropout."""
    gru = tf.contrib.rnn.GRUCell(num_neurons)  # Or LSTMCell(num_neurons)
    return tf.contrib.rnn.DropoutWrapper(gru, output_keep_prob=dropout)


# Each layer needs its own cell object. `[cell] * num_layers` hands the SAME
# object to every layer, which TensorFlow 1.1+ rejects or turns into weight
# sharing between layers.
cell = tf.contrib.rnn.MultiRNNCell([_make_rnn_cell() for _layer in range(num_layers)])

# Batch size x max_length x num_neurons.
conv1 = tf.transpose(conv1, perm=[0, 2, 1])
output, state = tf.nn.dynamic_rnn(cell, conv1, dtype=tf.float32, time_major=False)

#Batch size x max_length x num_neurons.
max_length = tf.shape(output)[1]

# Flatten time into the batch axis so one matmul projects every step.
output = tf.reshape(output, [-1, num_neurons])
prediction = tf.minimum(tf.nn.relu(tf.matmul(output, weight) + bias), relu_clip)
prediction = tf.reshape(prediction, [batch_size, max_length, out_size])


#CTC Layer

#Dense to sparse vector conversion
sparse_labels = ctc_label_dense_to_sparse(input_labels, input_labels_length, batch_size)

# Every row of `prediction` has the same (padded) number of time steps, so the
# per-example length vector is that value repeated batch_size times.
# NOTE(review): input_features_length is not consulted, so padded frames are
# treated as real input -- confirm this is intended.
transformed_sequence_length = tf.fill([batch_size], max_length)

loss_from_ctc = ctc_ops.ctc_loss(inputs=prediction, labels=sparse_labels, sequence_length=transformed_sequence_length, time_major=False)


#Backpropagation Layer
cost = tf.reduce_mean(loss_from_ctc)
# Use the configured hyper-parameter instead of repeating the literal 0.001.
optimizer = tf.train.AdamOptimizer(initial_learning_rate).minimize(cost)


# Number of passes over the data set.
num_epochs = 1

with tf.Session() as sess:

    # initialize the variables
    sess.run(tf.global_variables_initializer())

    # initialize the queue threads to start to shovel data
    coord = tf.train.Coordinator()

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        for epoch in range(num_epochs):
            epoch_loss = 0
            for step in range(num_batch):
                # Single %-formatted arguments keep the printed text identical
                # on Python 2 and Python 3.
                print('Epoch: %s  Step: %s' % (epoch, step))
                _, c = sess.run([optimizer, cost])
                print('Step: %s  Loss: %s' % (step, c))
                epoch_loss += c
            print('Epoch: %s  AccLoss: %s' % (epoch, epoch_loss))
        print('Finished.')
    except Exception as e:
        # Report exceptions to the coordinator; coord.join() below re-raises
        # them once the queue threads have stopped.
        coord.request_stop(e)
    finally:
        # Stop our queue threads. The session itself is closed by the
        # enclosing `with` block, so no explicit sess.close() is needed.
        coord.request_stop()
        coord.join(threads)




Base path from where data will be read: SpeechData/
Loading the speech metadata from path: SpeechData/
Load data from enabled directory with training flag on : ['SpeechData/train/1/', 'SpeechData/train/2/']
3195
Wrong media files:
[]
wav file length after validation
3195
Length CHeck Failed: 
[]
Wrong transcription. contains numeric or special character
[]
set([])
set([])
3195
3195
num_batch 199
Epoch: 0  Step: 0
Step: 0  Loss: 2012.21
Epoch: 0  Step: 1
Step: 1  Loss: 1916.31
Epoch: 0  Step: 2
Step: 2  Loss: 1641.97
Epoch: 0  Step: 3
Step: 3  Loss: 1264.85
Epoch: 0  Step: 4
Step: 4  Loss: 921.767
Epoch: 0  Step: 5
Step: 5  Loss: 764.682
Epoch: 0  Step: 6
Step: 6  Loss: 478.825
Epoch: 0  Step: 7
Step: 7  Loss: 169.54
Epoch: 0  Step: 8
Step: 8  Loss: 137.999
Epoch: 0  Step: 9
Step: 9  Loss: 181.184
Epoch: 0  Step: 10
Step: 10  Loss: 113.739
Epoch: 0  Step: 11
Step: 11  Loss: 146.989
Epoch: 0  Step: 12
Step: 12  Loss: 171.294
Epoch: 0  Step: 13
Step: 13  Loss: 215.249
Epoch: 0  Step: 14
S