In [1]:
import tensorflow as tf
import numpy as np
import random
import h5py
import sys
import os

#Limit available GPUs to 1 (first gpu)
os.environ["CUDA_VISIBLE_DEVICES"]="0"

  from ._conv import register_converters as _register_converters


# **Data Generator**

In [2]:
class Generator:
    """Iterable over ``(sample, label)`` pairs read from a directory of CSV files.

    The dataset folder should be organised as follows, one sub-directory per
    class, each holding the ``.csv`` samples of that class::

        data_path
            class 1
                sample 1
                sample 2
                ...
            class 2
                sample 1
                sample 2
                ...

    Class sub-directories are sorted by name and labelled ``0, 1, 2, ...`` in
    that order.

    Attributes set by ``__init__``:
        dataset_path: root directory that was scanned.
        shape: expected ``ndarray.shape`` of a sample; other samples are skipped.
        shuffle: whether every new iteration reshuffles the sample order.
        datapaths: list of CSV file paths.
        labels: list of integer labels, parallel to ``datapaths``.
        num_sample: number of CSV files found (before any shape filtering).
    """

    def __init__(self, dataset_path, shape, shuffle=True):
        """Scan ``dataset_path`` and remember the iteration settings.

        Args:
            dataset_path: root directory laid out as described on the class.
            shape: shape every yielded sample must have.
            shuffle: if true, paths and labels are shuffled together at the
                start of every iteration.
        """
        self.dataset_path = dataset_path
        self.shape = shape
        # The flag must live on the instance: __iter__ runs long after
        # __init__ returned, so a bare ``shuffle`` name is not in scope there.
        self.shuffle = shuffle
        self.datapaths = list()
        self.labels = list()
        self.read_samples()
        self.num_sample = len(self.datapaths)

    def read_samples(self):
        """Populate ``datapaths`` / ``labels`` from the dataset directory."""
        class_names = sorted(next(os.walk(self.dataset_path))[1])

        # One label per class sub-directory, in sorted-name order
        for label, class_name in enumerate(class_names):
            class_dir = os.path.join(self.dataset_path, class_name)
            # Sort file names so the unshuffled order is reproducible
            # (os.walk returns entries in arbitrary file-system order).
            for filename in sorted(next(os.walk(class_dir))[2]):
                # Only keep csv samples
                if filename.endswith('.csv'):
                    self.datapaths.append(os.path.join(class_dir, filename))
                    self.labels.append(label)

    def combined_shuffle(self):
        """Shuffle ``datapaths`` and ``labels`` in place, keeping pairs aligned."""
        if not self.datapaths:
            # zip(*[]) yields nothing to unpack, so an empty dataset is a no-op
            return
        combined = list(zip(self.datapaths, self.labels))
        random.shuffle(combined)
        self.datapaths[:], self.labels[:] = zip(*combined)

    def __iter__(self):
        """Yield ``(data, label)`` for every sample whose shape matches.

        Called by the TensorFlow dataset machinery each time a new pass over
        the data starts. ``data`` is the numpy array parsed from the CSV file;
        samples whose shape differs from ``self.shape`` are silently skipped.
        """
        if self.shuffle:
            self.combined_shuffle()

        # ndarray.shape is a tuple; normalise so a list-valued shape still matches
        expected_shape = tuple(self.shape)

        for i in range(self.num_sample):
            # Passing the path lets numpy open and close the file itself
            data = np.loadtxt(self.datapaths[i], delimiter=",")
            if data.shape == expected_shape:
                yield data, self.labels[i]

# **Seq2Seq Autoencoder**

In [None]:
#Seq2Seq Autoencoder with a single LSTM for both encoder and decoder,
#along with optional rollout for the decoder LSTM
def autoencoder(inputs, reuse, sequence_length, input_width, rollout):
    """Build the seq2seq LSTM autoencoder graph.

    The encoder and the decoder are both driven by ``tf.nn.raw_rnn`` with the
    loop functions defined below. The decoder starts from the encoder's final
    cell state; at every later step it is fed either its own previous output
    (rollout) or the previous time step of ``inputs``.

    Args:
        inputs: float32 tensor that is unstacked along axis 0 into one slice
            per time step. The cells in this notebook pass it transposed to
            (sequence_length, batch, input_width).
        reuse: forwarded to ``tf.variable_scope('autoencoder', reuse=...)``.
        sequence_length: number of time steps; used as the TensorArray size
            and as the threshold for the "sequence finished" test.
        input_width: width of one time step; also used as the number of units
            of both LSTM cells.
        rollout: boolean tensor used as the ``tf.cond`` predicate that selects
            the decoder input.

    Returns:
        ``(outputs, encoder_cell_states)``: the stacked decoder outputs and
        the final state returned by the encoder ``raw_rnn`` call.

    Note:
        The loop functions read a module-level ``batch_size`` that is NOT a
        parameter of this function; it must be defined before the call and
        match the batch dimension of ``inputs``.
    """

    def encoder_loop_fn(time, cell_output, cell_state, loop_state):
        """raw_rnn loop function for the encoder: feed ``inputs`` step by step."""
        emit_output = cell_output  # == None for time == 0

        if cell_output is None:  # time == 0
            next_cell_state = encoder_cell.zero_state(batch_size, tf.float32)
        else:
            next_cell_state = cell_state

        # Bool to check if sequence is over
        elements_finished = (time >= sequence_length)
        finished = tf.reduce_all(elements_finished)

        # Input to lstm at each time step; once finished there is nothing
        # left to read from the TensorArray, so feed zeros instead
        next_input = tf.cond(finished,
                             lambda: tf.zeros([batch_size, input_width], dtype=tf.float32),
                             lambda: inputs_ta.read(time))

        next_loop_state = None

        return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

    def decoder_loop_fn(time, cell_output, cell_state, loop_state):
        """raw_rnn loop function for the decoder: rollout or feed previous input."""
        emit_output = cell_output  # == None for time == 0

        # Bool to check if sequence is over
        elements_finished = (time >= sequence_length)

        # Input to lstm at each time step, cell state
        if cell_output is None:
            # First step: start from the encoder state with an all-zero input
            next_cell_state = encoder_cell_states
            next_input = tf.zeros([batch_size, input_width], dtype=tf.float32)
        else:
            next_cell_state = cell_state
            # rollout -> previous decoder output; otherwise previous input step
            next_input = tf.cond(rollout,
                                 lambda: cell_output,
                                 lambda: inputs_ta.read(time-1))

            next_input.set_shape([batch_size, input_width])

        next_loop_state = None

        return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

    # Define a scope for reusing the variables
    with tf.variable_scope('autoencoder', reuse=reuse):

        # Convert sample into tensor array; clear_after_read=False because the
        # encoder and the decoder both read from it
        inputs_ta = tf.TensorArray(dtype=tf.float32, size=sequence_length, clear_after_read=False)
        inputs_ta = inputs_ta.unstack(inputs)

        with tf.name_scope('encoder'):
            # Encoder lstm
            encoder_cell = tf.contrib.rnn.LSTMCell(input_width)
            _, encoder_cell_states, _ = tf.nn.raw_rnn(encoder_cell, encoder_loop_fn)

        with tf.name_scope('decoder'):
            # Decoder lstm
            decoder_cell = tf.contrib.rnn.LSTMCell(input_width)
            decoder_hidden_states_ta, _, _ = tf.nn.raw_rnn(decoder_cell, decoder_loop_fn)

        # Convert lstm output array into a tensor
        outputs = decoder_hidden_states_ta.stack()

        return outputs, encoder_cell_states

# **Training**

In [None]:
# ---- Training configuration ----
data_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/train"
initial_learning_rate = 0.001
rollout_status = False   # value fed to the `rollout` placeholder; toggled during training
sequence_length = 8000
input_width = 540
decay_rate = 0.6
batch_size = 8           # also read as a global by autoencoder()
data_steps = 850
data_steps = data_steps//batch_size   # mini-batches per epoch
decay_steps = data_steps-1
rollout_step = data_steps-1
save_epoch = 2
epochs = 10

#reset graph
tf.reset_default_graph()

with tf.name_scope('input'):
    # Graph input and label
    gen = Generator(data_path, (sequence_length, input_width))
    # The generator yields (data, label) pairs, so the dataset must declare
    # both components. repeat() restarts the generator (and its shuffle) when
    # a pass ends; without it the one-shot iterator raises OutOfRangeError
    # after the first epoch.
    dataset = (tf.data.Dataset.from_generator(lambda: gen,
                                              output_types=(tf.float32, tf.int32),
                                              output_shapes=((sequence_length, input_width), ()))
               .repeat()
               .prefetch(2 * batch_size)
               .batch(batch_size))
    iterator = dataset.make_one_shot_iterator()
    inputs, _ = iterator.get_next()   # labels are not needed for reconstruction
    # (batch, time, width) -> (time, batch, width), the layout autoencoder() unstacks
    inputs = tf.transpose(inputs, perm=[1, 0, 2])

with tf.name_scope('rollout'):
    #variable to control rnn rollout
    rollout = tf.placeholder(tf.bool)

# autoencoder model
outputs, _ = autoencoder(inputs=inputs,
                         reuse=tf.AUTO_REUSE,
                         sequence_length=sequence_length,
                         input_width=input_width,
                         rollout=rollout)

with tf.name_scope('loss'):
    # Define loss: mean squared reconstruction error
    loss_op = tf.reduce_mean(tf.square(inputs - outputs))
    tf.summary.scalar("loss", loss_op)

with tf.name_scope('optimizer'):
    #global step counter
    global_step = tf.Variable(0, trainable=False)

    #learning rate decay
    learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                               global_step,
                                               decay_steps,
                                               decay_rate,
                                               staircase=True)
    tf.summary.scalar("learning_rate", learning_rate)

    #adam optimizer
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op, global_step=global_step)

# Initialize the variables, model saver, merge tensorboard summaries
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)
merged = tf.summary.merge_all()

print("Training")
# Start
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    #tensorboard writer
    writer = tf.summary.FileWriter('/home/kalvik/shared/autoencoder/tensorboard/train', sess.graph)

    # Keep training until reach max iterations
    for epoch in range(1, epochs + 1):
        for step in range(1, data_steps + 1):

            # Run optimization op (backprop)
            _, loss, summary = sess.run([train_op, loss_op, merged], feed_dict={rollout:rollout_status})
            writer.add_summary(summary, ((epoch-1) * data_steps) + step)

            #Print loss
            sys.stdout.write("\rEpoch: {}, Mini Batch: {}, Loss: {:.4f}".format(epoch, step, loss))
            sys.stdout.flush()

            #change rollout status
            if step % rollout_step == 0:
                rollout_status = not rollout_status
                print("Rollout status : {}".format(rollout_status))

        print("\n")

        #save model
        if epoch % save_epoch == 0:
            saver.save(sess, '/home/kalvik/shared/autoencoder/weights/autoencoder_loss-{}_epoch-{}'.format(loss, epoch), global_step=(((epoch-1) * data_steps) + step))

    #save the final model, unless the last epoch was already checkpointed above
    if epochs >= 1 and epochs % save_epoch != 0:
        saver.save(sess, '/home/kalvik/shared/autoencoder/weights/autoencoder_loss-{}_epoch-{}'.format(loss, epoch), global_step=((epoch-1) * data_steps) + step)

    # Flush pending summaries to disk
    writer.close()

    print("Finished!")

# **Testing**

In [None]:
# ---- Testing configuration ----
data_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/test"
# Same directory the training cell writes its checkpoints to
checkpoint_dir = '/home/kalvik/shared/autoencoder/weights'
rollout_status = True
sequence_length = 8000
input_width = 540
batch_size = 8           # also read as a global by autoencoder()
data_steps = 150
data_steps = data_steps//batch_size   # number of mini-batches to evaluate

#reset graph
tf.reset_default_graph()

with tf.name_scope('input'):
    # Graph input and label
    gen = Generator(data_path, (sequence_length, input_width))
    # The generator yields (data, label) pairs, so both components are declared
    dataset = (tf.data.Dataset.from_generator(lambda: gen,
                                              output_types=(tf.float32, tf.int32),
                                              output_shapes=((sequence_length, input_width), ()))
               .prefetch(2 * batch_size)
               .batch(batch_size))
    iterator = dataset.make_one_shot_iterator()
    inputs, _ = iterator.get_next()   # labels are not needed for reconstruction
    # (batch, time, width) -> (time, batch, width), the layout autoencoder() unstacks
    inputs = tf.transpose(inputs, perm=[1, 0, 2])

with tf.name_scope('rollout'):
    #variable to control rnn rollout
    rollout = tf.placeholder(tf.bool)

# autoencoder model
outputs, _ = autoencoder(inputs=inputs,
                         reuse=tf.AUTO_REUSE,
                         sequence_length=sequence_length,
                         input_width=input_width,
                         rollout=rollout)

with tf.name_scope('loss'):
    # Define loss: mean squared reconstruction error
    loss_op = tf.reduce_mean(tf.square(inputs - outputs))
    tf.summary.scalar("loss", loss_op)

# Initialize the variables, model saver, merge tensorboard summaries
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)
checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint
    raise ValueError("No checkpoint found in '{}'; run the training cell first".format(checkpoint_dir))
merged = tf.summary.merge_all()

print("Testing")
# Start
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    #load model weights from checkpoint
    saver.restore(sess, checkpoint)

    #tensorboard writer
    writer = tf.summary.FileWriter('/home/kalvik/shared/autoencoder/tensorboard/test', sess.graph)

    for step in range(1, data_steps + 1):

        # Evaluate the reconstruction loss only (no training op is run)
        loss, summary = sess.run([loss_op, merged], feed_dict={rollout:rollout_status})
        writer.add_summary(summary, step)
        print("Mini Batch: {}, Loss: {:.4f}".format(step, loss))

    # Flush pending summaries to disk
    writer.close()

    print("Finished!")

# **Sampling**

In [None]:
# Fresh accumulators for the encoder states and their labels;
# the sampling cells append one entry per mini-batch.
encoded_X, encoded_y = [], []

In [None]:
# ---- Sampling configuration (test split) ----
data_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/test"
# Same directory the training cell writes its checkpoints to
checkpoint_dir = '/home/kalvik/shared/autoencoder/weights'
rollout_status = True
sequence_length = 8000
input_width = 540
batch_size = 8           # also read as a global by autoencoder()
data_steps = 150
data_steps = data_steps//batch_size   # number of mini-batches to encode

#reset graph
tf.reset_default_graph()

with tf.name_scope('input'):
    # Graph input; shuffle=False keeps the samples in the generator's stored order
    gen = Generator(data_path, (sequence_length, input_width), shuffle=False)
    dataset = (tf.data.Dataset.from_generator(lambda: gen,
                                              output_types=(tf.float32, tf.int32))
               .prefetch(2 * batch_size)
               .batch(batch_size))
    iterator = dataset.make_one_shot_iterator()
    inputs, labels = iterator.get_next()
    # No output_shapes were declared, so pin the static shape by hand
    inputs.set_shape([None, sequence_length, input_width])
    # (batch, time, width) -> (time, batch, width), the layout autoencoder() unstacks
    inputs = tf.transpose(inputs, perm=[1, 0, 2])

with tf.name_scope('rollout'):
    #variable to control rnn rollout
    rollout = tf.placeholder(tf.bool)

# autoencoder model; only the encoder state is needed here
_, encoder_cell_states = autoencoder(inputs=inputs,
                                     reuse=tf.AUTO_REUSE,
                                     sequence_length=sequence_length,
                                     input_width=input_width,
                                     rollout=rollout)

# Initialize the variables, model saver
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)
checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint
    raise ValueError("No checkpoint found in '{}'; run the training cell first".format(checkpoint_dir))

print("Sampling")
# Start
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    #load model weights
    saver.restore(sess, checkpoint)

    for step in range(data_steps):
        # Encode one mini-batch (no training op is run)
        X, y = sess.run([encoder_cell_states, labels], feed_dict={rollout:rollout_status})

        # step is 0-based, so report the batches completed so far
        sys.stdout.write("\r{}% complete".format(((step + 1)/data_steps)*100))
        sys.stdout.flush()

        encoded_X.append(X)
        encoded_y.append(y)

    print("Finished!")

In [None]:
# ---- Sampling configuration (train split) ----
data_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/train"
# Same directory the training cell writes its checkpoints to
checkpoint_dir = '/home/kalvik/shared/autoencoder/weights'
rollout_status = True
sequence_length = 8000
input_width = 540
batch_size = 8           # also read as a global by autoencoder()
# NOTE(review): 150 matches the test-split cell, while the training cell uses
# 850 for this same directory. With shuffle=False only the first
# (150 // batch_size) * batch_size samples are encoded -- confirm this is intended.
data_steps = 150
data_steps = data_steps//batch_size   # number of mini-batches to encode

#reset graph
tf.reset_default_graph()

with tf.name_scope('input'):
    # Graph input; shuffle=False keeps the samples in the generator's stored order
    gen = Generator(data_path, (sequence_length, input_width), shuffle=False)
    dataset = (tf.data.Dataset.from_generator(lambda: gen,
                                              output_types=(tf.float32, tf.int32))
               .prefetch(2 * batch_size)
               .batch(batch_size))
    iterator = dataset.make_one_shot_iterator()
    inputs, labels = iterator.get_next()
    # No output_shapes were declared, so pin the static shape by hand
    inputs.set_shape([None, sequence_length, input_width])
    # (batch, time, width) -> (time, batch, width), the layout autoencoder() unstacks
    inputs = tf.transpose(inputs, perm=[1, 0, 2])

with tf.name_scope('rollout'):
    #variable to control rnn rollout
    rollout = tf.placeholder(tf.bool)

# autoencoder model; only the encoder state is needed here
_, encoder_cell_states = autoencoder(inputs=inputs,
                                     reuse=tf.AUTO_REUSE,
                                     sequence_length=sequence_length,
                                     input_width=input_width,
                                     rollout=rollout)

# Initialize the variables, model saver
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)
checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint
    raise ValueError("No checkpoint found in '{}'; run the training cell first".format(checkpoint_dir))

print("Sampling")
# Start
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    #load model weights
    saver.restore(sess, checkpoint)

    for step in range(data_steps):
        # Encode one mini-batch (no training op is run)
        X, y = sess.run([encoder_cell_states, labels], feed_dict={rollout:rollout_status})

        # step is 0-based, so report the batches completed so far
        sys.stdout.write("\r{}% complete".format(((step + 1)/data_steps)*100))
        sys.stdout.flush()

        encoded_X.append(X)
        encoded_y.append(y)

    print("Finished!")

# **Saving Encoded Dataset**

In [None]:
# Turn the per-batch results gathered by the sampling cells into numpy arrays
encoded_X, encoded_y = (np.array(collected) for collected in (encoded_X, encoded_y))

# Show the resulting array shapes
print(encoded_X.shape, encoded_y.shape)

In [None]:
# Write the encoded samples and their labels to an HDF5 file.
# The context manager closes the file even if create_dataset raises.
with h5py.File('encoded_dataset.h5', 'w') as hf:
    hf.create_dataset('data', data=encoded_X)
    hf.create_dataset('labels', data=encoded_y)