In [None]:
import tensorflow as tf
import numpy as np
import h5py

# interactive notebook session
# An InteractiveSession installs itself as the default session when it is
# constructed, which is why later cells can call `.run()` on an op (e.g. the
# variable initializer) without passing a session explicitly.
sess = tf.InteractiveSession()

## utility functions

In [None]:
def chunk_iterator(array, chunk_size=1, drop_remainder=True):
    """Yield consecutive slices of `array` taken along its first dimension.

    Args:
        array: object exposing `.shape[0]` and supporting slice indexing
            (e.g. a numpy array).
        chunk_size: number of leading-axis entries per chunk; must be
            positive.
        drop_remainder: when True (the default, matching the original
            behaviour) a trailing chunk shorter than `chunk_size` is
            discarded; when False it is yielded as a final, shorter chunk.

    Yields:
        `array[start:start + chunk_size]` for each chunk, in order.

    Raises:
        ValueError: if `chunk_size` is not positive. Because this is a
            generator, the error surfaces on the first iteration rather than
            at call time.
    """
    # fail loudly instead of a ZeroDivisionError (0) or a silent no-op (< 0)
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive, got %r' % (chunk_size,))
    length = array.shape[0]
    # when dropping the remainder, stop at the last full multiple of chunk_size
    stop = length - length % chunk_size if drop_remainder else length
    for start in range(0, stop, chunk_size):
        yield array[start:start + chunk_size]
        

def batch_iterator(source, batch_size=32, num_epochs=1):
    """Yield shuffled `(seq, atac, label)` mini-batches from an HDF5 file.

    The file must contain datasets named 'seq', 'atac' and 'label'; the
    number of examples is taken from the first dimension of 'label'. For each
    epoch a fresh random permutation of the example indices is drawn from
    numpy's global RNG (call `np.random.seed` beforehand for reproducible
    ordering) and cut into batches of `batch_size`.

    Args:
        source: path of the HDF5 file, opened read-only.
        batch_size: number of examples per yielded batch.
        num_epochs: number of passes over the data.

    Yields:
        Tuples `(seq[indices], atac[indices], label[indices])`. Indices are
        sorted within each batch, so examples inside a batch appear in file
        order. A trailing group of fewer than `batch_size` examples is
        dropped each epoch, so nothing is yielded when the file holds fewer
        than `batch_size` examples.
    """
    # The context manager guarantees the file handle is released once the
    # generator is exhausted, explicitly closed, or garbage-collected; the
    # handle was previously left open for the life of the kernel.
    with h5py.File(source, 'r') as data:
        seq, atac = data['seq'], data['atac']
        label = data['label']
        num_examples = label.shape[0]
        for _ in range(num_epochs):
            permutation = np.random.permutation(num_examples)
            for indices in chunk_iterator(permutation, batch_size):
                # h5py datasets require sorted point-wise indexing
                indices = sorted(indices)
                yield seq[indices], atac[indices], label[indices]

## training setup

In [36]:
# training hyper-parameters
BATCH_SIZE = 256  # examples per mini-batch requested from batch_iterator
NUM_EPOCHS = 50  # number of shuffled passes over the dataset
LEARNING_RATE = 1e-3  # step size handed to the gradient-descent optimizer
LOGGING_FREQUENCY = 100  # iteration modulus used by the training loop's logging check

# import data as a shuffle-batch iterator
# NOTE: batch_iterator is a generator function, so nothing is read here; the
# HDF5 file is only opened once iteration starts. The generator is single-pass:
# after the training loop has consumed it, re-run this cell to rebuild it.
# NOTE(review): the path is relative to the notebook's working directory and
# points into a `deleteme` folder -- confirm the data still lives there.
CEBPB_A549 = batch_iterator(
    source='../../deleteme/results-hdf5/CEBPB-A549.hdf5',
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS)

## logistic-regression sanity check

In [None]:
# define symbolic placeholders (leading dimension is the batch size)
sy_seq_n = tf.placeholder(dtype=tf.float32, shape=[None, 1000, 5])
sy_atac_n = tf.placeholder(dtype=tf.float32, shape=[None, 1000])
sy_label_n = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# concatenate one-hot encoded seq with atac counts -> [batch, 1000, 6]
sy_input_n = tf.concat([sy_seq_n, tf.expand_dims(sy_atac_n, axis=-1)], axis=-1)

# flatten input tensor data -> [batch, 6000]
sy_input_n = tf.contrib.layers.flatten(sy_input_n)
# logistic regression to sanity-check data: a single linear unit whose output
# is the pre-sigmoid logit, shape [batch, 1]
sy_logit_n = tf.layers.dense(sy_input_n, units=1, activation=None)

# optimizer configuration
sy_loss = tf.losses.sigmoid_cross_entropy(
    multi_class_labels=sy_label_n,
    logits=sy_logit_n)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
train_op = optimizer.minimize(sy_loss)

# define logging metrics
# Accuracy must compare labels with hard class predictions, not raw logits:
# real-valued logits almost never equal a label exactly, so the previous
# comparison reported ~0 accuracy regardless of model quality. Since
# sigmoid(z) > 0.5 exactly when z > 0, thresholding the logit at zero gives
# the predicted class.
# NOTE(review): assumes labels are exactly 0.0 / 1.0 -- confirm against the data.
sy_prediction_n = tf.cast(tf.greater(sy_logit_n, 0.0), dtype=tf.float32)
sy_accuracy = tf.reduce_mean(tf.cast(
    tf.equal(x=sy_label_n, y=sy_prediction_n),
    dtype=tf.float32))

# initialize variables (runs in the default session)
tf.global_variables_initializer().run()

## training loop

In [None]:
# begin the training loop; each item drawn from the iterator is one mini-batch
for iteration, (seq, atac, label) in enumerate(CEBPB_A549):
    # perform a single gradient step for the batch; loss and accuracy are
    # fetched in the same run call as the update
    loss, accuracy, _ = sess.run(
        fetches=[sy_loss, sy_accuracy, train_op],
        feed_dict={
            sy_seq_n: seq,
            sy_atac_n: atac,
            sy_label_n: label,
        })
    # log the cross-entropy loss and accuracy once every LOGGING_FREQUENCY
    # iterations (the bare modulo test was inverted: it printed on every
    # iteration except the multiples of LOGGING_FREQUENCY)
    if iteration % LOGGING_FREQUENCY == 0:
        print('iteration %i: loss=%.4f, accuracy=%.4f' % (iteration, loss, accuracy))