In [1]:
import collections

import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

In [2]:
try:
    sess.close()
except:
    pass
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Data Prep

In [3]:
train_data = {
    "deepsea_outputs": np.load("train_outputs.npy"),
    "deepsea_logits": np.load("train_logits.npy"),
    "y": np.load("train_targets.npy")
}

valid_data = {
    "deepsea_outputs": np.load("valid_outputs.npy"),
    "deepsea_logits": np.load("valid_logits.npy"),
    "y": np.load("valid_targets.npy")
}

test_data = {
    "deepsea_outputs": np.load("test_outputs.npy"),
    "deepsea_logits": np.load("test_logits.npy"),
    "y": np.load("test_targets.npy")
}

# The validation set isn't big enough, so we take some from the training data.

valid_data = {
    "deepsea_outputs": np.concatenate([train_data["deepsea_outputs"][2200000:2400000],train_data["deepsea_outputs"][4200000:4400000],valid_data["deepsea_outputs"]], axis=0),
    "deepsea_logits": np.concatenate([train_data["deepsea_logits"][2200000:2400000],train_data["deepsea_logits"][4200000:4400000],valid_data["deepsea_logits"]], axis=0),
    "y": np.concatenate([train_data["y"][2200000:2400000],train_data["y"][4200000:4400000],valid_data["y"]], axis=0),
}

train_data = {
    "deepsea_outputs": np.concatenate([train_data["deepsea_outputs"][0:2200000],train_data["deepsea_outputs"][2400000:4200000]], axis=0),
    "deepsea_logits": np.concatenate([train_data["deepsea_logits"][0:2200000],train_data["deepsea_logits"][2400000:4200000]], axis=0),
    "y": np.concatenate([train_data["y"][0:2200000],train_data["y"][2400000:4200000]], axis=0),
} 

In [5]:
def make_dataset(data,
                 batch_size,
                 shuffle_size,
                 prefetch_size,
                 generator_fn):
    
    x_shape, gen = generator_fn(data)
    
    dataset = tf.data.Dataset.from_generator(
        gen,
        output_types=(tf.float32,)*3,
        output_shapes=(data["y"].shape[1], x_shape, data["y"].shape[1],)
    )
    dataset = dataset.batch(batch_size)
    dataset = dataset.shuffle(shuffle_size)
    dataset = dataset.repeat()
    dataset = dataset.prefetch(prefetch_size)
    return dataset.make_one_shot_iterator()

# Generators output (DeepSea's probs, model inputs, ground truth)

def identity_gen(data):
    def g():
        for probs, logits, y in zip(data["deepsea_outputs"], data["deepsea_logits"], data["y"]):
            yield probs, logits, y
    return data["deepsea_logits"].shape[1], g

def random_mask(p, shape):
    return lambda: np.random.binomial(1, p, shape)

def dnase_mask():
    return np.array([1] + 18 * [0])

def position_mask(ids):
    out = np.zeros(19)
    out[ids] = 1
    return lambda: out

def imputation_gen(data, mask_fn=random_mask(.1, 19), x_shape=19*3):
    def g():
        for probs, logits, y in zip(data["deepsea_outputs"], data["deepsea_logits"], data["y"]):
            mask = mask_fn()
            x = y * mask + probs * (1 - mask)
            x = np.concatenate([logits, x, mask], axis=0)
            yield probs, x, y
    return x_shape, g
            
def paramaterized_imputation_gen(mask_fn, x_shape):
    return lambda data: imputation_gen(data, mask_fn, x_shape)



# Model definition

In [7]:
class Model():
    def __init__(self,
                 train_data,
                 valid_data,
                 num_outputs,
                 batch_size=64,
                 shuffle_size=10000,
                 prefetch_size=10,
                 l1=0.,
                 l2=0.,
                 lr=1e-3,
                 generator_fn=identity_gen,
                 masking=True):
        
        tf.logging.set_verbosity(tf.logging.INFO)
        
        train_iter = make_dataset(train_data, batch_size, shuffle_size, prefetch_size, generator_fn)
        valid_iter = make_dataset(valid_data, batch_size, 1           , prefetch_size, generator_fn)
        self.train_handle = train_iter.string_handle()
        self.valid_handle = valid_iter.string_handle()
        self.handle = tf.placeholder(tf.string, shape=[])
        iterator = tf.data.Iterator.from_string_handle(
            self.handle, train_iter.output_types, train_iter.output_shapes)
        self.deepsea_pred, self.x, self.y = iterator.get_next()
        
        self.num_outputs = num_outputs
        self.l1, self.l2 = l1, l2
        self.default_lr = lr
        self.lr = tf.placeholder(tf.float32)
        self.batch_size = batch_size
        self.prefetch_size = prefetch_size
        self.masking = masking
        
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.logits = self.body_fn()
        self.predictions = tf.sigmoid(self.logits)
        self.loss = self.loss_fn()
        self.min = self.minimizer_fn()
        
    def make_alt_dataset(self, sess, data, batch_size, shuffle_size, prefetch_size, generator_fn):
        h = make_dataset(data, batch_size, shuffle_size, prefetch_size, generator_fn).string_handle()
        return sess.run(h)
                
    def body_fn(self):
        raise NotImplementedError()
    
    def loss_fn(self):
        return tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(self.y, self.logits, 50))
    
    def minimizer_fn(self):
        self.opt = tf.train.AdamOptimizer(self.lr)
        return self.opt.minimize(self.loss, self.global_step)
        
    def train(self, sess, num_steps, lr=None):
        if lr == None:
            lr = self.default_lr
        try:
            sess.run(self.global_step)
        except:
            tf.logging.info("Initializing variables")
            sess.run(tf.global_variables_initializer())
            self.train_handle = sess.run(self.train_handle)
            self.valid_handle = sess.run(self.valid_handle)

        max_steps = sess.run(self.global_step) + num_steps

        tf.logging.info("Starting Training")

        while sess.run(self.global_step) < max_steps:
            _, loss = sess.run([self.min, self.loss], {self.handle: self.train_handle, self.lr: lr})
            step = sess.run(self.global_step)
            if step % 1000 == 0:
                tf.logging.info(str(step) + " " + str(loss))
                tf.logging.info("On validation")
                _, _, _, stop = self.test(sess, 8000, log=True, masking=self.masking)
                if stop: break
                tf.logging.info("")
                
    def test(self, sess, num_samples, log=False, iterator_handle=None, masking=True):
        vals = []
        for i in range(int(num_samples / self.batch_size)):
            vals.append(
                sess.run([self.deepsea_pred, self.predictions, self.x, self.y],
                         {self.handle: iterator_handle if iterator_handle else self.valid_handle})
            )
        deepsea = np.concatenate([v[0] for v in vals])
        preds = np.concatenate([v[1] for v in vals])
        truth = np.concatenate([v[3] for v in vals])
        if log and masking:
            mask = 1 - np.concatenate([v[2][:,19*2:] for v in vals]).reshape((-1))
            
            our_score = sklearn.metrics.roc_auc_score(truth.reshape((-1)), preds.reshape((-1)),   sample_weight=mask)
            deepsea_score = sklearn.metrics.roc_auc_score(truth.reshape((-1)), deepsea.reshape((-1)), sample_weight=mask)
            
            # Masked micro average
            tf.logging.info("Our AUC:     " + str(our_score))
            tf.logging.info("DeepSea AUC: " + str(deepsea_score))
            
#             return deepsea, preds, truth, our_score > deepsea_score
        
        return deepsea, preds, truth, False


In [8]:
class Logistic(Model):
    def body_fn(self):
        return tf.layers.dense(self.x, self.num_outputs, kernel_regularizer=tf.contrib.layers.l1_l2_regularizer(self.l1, self.l2))
    
class MLP(Model):
    def __init__(self,
             layers,
             num_units,
             activation,
             *args,
             **kwargs):

        self.layers = layers
        self.num_units = num_units
        self.activation = activation
        
        Model.__init__(self, *args, **kwargs)
            
    def body_fn(self):
        model = self.x
        
        if not isinstance(self.num_units, collections.Iterable):
            self.num_units = [self.num_units] * self.layers
            
        for i in range(self.layers):
            model = tf.layers.dense(model, self.num_units[i], self.activation, kernel_regularizer=tf.contrib.layers.l1_l2_regularizer(self.l1, self.l2))
            
        return tf.layers.dense(model, self.num_outputs, kernel_regularizer=tf.contrib.layers.l1_l2_regularizer(self.l1, self.l2))

        

# Training

In [9]:
logistic = Logistic(train_data, valid_data, 19, shuffle_size=1, generator_fn=imputation_fn)
logistic.train(sess, 10000, lr=1e-4)
imp8 = logistic.make_alt_dataset(sess, valid_data, 64, 1, 10, paramaterized_imputation_gen(random_mask(.8, 19), 3*19))
dnase = logistic.make_alt_dataset(sess, valid_data, 64, 1, 10, paramaterized_imputation_gen(dnase_mask, 3*19))
none = logistic.make_alt_dataset(sess, valid_data, 64, 1, 10, paramaterized_imputation_gen(random_mask(0, 19), 3*19))

# _ = logistic.test(sess, 40000, log=True)
# _ = logistic.test(sess, 40000, log=True, iterator_handle=imp8)
# _ = logistic.test(sess, 40000, log=True, iterator_handle=dnase)
# _ = logistic.test(sess, 40000, log=True, iterator_handle=none)

NameError: name 'imputation_fn' is not defined

In [500]:
# mlp = MLP(2, 20, tf.tanh, train_data, valid_data, 19, shuffle_size=1, generator_fn=imputation_fgen)
# mlp.train(sess, 20000, lr=1e-3)
# mlp.train(sess, 20000, lr=1e-4)
# mlp.train(sess, 20000, lr=1e-5)
# imp8 = mlp.make_alt_dataset(sess, valid_data, 64, 1, 10, paramaterized_imputation_gen(random_mask(.8, 19), 3*19))
# dnase = mlp.make_alt_dataset(sess, valid_data, 64, 1, 10, paramaterized_imputation_gen(dnase_mask, 3*19))
# _ = mlp.test(sess, 16000, log=True)
for _ in range(3):
    _ = mlp.test(sess, 16000, log=True, iterator_handle=imp8)

INFO:tensorflow:Our AUC:     0.9764405709451255
INFO:tensorflow:DeepSea AUC: 0.9692375728754375
INFO:tensorflow:Our AUC:     0.96106445226617
INFO:tensorflow:DeepSea AUC: 0.9558341948720454
INFO:tensorflow:Our AUC:     0.9749487263329542
INFO:tensorflow:DeepSea AUC: 0.9703153792822881


In [511]:
mlp = MLP(2, 20, tf.tanh, train_data, valid_data, 19, shuffle_size=1, generator_fn=paramaterized_imputation_gen(dnase_mask, 3*19))
mlp.train(sess, 20000, lr=1e-3)
# for i in range(19):
#     tf.logging.info(i)
#     mask = mlp.make_alt_dataset(sess, valid_data, 64, 1, 10, paramaterized_imputation_gen(position_mask([i]), 3*19))
#     _ = mlp.test(sess, 30000, log=True, iterator_handle=mask)

INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Initializing variables
INFO:tensorflow:Starting Training
INFO:tensorflow:1000 0.25679076
INFO:tensorflow:On validation
INFO:tensorflow:Our AUC:     0.9505799394538017
INFO:tensorflow:DeepSea AUC: 0.9659993520392898
INFO:tensorflow:
INFO:tensorflow:2000 0.0693297
INFO:tensorflow:On validation
INFO:tensorflow:Our AUC:     0.950915429979005
INFO:tensorflow:DeepSea AUC: 0.9696383207749399
INFO:tensorflow:
INFO:tensorflow:3000 0.9588029
INFO:tensorflow:On validation
INFO:tensorflow:Our AUC:     0.9611603753138751
INFO:tensorflow:DeepSea AUC: 0.9674656211640514
INFO:tensorflow:
INFO:tensorflow:4000 0.059683196
INFO:tensorflow:On validation
INFO:tensorflow:Our AUC:     0.9716469544871517
INFO:tensorflow:DeepSea AUC: 0.9804013362687495
INFO:tensorflow:
INFO:tensorflow:5000 0.26715124
INFO:tensorflow:On validation
INFO:tensorflow:Our 