In [47]:
import os
import pandas
import numpy
import utils
import tensorflow as tf
import tflearn
from amino_acid import Data

tensorboard_dir = '/tmp/tflearn_logs'
path_to_data = os.path.dirname(os.getcwd()) + '/mhcPreds/data/'

train_dat = Data(path_to_data + 'train.txt', allele='HLA-A-0101')
test_dat = Data(path_to_data + 'test.txt', allele='HLA-A*01:01')

kmer, aff_kmer, idx_kmer = train_dat.kmer_index_encoding(kmer_size=9)
aff_kmer = utils.ic50_to_regression_target(aff_kmer, max_ic50=50000)

kmer_test, aff_kmer_test, idx_kmer_test = test_dat.kmer_index_encoding(kmer_size=9)
aff_kmer_test = utils.ic50_to_regression_target(aff_kmer_test, max_ic50=50000)


xTr, xTe, yTr, yTe = kmer, kmer_test, aff_kmer, aff_kmer_test

yTr = numpy.reshape(yTr, (yTr.shape[0], 1))
yTe = numpy.reshape(yTe, (yTe.shape[0], 1))


print(xTr.shape, xTe.shape, yTr.shape, yTe.shape)

(11537, 9) (2215, 9) (11537, 1) (2215, 1)


In [59]:
import collections

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(20 - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

In [60]:
as_lists = [list(i) for i in train_dat.peptides]
data, count, dictionary, reverse_dictionary = build_dataset(train_dat.peptides)

In [64]:
reverse_dictionary

{0: 'UNK',
 1: 'VWINNSWKF',
 2: 'HCSQVFLKM',
 3: 'DPKNWWHIL',
 4: 'ALPPPPPPP',
 5: 'EISTNIRQA',
 6: 'EKEVVPDFY',
 7: 'YFTFDLTAL',
 8: 'IFFASFYYI',
 9: 'ISEPTIHLV',
 10: 'KYLYFIKGL',
 11: 'AWIDNYNKF',
 12: 'IRLRPGGKK',
 13: 'VTYNCCDDDY',
 14: 'MFSPIVPFW',
 15: 'HSAEALQKY',
 16: 'YEVPAALIL',
 17: 'ELDEIGEDV',
 18: 'FHSRFVQAL',
 19: 'DTFGVIDTM'}

In [48]:
x = tf.placeholder(shape=(None, 9), dtype=tf.float32)
y_ = tf.placeholder(shape=(None, 1), dtype=tf.float32)
keep_prob = tf.placeholder(tf.float32)

batch_size = 75
epochs = 700
lr = 0.001

net = tflearn.input_data(placeholder=x)
net = tflearn.embedding(net, input_dim=21, output_dim=32, weights_init='xavier')
net = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(32), tflearn.BasicLSTMCell(32))
net = tflearn.dropout(net, 0.2)
net = tflearn.layers.normalization.batch_normalization(net)
net = tflearn.dropout(net, 0.1)
net = tflearn.fully_connected(net, 1, activation='sigmoid')

loss = tf.reduce_mean(tf.square(net - y_))
train_op = tf.train.RMSPropOptimizer(lr).minimize(loss)
accuracy = tf.contrib.metrics.streaming_root_mean_squared_error(net, y_)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    tflearn.is_training(True, session=sess)

    for step in range(epochs):
        total_batch = int(xTr.shape[0] / batch_size)

        for i in range(total_batch):
            batch_x, batch_y = utils.get_batch2d(xTr, yTr, batch_size)
            # batch_y = numpy.reshape(batch_y, (batch_x.shape[0], 1))

            sess.run(train_op, feed_dict={x: batch_x, y_: batch_y})

        if step % 10 == 0:
            # Calculate batch loss and accuracy
            # loss= sess.run(loss, feed_dict={x: batch_x, y_: batch_y})
            acc = sess.run(accuracy, feed_dict={x: batch_x, y_: batch_y})

            print ("Iter " + str(step * batch_size) + ", Training RMSE " + str(acc))

    tflearn.is_training(False, session=sess)
    acc = sess.run([accuracy], feed_dict={x: xTe, y_: yTe})
    print('Testing RMSE:' + str(acc))
    preds = sess.run(net, feed_dict={x: xTe})



Iter 0, Training RMSE (0.0, 0.35718116)
Iter 750, Training RMSE (0.35718116, 0.29317763)
Iter 1500, Training RMSE (0.29317763, 0.26485693)
Iter 2250, Training RMSE (0.26485693, 0.24267817)
Iter 3000, Training RMSE (0.24267817, 0.23147383)
Iter 3750, Training RMSE (0.23147383, 0.22322008)
Iter 4500, Training RMSE (0.22322008, 0.21512811)
Iter 5250, Training RMSE (0.21512811, 0.21200696)
Iter 6000, Training RMSE (0.21200696, 0.20870936)
Iter 6750, Training RMSE (0.20870936, 0.20284314)
Iter 7500, Training RMSE (0.20284314, 0.1991726)
Iter 8250, Training RMSE (0.1991726, 0.19856678)
Iter 9000, Training RMSE (0.19856678, 0.19605063)
Iter 9750, Training RMSE (0.19605063, 0.1938832)
Iter 10500, Training RMSE (0.1938832, 0.19139323)
Iter 11250, Training RMSE (0.19139323, 0.18948692)
Iter 12000, Training RMSE (0.18948692, 0.19026059)
Iter 12750, Training RMSE (0.19026059, 0.18889983)
Iter 13500, Training RMSE (0.18889983, 0.18752107)
Iter 14250, Training RMSE (0.18752107, 0.18637794)
Iter 1500

In [46]:
idx_kmer[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

In [50]:
preds_ = numpy.array([utils.regression_target_to_ic50(i[0]) for i in preds])
targs_ = numpy.array([utils.regression_target_to_ic50(i[0]) for i in yTe])
print(targs.shape, preds.shape)
print(utils.make_scores(targs_, preds_))
print(pandas.DataFrame([preds_, targs_]).T)


(2215,) (2215, 1)
{'f1': 0.15902140672782875, 'auc': 0.822149496561144, 'tau': 0.38979771345291786}
                 0             1
0     13015.334351   1540.276452
1      5946.535182   1540.276452
2      4340.378490   1540.276452
3      8505.968304   1540.276452
4      7643.574561   1540.276452
5      5281.672692   1540.276452
6      6321.121266   1540.276452
7     25575.574109   1540.276452
8     14901.037929  13599.816872
9     18351.959054  13599.816872
10    18536.320857  13599.816872
11    16790.411299  13599.816872
12    18106.040977  13599.816872
13    17836.419274  13599.816872
14    18916.884323  13599.816872
15    26683.457229  13599.816872
16     1394.943069  31449.651043
17     1491.331108  31449.651043
18     4405.983168  31449.651043
19     6009.022311  31449.651043
20     6153.519629  31449.651043
21     7807.458462  31449.651043
22     5894.834730  31449.651043
23    14697.488858  31449.651043
24      361.732573   1836.812966
25     1548.005107   1836.812966
26     89

In [20]:
import pandas
preds = [utils.regression_target_to_ic50(i) for i in preds]
targs = [utils.regression_target_to_ic50(i[0]) for i in yTe]
pandas.DataFrame([preds, targs]).T

Unnamed: 0,0,1
0,0.000000e+00,0.000000e+00
1,0.000000e+00,0.000000e+00
2,0.000000e+00,0.000000e+00
3,0.000000e+00,0.000000e+00
4,0.000000e+00,0.000000e+00
5,0.000000e+00,0.000000e+00
6,0.000000e+00,0.000000e+00
7,0.000000e+00,0.000000e+00
8,0.000000e+00,0.000000e+00
9,0.000000e+00,0.000000e+00


In [15]:
preds

array([ 110.37532216,   85.19675418,  105.04591578, ...,  888.75762169,
         89.26702684,  131.39279915])

In [18]:
import pandas

preds = [regression_target_to_ic50(i) for i in preds]
targs = [regression_target_to_ic50(i[0]) for i in yTe]
pandas.DataFrame([preds, targs]).T


NameError: name 'regression_target_to_ic50' is not defined

In [130]:
tf.reduce_mean(tf.square(10 - 1))
a = tf.train.RMSPropOptimizer(0.1).minimize(loss)
a.

<tf.Operation 'RMSProp_6' type=NoOp>

In [163]:
from amino_acid import Data
import os
import utils
import numpy
import tensorflow as tf
import tflearn
import pandas


class TFLearnPepPred(object):
    '''
    seq2seq recurrent neural network, implemented using TFLearn.
    '''
    AVAILABLE_MODELS = ["embedding_rnn", "embedding_attention"]

    def __init__(self, allele=None, kmer_size=9, batch_size=64, learning_rate=0.001, verbose=None, data_dir=None):

        self.path_to_data = os.path.dirname(os.getcwd()) + '/mhcPreds/data/'
        self.xTr, self.xTe, self.yTr, self.yTe = self.generate_train_test_data(allele=allele, kmer_size=kmer_size)
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.verbose = verbose or 0
        self.data_dir = data_dir

    def generate_train_test_data(self, allele=None, kmer_size=9):

        train_dat = Data(self.path_to_data + 'train.txt', allele=allele)
        test_dat = Data(self.path_to_data + 'test.txt', allele=allele)

        kmer, aff_kmer, idx_kmer = train_dat.kmer_index_encoding(kmer_size=9)
        aff_kmer = utils.ic50_to_regression_target(aff_kmer, max_ic50=50000)

        kmer_test, aff_kmer_test, idx_kmer_test = test_dat.kmer_index_encoding(kmer_size=kmer_size)
        aff_kmer_test = utils.ic50_to_regression_target(aff_kmer_test, max_ic50=50000)

        xTr, xTe, yTr, yTe = kmer, kmer_test, aff_kmer, aff_kmer_test

        yTr = numpy.reshape(yTr, (yTr.shape[0], 1))
        yTe = numpy.reshape(yTe, (yTe.shape[0], 1))

        return xTr, xTe, yTr, yTe

    def optimizer(self):
        opt =  tflearn.RMSProp(learning_rate=0.001, decay=0.9)
        return opt

    def loss_func(self, y_pred, y_true):
        return tf.reduce_mean(tf.square(y_pred - y_true))

    def accuracy(self, y_pred, y_true):
        return tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y_pred, y_true))))
    
    def l1_norm(prediction, target, inputs):
        return tf.reduce_sum(tf.abs(prediction - target), name='l1')
                       
    def model(self, type=None, mode="train", num_layers=2, state_size=32, learning_rate=0.0001, tensorboard_verbose=3):

        net = tflearn.input_data(shape=[None, 9])
        net = tflearn.embedding(net, input_dim=21, output_dim=32, weights_init='xavier')

        if type == 'bi_rnn':
            out_rnn = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(32), tflearn.BasicLSTMCell(32))
            
        elif type == 'basic_lstm':
            out_rnn = tflearn.lstm(net, 40)

        elif type =='basic_rnn':
            out_rnn = tflearn.simple_rnn(net, 40)

        else:
            out_rnn = net

        net = tflearn.fully_connected(out_rnn, 100, activation='prelu')
        net = tflearn.layers.normalization.batch_normalization(net)
        net = tflearn.dropout(net, 0.1)
        net = tflearn.fully_connected(net, 1, activation='sigmoid')

        """
        single_cell = getattr(tf.contrib.rnn, cell_type)(cell_size, state_is_tuple=True)

        if num_layers == 1:
            cell = single_cell
        else:
            cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers)
        """

        with tf.name_scope("TargetsData"):  # placeholder for target variable (i.e. trainY input)
            targetY = tf.placeholder(shape=[None, 1], dtype=tf.float32, name="Y")

        network = tflearn.regression(net,
                                     placeholder=targetY,
                                     optimizer=self.optimizer(),
                                     learning_rate=learning_rate,
                                     loss=self.loss_func(net, targetY),
                                     metric=self.accuracy(net, targetY),
                                     name="Y")

        model = tflearn.DNN(network, tensorboard_verbose=tensorboard_verbose)
        return model

    def train(self, model, num_epochs=20, num_points=100000, model_params=None, weights_input_fn=None,
              validation_set=0.1, snapshot_step=5000,  weights_output_fn=None):
        '''
        Train model, with specified number of epochs, and dataset size.
        Use specified model, or create one if not provided.  Load initial weights from file weights_input_fn,
        if provided. validation_set specifies what to use for the validation.
        Returns logits for prediction, as an numpy array of shape [out_seq_len, n_output_symbols].
        '''

        model.fit(self.xTr, self.yTr,
                  n_epoch=num_epochs,
                  validation_set=validation_set,
                  batch_size=self.batch_size,
                  shuffle=True,
                  show_metric=True,
                  snapshot_step=snapshot_step,
                  snapshot_epoch=False,
                  run_id="TFLearnSeq2Seq"
                  )
        print ("Done!")
        return model

    def predict(self, model):
        '''
        Make a prediction, using the seq2seq model, for the given input sequence Xin.
        If model is not provided, create one (or use last created instance).
        Return prediction, y
        prediction = array of integers, giving output prediction.  Length = out_seq_len
        y = array of shape [out_seq_len, out_max_int], giving logits for output prediction
        '''

        res = model.predict(self.xTr)
        if self.verbose > 1: print ("prediction shape = %s" % str(res.shape))

        if self.verbose:
            print ("Predicted output sequence: %s" % str(res))

        return res

    def scoring(self, preds):

        preds = numpy.array([utils.regression_target_to_ic50(i[0]) for i in preds])
        targs = numpy.array([utils.regression_target_to_ic50(i[0]) for i in self.yTe])
        scores = utils.make_scores(targs, preds)
        as_df = pandas.DataFrame([preds, targs])

        return scores, as_df

In [157]:
md.accuracy(10,11)

(<tf.Tensor 'Sqrt_14:0' shape=() dtype=float32>,
 <tf.Tensor 'Sqrt_15:0' shape=() dtype=float32>)

In [147]:
md.optimizer

<bound method TFLearnPepPred.optimizer of <__main__.TFLearnPepPred object at 0x12d99fac8>>

In [164]:
md = TFLearnPepPred(allele=None, kmer_size=9, batch_size=64, learning_rate=0.001, verbose=3, data_dir=None)
model = md.model(type='basic_rnn', mode="train", num_layers=2, state_size=10)
trained = md.train(model)

IndexError: list index out of range

IndexError: list index out of range