# Predict Helix Capping Residues #

The goal is to identify the residues just before an alpha helix begins or the residues just after the helix ends. This will improve secondary structure predictors because they often extend helices too far or do not start them at the right place.

The CapsDB has annotated sequences of structures of helix capping residues that can be used to train a deep neural net. We will use a bidirectional LSTM with phi/psi features to see whether those features are good predictors.

In [1]:
import numpy as np
import pandas as pd
import math
import os

In [2]:
import tensorflow as tf
import keras
from keras.utils.io_utils import HDF5Matrix
from keras.layers import *# Input, Dense, Bidirectional, LSTM, Concatenate, CuDNNLSTM, Masking
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from keras.layers.normalization import BatchNormalization
from keras.utils import plot_model
import pickle

Using TensorFlow backend.


# Build generator for training data

In [17]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves protein chains from pickled files.

    Features are read from ``pickled_data/train_chains.pickle`` and labels
    from ``pickled_data/label_chains.pickle``. Each batch is sized from the
    length of its first chain, so chains of different lengths can only be
    served with ``batch_size=1``.
    """

    def __init__(self, batch_size=1, num_features=24, shuffle=True):
        """Load the pickled chains and build the initial sample order.

        Args:
            batch_size: Number of chains per batch.
            num_features: Width of the per-residue feature vector.
            shuffle: Whether to reshuffle the sample order every epoch.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        # Context managers make sure the file handles are closed after loading.
        with open("pickled_data/train_chains.pickle", "rb") as data_chain_in:
            self.train_data = pickle.load(data_chain_in)
        with open("pickled_data/label_chains.pickle", "rb") as label_chain_in:
            self.labels = pickle.load(label_chain_in)

        self.num_features = num_features
        self.dim = [None, num_features]
        self.batch_size = batch_size
        self.list_IDs = np.arange(len(self.labels))  # one ID per training chain
        self.shuffle = shuffle

        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        # Positions (in the possibly shuffled order) belonging to this batch.
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        # Map the positions back to sample IDs.
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Stack the chains named by ``list_IDs_temp`` into batch arrays.

        Returns:
            ``X`` with shape ``(n_samples, residues, num_features)`` and
            ``y`` with shape ``(n_samples, residues, 2)``.
        """
        # To intentionally overfit on a small data set, restrict the IDs:
        # list_IDs_temp = [ID % 10 for ID in list_IDs_temp]

        # The sequence length is taken from the first chain only, so every
        # chain in the batch must share it (always true for batch_size=1).
        residues = len(self.labels[list_IDs_temp[0]])
        n_samples = len(list_IDs_temp)

        X = np.empty((n_samples, residues, self.num_features))
        y = np.empty((n_samples, residues, 2), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            X[i,] = self.train_data[ID]
            y[i,] = self.labels[ID]

        return X, y

In [26]:
# pad/batch data generator
class DataGeneratorBatch(keras.utils.Sequence):
    """Keras ``Sequence`` that serves pre-built, zero-padded batches.

    Chains are read from ``pickled_data/train_chains_sorted.pickle`` and
    ``pickled_data/label_chains_sorted.pickle``, cut into consecutive chunks
    of ``batch_size`` chains, and every chunk is zero-padded to the length
    of its longest chain. Chains left over after the last full chunk are
    dropped.
    """

    def __init__(self, batch_size=1, num_features=24, shuffle=True):
        """Load the pickled chains and pre-compute all padded batches.

        Args:
            batch_size: Number of chains per batch.
            num_features: Width of the per-residue feature vector.
            shuffle: Whether to reshuffle the batch order every epoch.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        # Context managers make sure the file handles are closed after loading.
        with open("pickled_data/train_chains_sorted.pickle", "rb") as data_chain_in:
            self.train_data = pickle.load(data_chain_in)
        with open("pickled_data/label_chains_sorted.pickle", "rb") as label_chain_in:
            self.labels = pickle.load(label_chain_in)

        # set self vars
        self.num_features = num_features
        self.dim = [None, num_features]
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Split the lists of arrays into chunks of exactly batch_size chains.
        # (The slices previously used an undefined name `n`.)
        n_batch = len(self.train_data) // batch_size
        starts = range(0, n_batch * batch_size, batch_size)
        self.train_batch_list = [self.train_data[i:i + batch_size] for i in starts]
        self.label_batch_list = [self.labels[i:i + batch_size] for i in starts]

        # Zero-pad within each chunk and keep the resulting 3D arrays.
        self.train_batched = [
            self._pad_batch(chunk, num_features, float)
            for chunk in self.train_batch_list
        ]
        self.labels_batched = [
            self._pad_batch(chunk, 2, int) for chunk in self.label_batch_list
        ]

        self.list_IDs = np.arange(len(self.labels_batched))  # number of batches

        self.on_epoch_end()

    @staticmethod
    def _pad_batch(chunk, width, dtype):
        """Stack ``chunk`` into one zero-padded ``(n, max_len, width)`` array."""
        max_len = max(len(arr) for arr in chunk)
        padded = np.zeros((len(chunk), max_len, width), dtype=dtype)
        for row, arr in enumerate(chunk):
            padded[row, :arr.shape[0], :] = arr
        return padded

    def __len__(self):
        """Return the number of batches per epoch."""
        # list_IDs already counts batches, so it must not be divided by
        # batch_size a second time.
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        # Translate the position in the (possibly shuffled) order to a batch ID.
        batch_index = self.indexes[index]
        return self.__data_generation(batch_index)

    def on_epoch_end(self):
        """Reset the batch order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_index):
        """Return the padded feature and label arrays of one batch."""
        # Serve the padded arrays, not the raw per-chain lists they came from.
        X = self.train_batched[batch_index]
        y = self.labels_batched[batch_index]

        return X, y

# Build LSTM Classifier Model

In [19]:
def get_callbacks(model_obj=None, logging_file="training.log", verbosity=1, early_stopping_patience=None):
    """Return the list of Keras callbacks used during training.

    Only a TensorBoard callback writing to ``./logs`` is currently active.

    Args:
        model_obj: Intended argument for the disabled ``ModelCheckpoint``
            callback; currently unused, so it is optional.
        logging_file: Intended target of the disabled ``CSVLogger``
            callback; currently unused.
        verbosity: Currently unused.
        early_stopping_patience: Currently unused.

    Returns:
        A list containing the configured callbacks.
    """
    callbacks = []
    # Disabled callbacks, kept to show what the unused parameters are for:
    #callbacks.append(ModelCheckpoint(model_obj, save_best_only=True))
    #callbacks.append(CSVLogger(logging_file, append=True))
    #callbacks.append(LossHistory('losshistory.log', append=True))
    callbacks.append(TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=1, write_graph=False, write_grads=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, embeddings_data=None, update_freq='epoch'))
    return callbacks

In [18]:
def create_model(batch_size, num_features):
    """Build the bidirectional LSTM per-residue classifier.

    The network is a bidirectional CuDNN LSTM (100 units per direction,
    returning the full sequence) over variable-length inputs, followed by a
    time-distributed 2-unit dense layer and a softmax activation.

    Args:
        batch_size: Accepted for interface compatibility; not used here.
        num_features: Number of features per timestep.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    network = Sequential()

    # `units` is the dimensionality of the output space and is independent
    # of the number of timesteps, which is left open via `None`.
    recurrent = Bidirectional(
        CuDNNLSTM(units=100, return_sequences=True),
        input_shape=(None, num_features),
    )
    network.add(recurrent)

    # Per-timestep two-way classification head.
    network.add(TimeDistributed(Dense(2)))
    network.add(Activation('softmax'))

    return network


In [6]:
# Hyper-parameters handed to the model factory.
batch_size, num_features = 32, 24

# Build the network and configure it for training.
model = create_model(batch_size, num_features)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Display the model's current weight arrays (the notebook output follows).
model.get_weights()

[array([[ 0.05313232, -0.1363357 ,  0.06846244, ...,  0.16761883,
          0.02146317,  0.3163824 ],
        [ 0.06232766,  0.15247647, -0.16668853, ..., -0.11090114,
          0.28067577,  0.03881414],
        [-0.00549035,  0.01266282, -0.00971651, ...,  0.04193433,
          0.232213  ,  0.04238088],
        ...,
        [ 0.15277345,  0.08952329,  0.03367085, ..., -0.00629699,
         -0.2310982 ,  0.2460077 ],
        [ 0.17680295,  0.01072129,  0.17268078, ..., -0.03949181,
          0.18719561,  0.14881782],
        [ 0.07855725, -0.28328156,  0.13491125, ..., -0.12565579,
          0.22623992, -0.14254798]], dtype=float32),
 array([[-0.1358306 ,  0.07451328, -0.21504831, ..., -0.06372697,
          0.20877731,  0.26139355],
        [-0.1187883 , -0.07652966,  0.05265986, ..., -0.17445396,
          0.02563745,  0.13353227],
        [ 0.02180309,  0.01533592, -0.05264511, ..., -0.03236129,
          0.15429991,  0.06705286],
        ...,
        [-0.12803239, -0.00362704,  0.0

In [None]:
# Create the training data generator with its default settings.
training_generator = DataGenerator()

# Fit the model from the generator for 150 epochs, with the callbacks attached.
model.fit_generator(
    generator=training_generator,
    epochs=150,
    callbacks=get_callbacks(),
)

In [31]:
from keras.models import load_model

# Persist the model to 'batchedBiLSTM.h5' in the working directory.
model.save('batchedBiLSTM.h5') 

In [47]:
# Evaluate on the training generator itself, not on held-out data; the output
# below lists the compiled loss followed by the accuracy metric.
model.evaluate_generator(training_generator)

[0.3249633530141508, 0.8752696192669378]

In [58]:
# Wrap the first chain in a leading batch axis so it can be fed to the model.
# The chain length and feature width are taken from the data itself rather
# than being hard-coded as (106, 24).
# NOTE(review): `train` is not defined in the visible cells; presumably it is
# the list of per-chain feature arrays. Confirm before running.
temp = np.expand_dims(np.asarray(train[0], dtype=float), axis=0)

In [64]:
# Run the model on the single-chain batch held in `temp`.
pred = model.predict(temp)

In [67]:
# Display the softmax outputs: one pair of class probabilities per timestep.
pred

array([[[0.995046  , 0.004954  ],
        [0.99083483, 0.00916519],
        [0.98009884, 0.01990118],
        [0.9578037 , 0.04219631],
        [0.93834233, 0.06165768],
        [0.93799746, 0.06200255],
        [0.95864457, 0.04135537],
        [0.93111336, 0.06888666],
        [0.93199545, 0.0680045 ],
        [0.87351197, 0.12648803],
        [0.9527731 , 0.04722686],
        [0.9760252 , 0.02397476],
        [0.9847578 , 0.01524222],
        [0.9735141 , 0.02648599],
        [0.96789414, 0.03210581],
        [0.97861576, 0.0213842 ],
        [0.96157277, 0.03842722],
        [0.95487994, 0.04512003],
        [0.9394409 , 0.06055906],
        [0.9418572 , 0.05814282],
        [0.97939336, 0.02060668],
        [0.95794684, 0.04205323],
        [0.9358441 , 0.06415593],
        [0.93786174, 0.06213827],
        [0.94631773, 0.05368228],
        [0.9562809 , 0.04371911],
        [0.939126  , 0.06087395],
        [0.9660818 , 0.03391821],
        [0.9748062 , 0.02519377],
        [0.971

In [72]:
# Count the timesteps whose most probable class is index 1; a result of 0
# means every timestep was assigned class index 0.
np.sum(np.argmax(pred, axis=2))

0