# Predict Helix Capping Residues #

The goal is to identify the residues just before an alpha helix begins or the residues just after the helix ends. This will improve secondary structure predictors because they often extend a helix too far or do not start it at the right place.

The CapsDB has annotated sequences of structures of helix capping residues that can be used to train a deep neural net. We will use a Bidirectional LSTM with phi/psi features to see if those will be good predictors.

In [3]:
import numpy as np
import pandas as pd
import math
import os

# Build generator for training data

In [70]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves pickled chains as training batches.

    The features are read from ``pickled_data/train_chains.pickle`` and the
    labels from ``pickled_data/label_chains.pickle``. Both are indexed by the
    same integer ID (position in the pickled collection).

    Args:
        batch_size: Number of chains per batch. All chains placed in one
            batch must have the same number of residues, so values above 1
            only work when the data satisfies that.
        num_features: Size of the feature vector of each residue.
        shuffle: When truthy, the sample order is reshuffled every epoch.
    """

    def __init__(self, batch_size=1, num_features=24, shuffle=True):
        """Load the pickled data and prepare the first epoch's ordering."""
        # Local import: none of the visible import cells bring `pickle` in.
        import pickle

        super().__init__()

        # NOTE(review): pickle.load can execute arbitrary code; only use
        # pickle files that were produced by a trusted pipeline.
        with open("pickled_data/train_chains.pickle", "rb") as data_chain_in:
            self.train_data = pickle.load(data_chain_in)
        with open("pickled_data/label_chains.pickle", "rb") as label_chain_in:
            self.labels = pickle.load(label_chain_in)

        self.num_features = num_features
        self.dim = [None, num_features]
        self.batch_size = batch_size
        # One ID per labelled chain, i.e. the size of the training set.
        self.list_IDs = np.arange(len(self.labels))
        self.shuffle = shuffle

        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        # Floor division drops a trailing, incomplete batch.
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        # Map positions in the (possibly shuffled) order back to sample IDs.
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when shuffling is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Stack the chains named by ``list_IDs_temp`` into batch arrays.

        Returns:
            ``X`` with shape ``(n_samples, residues, num_features)`` and
            ``y`` with shape ``(n_samples, residues, 1)`` and integer dtype.

        Raises:
            ValueError: If the chains in the batch differ in length; they
                cannot be stacked into one dense array without padding.
        """
        n_samples = len(list_IDs_temp)
        # The first chain fixes the sequence length for the whole batch.
        residues = len(self.labels[list_IDs_temp[0]])

        for ID in list_IDs_temp[1:]:
            if len(self.labels[ID]) != residues:
                raise ValueError(
                    "All chains in a batch must have the same number of "
                    "residues; use batch_size=1 for variable-length chains."
                )

        X = np.empty((n_samples, residues, self.num_features))
        y = np.empty((n_samples, residues, 1), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            X[i,] = self.train_data[ID]
            y[i,] = self.labels[ID]

        return X, y

# Build LSTM Classifier Model

In [4]:
import tensorflow as tf
import keras
from keras.utils.io_utils import HDF5Matrix
from keras.layers import *# Input, Dense, Bidirectional, LSTM, Concatenate, CuDNNLSTM, Masking
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from keras.layers.normalization import BatchNormalization
from keras.utils import plot_model

Using TensorFlow backend.


In [73]:
def create_model(batch_size, num_features):
    """Build a bidirectional LSTM that scores every timestep of a sequence.

    Args:
        batch_size: Not used by the model definition; kept so existing
            callers keep working.
        num_features: Number of input features per timestep.

    Returns:
        An uncompiled ``Sequential`` model that maps an input of shape
        ``(batch, timesteps, num_features)`` to one value in ``[0, 1]`` per
        timestep.
    """
    model = Sequential()
    # `input_shape` belongs on the wrapper (the first layer of the model) so
    # that the model can be built immediately; `None` leaves the number of
    # timesteps variable.
    model.add(Bidirectional(
        CuDNNLSTM(units=100,  # output dimensionality, independent of # timesteps
                  return_sequences=True),
        input_shape=(None, num_features),
    ))
    model.add(TimeDistributed(Dense(1)))
    # Sigmoid, not softmax: softmax over a single unit is always exactly 1.0,
    # which makes the output constant and the gradient zero.
    model.add(Activation('sigmoid'))
    return model


In [7]:
# Settings handed to create_model
batch_size = 1
num_features = 24

# Build the network, then attach the loss, optimizer and reported metric
model = create_model(batch_size, num_features)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [74]:
# Create the batch generator with its default arguments
training_generator = DataGenerator()

# Train on the generator's batches, leaving every other fit option at its default
model.fit_generator(training_generator)

Epoch 1/1


<keras.callbacks.History at 0x7efc82fed9b0>

In [72]:
# Display the model's current weight arrays in the cell output
model.get_weights()

[array([[ 0.00854491, -0.0161927 , -0.0601163 , ..., -0.09773389,
         -0.11893874,  0.0553289 ],
        [-0.10674282, -0.07672422,  0.09192923, ...,  0.06227181,
         -0.03067157, -0.03693574],
        [ 0.04261782,  0.05634183, -0.11564188, ...,  0.10435071,
         -0.00097653,  0.09903456],
        ...,
        [-0.05936329,  0.10208074,  0.08211298, ...,  0.02234525,
          0.07386395,  0.01436683],
        [-0.05875681, -0.04208422, -0.04683439, ...,  0.07434059,
         -0.03841694,  0.00487308],
        [-0.07576571, -0.09590036, -0.06673967, ...,  0.01174287,
         -0.01837976, -0.11706314]], dtype=float32),
 array([[-0.005871  ,  0.01902355, -0.01430513, ...,  0.01031065,
         -0.01314655, -0.00064888],
        [-0.00135805,  0.02088111,  0.01959058, ..., -0.05063406,
         -0.00305159, -0.01162037],
        [-0.01150192,  0.00695272,  0.00809403, ...,  0.10554451,
         -0.07422917, -0.011684  ],
        ...,
        [ 0.00197234, -0.05778326,  0.0