# Predict Helix Capping Residues #

The goal is to identify the residues just before an alpha helix begins or just after the helix ends. This will improve secondary structure predictors because they often extend helices too far or do not start them at the right place. 

The CapsDB has annotated sequences of structures of helix capping residues that can be used to train a deep neural net. We will use a Bidirectional LSTM with phi/psi features to see whether those features are good predictors.

In [3]:
import numpy as np
import pandas as pd
import math
import os

In [4]:
import tensorflow as tf
import keras
from keras.utils.io_utils import HDF5Matrix
from keras.layers import *# Input, Dense, Bidirectional, LSTM, Concatenate, CuDNNLSTM, Masking
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from keras.layers.normalization import BatchNormalization
from keras.utils import plot_model
import pickle

Using TensorFlow backend.


# Build generator for training data

In [51]:
def split_data(frac_train=0.75, sort=False):
    """Randomly split the pickled chains into train and test partitions on disk.

    Reads three parallel lists from ``pickled_data/`` (``train_chains``,
    ``label_chains`` and ``laglabel_chains``), shuffles the observation
    indices once with ``np.random.shuffle`` and writes six files to
    ``pickled_data/staged_data/``: ``{train,test}_{data,labels,labelslag}.pickle``.
    The same shuffled indices are applied to all three lists, so entries stay
    aligned across the output files.

    Args:
        frac_train: Fraction of observations assigned to the train partition.
            The train size is ``floor(n_obs * frac_train)``; the remainder
            becomes the test partition.
        sort: Unused by this function; kept so existing callers keep working.

    Returns:
        None. The result is the set of files written to disk.
    """
    def _load(path):
        # Context manager guarantees the handle is closed even if unpickling fails.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    sources = {
        "data": _load("pickled_data/train_chains.pickle"),
        "labels": _load("pickled_data/label_chains.pickle"),
        "labelslag": _load("pickled_data/laglabel_chains.pickle"),
    }

    n_obs = len(sources["labels"])
    n_train = int(np.floor(n_obs * frac_train))
    list_IDs = np.arange(n_obs)
    np.random.shuffle(list_IDs)
    partition_ids = {
        "train": list_IDs[:n_train].astype(int),
        "test": list_IDs[n_train:].astype(int),
    }

    # Create the output directory up front so a fresh checkout does not fail
    # on the first write.
    os.makedirs("pickled_data/staged_data", exist_ok=True)

    for typ in ["train", "test"]:
        for dataset in ["data", "labels", "labelslag"]:
            name = "{}_{}".format(typ, dataset)
            datalist = [sources[dataset][i] for i in partition_ids[typ]]
            with open("pickled_data/staged_data/{}.pickle".format(name), "wb") as pickle_out:
                pickle.dump(datalist, pickle_out)


In [56]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves (features, labels) batches for one partition.

    The data and labels are unpickled from
    ``pickled_data/staged_data/{partition}_data.pickle`` and
    ``pickled_data/staged_data/{partition}_labels.pickle``.

    Every sample in a batch must have the same number of residues because no
    padding is done here; with the default ``batch_size=1`` that is always
    true.
    """

    def __init__(self, partition, batch_size=1, num_features=24, shuffle=True):
        """Load the staged partition and prepare the first epoch's ordering.

        Args:
            partition: Name used to build the pickle file names (the notebook
                uses "train" and "test").
            batch_size: Number of chains per batch.
            num_features: Size of the last axis of each feature array.
            shuffle: When truthy, the sample order is reshuffled every epoch.
        """
        data_path = "pickled_data/staged_data/{}_data.pickle".format(partition)
        label_path = "pickled_data/staged_data/{}_labels.pickle".format(partition)
        # Context managers close the handles even if unpickling raises.
        with open(data_path, "rb") as data_chain_in:
            self.data = pickle.load(data_chain_in)
        with open(label_path, "rb") as label_chain_in:
            self.labels = pickle.load(label_chain_in)

        self.num_features = num_features
        self.dim = [None, num_features]
        self.batch_size = batch_size
        self.list_IDs = np.arange(len(self.labels))  # one ID per sample
        self.shuffle = shuffle

        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (any remainder is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` tuple."""
        # Positions of this batch inside the (possibly shuffled) ordering
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into sample IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Rebuild the sample ordering, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Stack the samples named by ``list_IDs_temp`` into dense arrays.

        Returns:
            ``X`` with shape ``(n_samples, residues, num_features)`` and ``y``
            with shape ``(n_samples, residues, 2)`` and integer dtype.

        Raises:
            ValueError: If the samples in the batch have different lengths.
        """
        # Set IDs < 10 to intentionally overfit on a small data set
        #list_IDs_temp = [ID % 10 for ID in list_IDs_temp]

        residues = len(self.labels[list_IDs_temp[0]])
        # No padding is done here, so a ragged batch cannot be stacked; fail
        # with an explicit message instead of a NumPy broadcasting error.
        for ID in list_IDs_temp[1:]:
            if len(self.labels[ID]) != residues:
                raise ValueError(
                    "DataGenerator cannot batch chains of different lengths; "
                    "use batch_size=1 or a padding generator"
                )

        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, residues, self.num_features))
        y = np.empty((n_samples, residues, 2), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            X[i,] = self.data[ID]
            y[i,] = self.labels[ID]

        return X, y

In [None]:
# pad/batch data generator
class DataGeneratorBatch(keras.utils.Sequence):
    """Keras ``Sequence`` that serves zero-padded batches of whole chains.

    Chains and labels are unpickled from ``pickled_data/train_chains_sorted.pickle``
    and ``pickled_data/label_chains_sorted.pickle``, cut into consecutive
    chunks of ``batch_size`` and padded with zeros to the longest chain in
    each chunk. Chains left over after the last full chunk are dropped.

    NOTE(review): padded time steps get all-zero feature and label rows;
    confirm the model and loss are meant to see them (no masking is applied
    here).
    """

    def __init__(self, batch_size=1, num_features=24, shuffle=True):
        """Load the sorted chains and pre-compute every padded batch.

        Args:
            batch_size: Number of chains per batch.
            num_features: Size of the last axis of each feature array.
            shuffle: When truthy, the batch order is reshuffled every epoch.
        """
        # Read in data; context managers close the handles even on failure
        with open("pickled_data/train_chains_sorted.pickle", "rb") as data_chain_in:
            self.train_data = pickle.load(data_chain_in)
        with open("pickled_data/label_chains_sorted.pickle", "rb") as label_chain_in:
            self.labels = pickle.load(label_chain_in)

        # Set self vars
        self.num_features = num_features
        self.dim = [None, num_features]
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Split lists of arrays into batch chunks (full chunks only)
        n_batch = len(self.train_data) // batch_size
        starts = range(0, n_batch*batch_size, batch_size)
        self.train_batch_list = [self.train_data[i:i + batch_size] for i in starts]
        self.label_batch_list = [self.labels[i:i + batch_size] for i in starts]

        # Pad within each chunk and keep the resulting 3D arrays
        self.train_batched = [self._pad_batch(chunk, num_features, float)
                              for chunk in self.train_batch_list]
        self.labels_batched = [self._pad_batch(chunk, 2, int)
                               for chunk in self.label_batch_list]

        self.list_IDs = np.arange(len(self.labels_batched))  # number of batches

        self.on_epoch_end()

    def _pad_batch(self, chunk, width, dtype):
        """Zero-pad the arrays in ``chunk`` to a ``(batch_size, max_len, width)`` array."""
        max_len = max(len(arr) for arr in chunk)
        batch_array = np.zeros((self.batch_size, max_len, width), dtype=dtype)
        for j, arr in enumerate(chunk):
            batch_array[j, :arr.shape[0], :] = arr
        return batch_array

    def __len__(self):
        """Return the number of batches per epoch."""
        # list_IDs already counts batches, so it must not be divided by
        # batch_size again (that silently dropped most batches).
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Return batch number ``index`` (in the current epoch order) as ``(X, y)``."""
        batch_index = self.indexes[index]
        return self.__data_generation(batch_index)

    def on_epoch_end(self):
        """Rebuild the batch ordering, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_index):
        """Return the pre-padded feature and label arrays for ``batch_index``."""
        # Serve the padded 3D arrays; the *_batch_list attributes hold ragged
        # Python lists that cannot be fed to the model as one tensor.
        X = self.train_batched[batch_index]
        y = self.labels_batched[batch_index]

        return X, y

# Create train/test partition files on disk

In [53]:
# Write an 80/20 train/test split to pickled_data/staged_data/.
# The shuffle inside split_data is unseeded, so each run produces a new split.
split_data(0.8)

# Build LSTM Classifier Model

In [90]:
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, balanced_accuracy_score

class Metrics(Callback):
    """Keras callback that scores the model on the "test" partition after every epoch.

    The per-epoch values returned by ``eval_model(..., metric="all")`` are
    appended to ``val_accuracy``, ``val_balanced_accuracy_score`` and
    ``val_f1s`` (in that order) and printed.
    """

    def on_train_begin(self, logs=None):
        """Reset the metric histories at the start of training."""
        self.val_accuracy = []
        self.val_balanced_accuracy_score = []
        self.val_f1s = []

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the test partition, record the metrics and print them."""
        metvals = eval_model(self.model, partition="test", metric="all", verbose=False)

        self.val_accuracy.append(metvals[0])
        self.val_balanced_accuracy_score.append(metvals[1])
        self.val_f1s.append(metvals[2])

        # metvals is a 3-item list; it must be unpacked to fill the three
        # placeholders (passing the list itself raises IndexError).
        print(" — accuracy: {} — bal_accuracy: {} — f1: {}".format(*metvals))
        return

#metrics = Metrics()

def get_callbacks(logging_file="training3.log", verbosity=1, early_stopping_patience=None):
    """Return the training callbacks: a ``Metrics`` instance, then a ``TensorBoard`` logger.

    The three parameters are accepted but not used by the body.
    """
    epoch_metrics = Metrics()
    tensorboard = TensorBoard(
        log_dir='./logs',
        histogram_freq=0,
        batch_size=1,
        write_graph=False,
        write_grads=True,
        write_images=False,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None,
        embeddings_data=None,
        update_freq='epoch',
    )
    return [epoch_metrics, tensorboard]

In [91]:
def create_model(batch_size, num_features):
    """Build the uncompiled bidirectional LSTM classifier.

    The network is a ``Bidirectional`` wrapper around a 100-unit ``CuDNNLSTM``
    that returns the full sequence, followed by a time-distributed 2-unit
    ``Dense`` layer and a softmax activation. The input shape is
    ``(None, num_features)``, so the sequence length is left unspecified.

    ``batch_size`` is accepted but not used by the body.
    """
    net = Sequential()
    stack = (
        Bidirectional(
            CuDNNLSTM(units=100, return_sequences=True),
            input_shape=(None, num_features),
        ),
        TimeDistributed(Dense(2)),
        Activation('softmax'),
    )
    for layer in stack:
        net.add(layer)
    return net


In [92]:
# Settings passed to create_model; they match the DataGenerator defaults
# (batch_size=1, num_features=24).
batch_size=1
num_features=24

# Build the network and compile it with the Adam optimizer, binary
# cross-entropy loss and accuracy as the reported metric.
model = create_model(batch_size, num_features)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Display the model's current weight arrays.
model.get_weights()

In [93]:
from keras.models import *

# Instantiate one generator per staged partition (both read the pickles
# written by split_data and use the default batch_size=1).
training_generator = DataGenerator(partition="train")
test_generator = DataGenerator(partition="test")
#training_generator = DataGeneratorBatch()

# Train model for 150 epochs; the test generator is passed as validation
# data, and get_callbacks() adds the per-epoch Metrics and TensorBoard logging.
model.fit_generator(generator=training_generator, epochs=150, callbacks=get_callbacks(), validation_data=test_generator)

# Persist the trained model to an HDF5 file in the working directory.
model.save('batchedBiLSTM_split2.h5') 


FileNotFoundError: [Errno 2] No such file or directory: 'pickled_data/staged_data/{}_labels.pickle'

In [59]:
from keras.models import *
# Replace the in-memory model with one loaded from disk.
# NOTE(review): the training cell saves 'batchedBiLSTM_split2.h5' but this
# loads 'batchedBiLSTM_split.h5' — confirm which file is intended.
model = load_model('batchedBiLSTM_split.h5')

In [None]:
# Display the weight arrays of the model currently bound to `model`.
model.get_weights()

In [None]:
#model.summary()
from keras.models import *

# Instantiate generator
# DataGeneratorTest is not defined in this notebook; DataGenerator with
# partition="test" serves the staged test split. Shuffling is turned off so
# the predictions come back in the same order as the pickled test chains.
test_generator = DataGenerator(partition="test", shuffle=False)
model.predict_generator(test_generator, steps=len(test_generator), verbose=1)
#for i, samp in enumerate(test_generator):
#    print(i, (samp[0].shape), (samp[1].shape))


In [85]:
def eval_model(model, partition="test", metric="accuracy", verbose=True):
    """Return (and optionally print) a metric computed on one staged partition.

    Each chain in ``pickled_data/staged_data/{partition}_data.pickle`` is fed
    to ``model.predict`` as a batch of one. The per-residue argmax of the
    prediction and of the matching label array are concatenated over all
    chains and scored with scikit-learn.

    Args:
        model: Object with a Keras-style ``predict`` method that returns an
            array of shape ``(1, residues, n_classes)``.
        partition: Name used to build the pickle file names.
        metric: One of "accuracy", "balanced", "f1" or "all".
        verbose: When true, print a progress line and the result.

    Returns:
        The metric value; a list ``[accuracy, balanced_accuracy, f1]`` for
        ``metric="all"``; or an error-message string when ``metric`` is not
        recognised (kept for compatibility with existing callers).
    """
    data_path = "pickled_data/staged_data/{}_data.pickle".format(partition)
    label_path = "pickled_data/staged_data/{}_labels.pickle".format(partition)
    # Context managers close the handles even if unpickling raises.
    with open(data_path, "rb") as data_chain_in:
        data = pickle.load(data_chain_in)
    with open(label_path, "rb") as label_chain_in:
        labels = pickle.load(label_chain_in)

    if verbose:
        print("Calculating {} metric on {} samples".format(metric, len(labels)))

    pred_list = []
    label_list = []

    # Get predictions and concatenate into array
    for i, chain_labels in enumerate(labels):
        # Add a leading batch axis; the feature width is taken from the chain
        # itself rather than hard-coded.
        batch = np.asarray(data[i], dtype=float)[np.newaxis, ...]

        pred_list.append(np.argmax(model.predict(batch), axis=2)[0])
        label_list.append(np.argmax(chain_labels, axis=1))

    pred_array = np.concatenate(pred_list)
    label_array = np.concatenate(label_list)

    if metric not in ("accuracy", "balanced", "f1", "all"):
        return "bad metric argument: use accuracy,balanced or f1"

    # Imported here so the function does not depend on a later notebook cell
    # having imported accuracy_score before it is called (for example from the
    # Metrics callback during training).
    from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

    if metric == "accuracy":
        metval = accuracy_score(label_array, pred_array)
    elif metric == "balanced":
        metval = balanced_accuracy_score(label_array, pred_array)
    elif metric == "f1":
        metval = f1_score(label_array, pred_array)
    else:  # metric == "all"
        metval = [accuracy_score(label_array, pred_array),
                  balanced_accuracy_score(label_array, pred_array),
                  f1_score(label_array, pred_array)]
        if verbose:
            print(metval)

    if verbose and metric != "all":
        print("{} on {} set is: {}".format(metric, partition, metval))
    return metval
 

In [84]:
# Evaluate metric on whole test set

# accuracy_score is not imported by the earlier sklearn import cell, so it is
# brought into scope here together with the other two metric functions.
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

# Prints and returns the F1 score computed over every residue of the staged
# "test" partition.
eval_model(model, partition="test", metric="f1")

Calculating f1 metric on 1343 samples
f1 on test set is: 0.6415008062381169


0.6415008062381169

In [None]:
# Read data in for eval
# Context managers close each file even if unpickling raises.
with open("pickled_data/train_chains_sorted.pickle","rb") as data_chain_in:
    train = pickle.load(data_chain_in)
with open("pickled_data/label_chains_sorted.pickle","rb") as label_chain_in:
    labels = pickle.load(label_chain_in)

In [54]:
# Show the shape of the first chain's array (requires the cell that loads
# `train` to have been run first).
train[0].shape

NameError: name 'train' is not defined

In [None]:
# Evaluate on a specific protein chain
index = 1008
length = train[index].shape[0]
# Batch of one; the feature width is taken from the chain itself instead of
# being hard-coded.
temp = np.empty((1,) + train[index].shape, dtype=float)
temp[0,:,:] = train[index]

# Run the model once and reuse the result for both printing and scoring.
temp1 = np.argmax(model.predict(temp), axis=2)[0]
temp2 = np.argmax(labels[index], axis=1)

print("Predicted caps")
print(temp1)

print("True caps")
print(temp2)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of residues in the selected chain whose predicted class (temp1)
# matches the true class (temp2).
accuracy_score(temp2, temp1)
#model.predict_generator(DataGeneratorTest())