# Predict Helix Capping Residues #

The goal is to identify residues just before an alpha helix begins or the residues just after the helix ends. This will improve secondary structure predictors becuase they often extend too far or do not start at the right place. 

The CapsDB has annoted sequences of structures of helix capping residues that can be used to train a deep nueral net. We will use a Bidirectional LSTM using phi/psi features to see if it will those will be good predictors.

## 1. Download data ##

## 2. Generate Features ##
### MMTF Pyspark Imports ###

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.ml import ProteinSequenceEncoder
import numpy as np
import pandas as pd

### Custom imports ###

In [2]:
import secondaryStructureExtractorFull
#import mmtfToASA
import os
os.getcwd()

'/home/ec2-user/SageMaker/ProteinFragmenter'

### Configure Spark Context ###

In [3]:
spark = SparkSession.builder.master("local[8]").appName("DeepCap").getOrCreate()

### Filter out chains not in CapsDB ###

In [4]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import concat, col, lit, array_contains

sqlContext = SQLContext(spark)
capsdb = sqlContext.read.parquet('caps_descriptors.parquet')
capsdb_pdbs = capsdb.select(concat(upper(col("pdbId")), lit("."), col("chain")).alias("id")).drop_duplicates()


In [5]:
capsdb_pdbs.count()

6749

In [35]:
#from pyspark.sql.types import BooleanType

# def hasHelixCapInfo(enc_obj, ids):
#     return(enc_obj[0] in ids)

# def udf_hasHelixCapInfo(ids):
#     return udf(lambda t: hasHelixCapInfo(t, ids))
#     # I'm not sure how to call these successfully - Sean

capsdb_ids = set()
[capsdb_ids.add(list[0]) for list in capsdb_pdbs.select("id").collect()]

class HasHelixCapInfo(object):
    '''This filter returns true if the structure is in CAPSDB'''
    def __call__(self, t):
        return(t[0] in capsdb_ids)
    # This works but I don't like that it refers to a global variable: not sure how to change that right now - Sean


### Read MMTF File and get a set of L-protein chains ###

In [7]:
if not os.path.isdir("full"):
    !wget https://mmtf.rcsb.org/v1.0/hadoopfiles/full.tar && tar -xvf full.tar
        
if not os.path.isdir("reduced"):
    !wget https://mmtf.rcsb.org/v1.0/hadoopfiles/reduced.tar && tar -xvf reduced.tar

In [36]:
pdb = mmtfReader.read_sequence_file('full') \
    .flatMap(StructureToPolymerChains()) \
    .filter(ContainsLProteinChain()) \
    .filter(HasHelixCapInfo())
#pdb.count()
#pdb.take(2)
#?pdb

### Get Torsion angle and secondary structure info ###

In [None]:
#from mmtfPyspark.datasets import secondaryStructureExtractor

data = secondaryStructureExtractorFull.get_dataset(pdb).toPandas()
#data = secondaryStructureExtractor.get_dataset(pdb).toPandas()

RUNNING
MAPPED


In [13]:
data.head(10)

Unnamed: 0,pdbId,chain,resi,resn,phi,psi
0,1byf,A,1,ASP,,142.984375
1,1byf,A,2,TYR,-151.753433,176.059341
2,1byf,A,3,GLU,-114.129326,124.031487
3,1byf,A,4,ILE,-122.864243,125.844177
4,1byf,A,5,LEU,-114.774071,127.154877
5,1byf,A,6,PHE,-111.492783,134.083374
6,1byf,A,7,SER,-129.723282,133.550552
7,1byf,A,8,ASP,-75.892632,-8.814808
8,1byf,A,9,GLU,-70.092163,131.786179
9,1byf,A,10,THR,-103.563698,151.883087


In [None]:
df1 = capsdb.toPandas()
df = pd.merge(data, df1, left_on=('pdbId','chain'), right_on=('pdbid','chain'), how='inner')
df = df[['pdbId', 'chain', 'resi', 'resn', 'phi', 'psi', 'startcap', 'endcap']]


In [None]:
df['is_cap'] = df.apply(lambda x: 1 if (x['resi'] >= x['startcap'] and x['resi'] <= x['endcap']) else 0, axis=1)
df_caps = df.groupby(["pdbId", "chain", "resi"])['is_cap'].max().reset_index()

In [None]:
data_caps = pd.merge(data, df_caps, left_on=('pdbId','chain', 'resi'), right_on=('pdbId','chain', 'resi'), how='inner')

In [None]:
def is_cap(pdbId, chain, resi, is_cap):
    if is_cap == 1:
        return([1,0])
    else:
        return([0,1])

In [120]:
groups = data_caps.groupby(["pdbId", "chain"])
                           # num pdbs,    max len of seqs, num features
training_data = np.zeros((groups.ngroups, 5000, 2), dtype=float)
                           # num pdbs,    max len of seqs, length of one-hot encoded target
truth = np.zeros((groups.ngroups, 5000, 2), dtype=int)
truth_lagged = np.zeros((groups.ngroups, 5000, 2), dtype=int)


In [119]:
data_caps.head()

Unnamed: 0,pdbId,chain,resi,resn,phi,psi,is_cap
0,4nec,A,19,HIS,,-56.920506,0
1,4nec,A,20,MET,-75.926338,-23.480333,0
2,4nec,A,21,THR,-55.579212,-47.003891,0
3,4nec,A,22,GLU,-65.051697,-42.223095,0
4,4nec,A,23,VAL,-60.412868,-54.407536,0


In [123]:
#count=0
for i, ((pdbid, chain), group) in enumerate(groups):
    for j, featuretuple in enumerate(group.itertuples()):
        if j>=5000: break
        #print(featuretuple)
        #print(count)
        #count =count+1
        training_data[i,j,:] = (featuretuple.phi, featuretuple.psi)
        truth[i,j,:] = is_cap(featuretuple.pdbId, featuretuple.chain, featuretuple.resi, is_cap)#, featuretuple.startcap, featuretuple.endcap)
        truth_lagged[i,j+1,:] = truth[i,j,:]
        

In [126]:
#training_data[10]
#truth.shape
training_data.shape
#os.getcwd()

(6714, 5000, 2)

In [127]:
import h5py
os.remove('features.h5')
os.remove('truth.h5')
os.remove('truthlag.h5')

h5f = h5py.File('features.h5', 'w')
h5f.create_dataset('training_data', data=training_data)
h5f.close()

h5tr = h5py.File('truth.h5', 'w')
h5tr.create_dataset('truth', data=truth)
h5tr.close()

h5trl = h5py.File('truthlag.h5', 'w')
h5trl.create_dataset('truthlag', data=truth_lagged)
h5trl.close()

#os.remove("features.h5")

### Write features to H5 file ###

In [None]:
#caps_pdb.write.mode('overwrite').format('hdf').save('./features.h5')
#training_data.write.mode('overwrite').format('hdf').save('./features.h5')

### Get truth labels and Save to H5 ###

In [None]:
#Write out truth.h5...

### Terminate Spark ###

In [128]:
spark.stop()

## 4. Build Bidirectional LSTM ##

In [None]:
def get_callbacks(model_file, initial_learning_rate=0.0001, learning_rate_drop=0.5, learning_rate_epochs=None,
                  learning_rate_patience=50, logging_file="training.log", verbosity=1,
                  early_stopping_patience=None):
    callbacks = list()
    callbacks.append(ModelCheckpoint(model_file, save_best_only=True))
    callbacks.append(CSVLogger(logging_file, append=True))
    if learning_rate_epochs:
        callbacks.append(LearningRateScheduler(partial(step_decay, initial_lrate=initial_learning_rate,
                                                       drop=learning_rate_drop, epochs_drop=learning_rate_epochs)))
    else:
        callbacks.append(ReduceLROnPlateau(factor=learning_rate_drop, patience=learning_rate_patience,
                                           verbose=verbosity))
    if early_stopping_patience:
        callbacks.append(EarlyStopping(verbose=verbosity, patience=early_stopping_patience))
    return callbacks

In [3]:
def create_model(num_features, num_outputs=2, latent_dim=100):
    """Create a Seq2Seq Bidirectional LSTM
    From: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
    
    Parameters
    ----------
    num_features : int
        The number of features in your trianing data
    num_outputs : int
        Number of outputs to rpedict, i.e. number of classes or 2 for binary
        
    Returns
    -------
    A new Keras Seq2Seq Bidirectional LSTM
    """
    
    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, num_features))
    encoder = LSTM(latent_dim, return_state=True)
    #encoder = Bidirectional(LSTM(latent_dim, return_state=True))
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    #encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
    #state_h = Concatenate()([forward_h, backward_h])
    #state_c = Concatenate()([forward_c, backward_c])
    
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    #decoder_inputs = Input(shape=(None, num_outputs))
        
    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    #decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    #decoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))
    #decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    
    decoder_inputs = Input(shape=(None, num_outputs))
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    #decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    
    decoder_dense = Dense(num_outputs, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    #return model
    return encoder_inputs, encoder_states, decoder_inputs, decoder_lstm, decoder_dense, model

In [4]:
def train():
    encoder_input_data = HDF5Matrix('features.h5', 'training_data')
    decoder_input_data = HDF5Matrix('truth.h5', 'truth')
    decoder_target_data = HDF5Matrix('truthlag.h5', 'truthlag')
    encoder_inputs, encoder_states, decoder_inputs, decoder_lstm, decoder_dense, model = create_model(num_features=2)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    #Automicatlly determine batch sizes, train/test splits
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data, shuffle="batch")
    
    # serialize model to JSON
    model_json = model.to_json()
    with open("model_StS_BiLSTM.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model_StS_BiLSTM.h5")
    print("Saved model to disk")
    
    return encoder_inputs, encoder_states, decoder_inputs, decoder_lstm, decoder_dense, model

In [None]:
import tensorflow as tf
import keras
from keras.utils.io_utils import HDF5Matrix
from keras.layers import Input, Dense, Bidirectional, LSTM, Concatenate
from keras.models import Model

encoder_inputs, encoder_states, decoder_inputs, decoder_lstm, decoder_dense, model = train()

Using TensorFlow backend.


Epoch 1/1
Saved model to disk


  '. They will not be included '


In [6]:
# load json and create model
json_file = open('model_StS_BiLSTM.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = keras.models.model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model_StS_BiLSTM.h5")
print("Loaded model from disk")

Loaded model from disk


In [30]:
decoder_lstm.weights
#model.layers[5].weights

[<tf.Variable 'lstm_13/kernel:0' shape=(2, 400) dtype=float32_ref>,
 <tf.Variable 'lstm_13/recurrent_kernel:0' shape=(100, 400) dtype=float32_ref>,
 <tf.Variable 'lstm_13/bias:0' shape=(400,) dtype=float32_ref>]

In [6]:
# Do steps to prepare for inference
latent_dim=100
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [7]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, 2))
    # Populate the first character of target sequence with the start residue.
    target_seq[0, 0, :] = [0, 1] # decoder tokens one-hot encoding hardcoded here

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_chain = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        decoded_chain.append(sampled_token_index)

        # Exit condition: either hit max length
        # or find stop character.
        if len(decoded_chain) > 5000:
            stop_condition = True

        # Update the target sequence (of length 1).
        sampled_residue = [0, 0]
        sampled_residue[sampled_token_index] = 1
        target_seq = np.zeros((1, 1, 2))
        target_seq[0, 0, :] = sampled_residue

        # Update states
        states_value = [h, c]

    return decoded_chain

In [10]:
#training_data = HDF5Matrix('features.h5', 'training_data')
import h5py
hf = h5py.File('features.h5', 'r')
df_training = hf.get('training_data').value



In [11]:
df_training.shape

(6714, 5000, 2)

In [12]:
temp = df_training[4:5,:,:]

In [81]:
temp.shape

(1, 5000, 2)

In [13]:
result = decode_sequence(temp)

In [14]:
i = 0
for j in result:
    i += j
i

0

In [17]:
sum(result)

AttributeError: 'NoneType' object has no attribute '_jvm'