# Predict Helix Capping Residues #

The goal is to identify the residues just before an alpha helix begins and the residues just after the helix ends. This will improve secondary structure predictors because they often extend helices too far or do not start them at the right place.

The CapsDB has annotated sequences of structures of helix capping residues that can be used to train a deep neural net. We will use a Bidirectional LSTM with phi/psi features to see whether those features are good predictors.

## 1. Download data ##

## 2. Generate Features ##
### MMTF Pyspark Imports ###

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.ml import ProteinSequenceEncoder

### Custom imports ###

In [4]:
import secondaryStructureExtractorFull
#import mmtfToASA
import os
os.getcwd()

'/home/ec2-user/SageMaker/ProteinFragmenter'

### Configure Spark Context ###

In [5]:
# Create (or reuse) the notebook-wide Spark session: local master with 8 threads,
# registered under the application name "DeepCap".
spark = SparkSession.builder.master("local[8]").appName("DeepCap").getOrCreate()

### Filter out chains not in CapsDB ###

In [7]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import concat, col, lit, array_contains

# SQLContext expects a SparkContext, not a SparkSession; pass the session's
# context explicitly. The name is kept so later cells can still use `sqlContext`.
sqlContext = SQLContext(spark.sparkContext)

# Load the CapsDB descriptors directly through the SparkSession reader.
capsdb = spark.read.parquet('caps_descriptors.parquet')

# Distinct chain identifiers of the form "<PDBID>.<chain>" (PDB id upper-cased);
# these are the keys that HasHelixCapInfo compares against.
# `upper` is provided by the `from pyspark.sql.functions import *` import above.
capsdb_pdbs = (
    capsdb
    .select(concat(upper(col("pdbId")), lit("."), col("chain")).alias("id"))
    .drop_duplicates()
)


In [12]:
#from pyspark.sql.types import BooleanType

# def hasHelixCapInfo(enc_obj, ids):
#     return(enc_obj[0] in ids)

# def udf_hasHelixCapInfo(ids):
#     return udf(lambda t: hasHelixCapInfo(t, ids))
#     # I'm not sure how to call these successfully - Sean

# Collect the "<PDBID>.<chain>" ids to the driver as a plain Python list.
# (The loop variable is named `row` so the builtin `list` is not shadowed.)
capsdb_ids = [row[0] for row in capsdb_pdbs.select("id").collect()]
class HasHelixCapInfo(object):
    '''This filter returns true if the structure is in CAPSDB

    Parameters
    ----------
    ids : iterable of str, optional
        Identifiers to accept. Each is compared against the first element of
        the item being filtered. When omitted, the module-level `capsdb_ids`
        is looked up at call time, so `HasHelixCapInfo()` keeps working.
    '''
    def __init__(self, ids=None):
        # A frozenset gives O(1) membership tests and travels with the filter
        # instance, so no global is needed when ids are passed explicitly.
        self.ids = None if ids is None else frozenset(ids)

    def __call__(self, t):
        # t[0] is the item's identifier; t[1:] is ignored by this filter.
        ids = capsdb_ids if self.ids is None else self.ids
        return t[0] in ids

### Read MMTF File and get a set of L-protein chains ###

In [6]:
# Download and unpack the "full" MMTF Hadoop sequence files only when the
# "full" directory is not already present in the working directory.
# The `!` prefix is an IPython shell escape, so this cell only runs in a notebook.
if not os.path.isdir("full"):
    !wget https://mmtf.rcsb.org/v1.0/hadoopfiles/full.tar && tar -xvf full.tar
        
# Same guard for the "reduced" MMTF Hadoop sequence files.
# NOTE(review): no other cell in the visible notebook reads "reduced" -- confirm it is needed.
if not os.path.isdir("reduced"):
    !wget https://mmtf.rcsb.org/v1.0/hadoopfiles/reduced.tar && tar -xvf reduced.tar

In [13]:
# Read one partition of the full MMTF archive, split every structure into its
# polymer chains, and keep only chains that pass both filters below.
pdb = (
    mmtfReader.read_sequence_file('full/part-00001')
    .flatMap(StructureToPolymerChains())
    .filter(ContainsLProteinChain())
    .filter(HasHelixCapInfo())
)
pdb.count()
#pdb.take(2)
#?pdb

2

In [9]:
#pdb2.write.parquet("output/pdb_filteredToCaps.parquet") # this doesn't work -Sean

In [190]:
# Each element of `pdb` carries an MMTFEncoder object as its second item: need to
# turn this into something secondaryStructureExtractorFull expects or modify
# secondaryStructureExtractorFull to make use of the MMTFEncoder

# The import must be active: MmtfStructure is used below, and leaving it
# commented out raises NameError on a fresh kernel.
from mmtfPyspark.utils import mmtfStructure

# Take the first chain, encode it to its MMTF data mapping, and wrap the
# result in an MmtfStructure for inspection.
obj = pdb.take(1)[0][1].encode_data()
structure = mmtfStructure.MmtfStructure(obj)


In [191]:
# Exploratory inspection of the encoded record. Only the value of the last
# uncommented expression is displayed by the notebook: the atom names of the
# first entry in the structure's group list.
obj.keys()
#structure.group_type_list
structure.group_list[0]['atomNameList']
#obj['structureId']
#obj['entityList']
#obj['bioAssemblyList']
#obj['atomIdList']
#temp = type(pdb.take(10))

['N', 'CA', 'C', 'O', 'CB', 'H1', 'HA', 'HB1', 'HB2', 'HB3']

### Get Torsion angle and secondary structure info ###

In [204]:
#from mmtfPyspark.datasets import secondaryStructureExtractor

# Build the per-residue dataset with the local extractor and convert it to a
# pandas DataFrame. The preview in the next cell shows the columns
# pdbId, chain, resi, resn, phi and psi.
data = secondaryStructureExtractorFull.get_dataset(pdb).toPandas()
#data = secondaryStructureExtractor.get_dataset(pdb).toPandas()

RUNNING
MAPPED


In [205]:
# Preview the first ten rows of the per-residue table.
data.head(10)

Unnamed: 0,pdbId,chain,resi,resn,phi,psi
0,1buj,A,0,ALA,,-60.69939
1,1buj,A,1,VAL,-146.134125,57.770145
2,1buj,A,2,ILE,-111.048912,138.046036
3,1buj,A,3,ASN,-159.172882,43.077713
4,1buj,A,4,THR,-118.766075,165.706207
5,1buj,A,5,PHE,-60.730888,-49.27467
6,1buj,A,6,ASP,-71.140984,-37.861027
7,1buj,A,7,GLY,-69.076538,-48.363979
8,1buj,A,8,VAL,-69.851631,-40.401875
9,1buj,A,9,ALA,-59.562462,-37.430336


In [9]:
import numpy as np

MAX_SEQ_LEN = 5000                # chains are truncated / zero-padded to this length
FEATURE_COLUMNS = ["phi", "psi"]  # per-residue feature columns taken from `data`

# Pull the CapsDB annotations to the driver once and index them by chain, so
# labelling a residue is a dictionary lookup instead of a Spark job per residue.
# NOTE(review): assumes `capsdb` has `startcap`/`endcap` columns that are
# comparable with data["resi"] and use the same residue numbering -- confirm
# against caps_descriptors.parquet.
cap_ranges = {}
for row in capsdb.select("pdbId", "chain", "startcap", "endcap").collect():
    key = (row["pdbId"].upper(), row["chain"])
    cap_ranges.setdefault(key, []).append((row["startcap"], row["endcap"]))


def is_cap(pdb, chain, resi):
    """Return the one-hot label of a residue: [0, 1] if it is a cap, else [1, 0].

    Parameters
    ----------
    pdb : str
        PDB id of the structure (case-insensitive)
    chain : str
        Chain id within the structure
    resi : int
        Residue number to test against the annotated cap ranges

    Returns
    -------
    [0, 1] when `resi` falls inside any startcap..endcap range (inclusive)
    annotated for this chain, otherwise [1, 0]
    """
    ranges = cap_ranges.get((pdb.upper(), chain), ())
    if any(start <= resi <= end for start, end in ranges):
        return [0, 1]
    return [1, 0]


# The column is named "pdbId" (see the preview of `data`), not "pdb".
groups = data.groupby(["pdbId", "chain"])
                          # num chains,    max len of seqs, num features
training_data = np.zeros((groups.ngroups, MAX_SEQ_LEN,     len(FEATURE_COLUMNS)))
truth = np.zeros((groups.ngroups,         MAX_SEQ_LEN,     2))

# np.zeros (not np.empty) so positions past the end of a chain hold a defined
# padding value rather than uninitialised memory.
# NOTE(review): phi/psi are NaN at chain termini (see the first preview row);
# decide how to impute them before training.
for i, ((pdb_id, chain), group) in enumerate(groups):
    # `pdb_id` rather than `pdb`, so the `pdb` RDD defined earlier is not clobbered.
    residues = group.head(MAX_SEQ_LEN)
    n = len(residues)
    training_data[i, :n, :] = residues[FEATURE_COLUMNS].values.astype(float)
    truth[i, :n, :] = np.array(
        [is_cap(pdb_id, chain, resi) for resi in residues["resi"]]
    ).reshape(-1, 2)

NameError: name 'data' is not defined

In [None]:
# `data` is a pandas DataFrame (built with toPandas()), which has no Spark-style
# .show(); preview it with head() instead.
data.head()

### Write features to H5 file ###

In [None]:
# Intended to persist the feature table to './features.h5', overwriting any existing output.
# NOTE(review): `caps_pdb` is not defined anywhere in the visible notebook -- confirm
# which object is meant to be written here.
# NOTE(review): 'hdf' does not look like a built-in Spark data source format -- confirm
# that a matching writer is installed, or write the arrays with an HDF5 library instead.
caps_pdb.write.mode('overwrite').format('hdf').save('./features.h5')

### Get truth labels and Save to H5 ###

In [None]:
#Write out truth.h5...

### Terminate Spark ###

In [14]:
# Shut down the Spark session; cells that use `spark` or RDDs created from it
# must be run before this one.
spark.stop()

## 4. Build Bidirectional LSTM ##

In [None]:
def create_model(num_features, num_outputs=2, latent_dim=100):
    """Create a Seq2Seq Bidirectional LSTM
    From: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

    Parameters
    ----------
    num_features : int
        The number of features in your training data
    num_outputs : int
        Number of outputs to predict, i.e. number of classes or 2 for binary
    latent_dim : int
        Number of units in each direction of the encoder and decoder LSTMs

    Returns
    -------
    A new Keras Seq2Seq Bidirectional LSTM taking
    [encoder_inputs, decoder_inputs] and producing a softmax over
    `num_outputs` classes at every decoder timestep
    """
    # Imported here so the function is usable regardless of cell order.
    from keras.layers import Bidirectional, Dense, Input, LSTM
    from keras.models import Model

    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, num_features))
    encoder = Bidirectional(LSTM(latent_dim, return_state=True))

    # With return_state=True a Bidirectional LSTM returns five tensors: the
    # output followed by the forward (h, c) and backward (h, c) states.
    # We discard the output and only keep the states.
    _, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
    encoder_states = [forward_h, forward_c, backward_h, backward_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_outputs))

    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))
    # The first half of `initial_state` seeds the forward layer and the second
    # half the backward layer; index 0 of the result is the output sequence.
    decoder_outputs = decoder_lstm(decoder_inputs,
                                   initial_state=encoder_states)[0]
    decoder_dense = Dense(num_outputs, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

    return model

In [None]:
def train():
    """Train the Bidirectional LSTM on the saved feature and truth matrices.

    Reads the 'data' dataset from 'features.h5' and from 'truth.h5', builds the
    model with `create_model`, compiles it and fits it.

    Returns
    -------
    The fitted Keras model
    """
    # Imported here so the function is usable regardless of cell order.
    from keras.utils import HDF5Matrix

    # 'features.h5' matches the file name used by the feature-writing cell
    # ('./features.h5'); the previous 'features.hdf5' pointed at a different file.
    X_data = HDF5Matrix('features.h5', 'data')
    y_data = HDF5Matrix('truth.h5', 'data')

    # create_model requires the feature count; derive both sizes from the last
    # axis of the stored arrays instead of hard-coding them.
    model = create_model(X_data.shape[-1], y_data.shape[-1])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Automatically determine batch sizes, train/test splits.
    # shuffle='batch' is the Keras option intended for HDF5-backed data.
    # NOTE(review): create_model declares a second (decoder) Input layer; if the
    # built model keeps both inputs, fit() must also be given a matching
    # decoder input -- confirm.
    model.fit(X_data, y_data, shuffle='batch')

    return model

In [None]:
import numpy as np
import keras

class DataGenerator(keras.utils.Sequence):
    """Generates batches of data for Keras.

    Each sample is loaded from 'data/<ID>.npy' and its class is looked up in
    `labels`; classes are returned one-hot encoded.

    Parameters
    ----------
    list_IDs : list of str
        Sample identifiers; each names a file 'data/<ID>.npy'
    labels : mapping
        Maps a sample identifier to its integer class
    batch_size : int
        Number of samples per batch
    dim : tuple of int
        Shape of one sample, excluding the channel axis
    n_channels : int
        Size of the trailing channel axis
    n_classes : int
        Number of classes used for the one-hot encoding
    shuffle : bool
        Whether to reshuffle the sample order after every epoch
    """

    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True):
        """Store the configuration and build the first epoch's index order."""
        # Let the Keras base class initialise itself.
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        # floor(): a trailing partial batch is not served.
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generate the batch at position `index` as an (X, y) pair."""
        # Positions into list_IDs that belong to this batch
        batch_positions = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into sample identifiers
        batch_ids = [self.list_IDs[k] for k in batch_positions]

        # Load the samples and their labels
        return self.__data_generation(batch_ids)

    def on_epoch_end(self):
        """Rebuild the index order, shuffling it when `shuffle` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the given samples; X has shape (batch_size, *dim, n_channels)."""
        # Pre-allocate the batch arrays
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i] = np.load('data/' + ID + '.npy')

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)