In [1]:
import pandas as pd 
import numpy as np 
from protein_data import ProteinData


In [4]:
# load data

# updated data with 25% identity and 2.0 Angstrom cutoffs 
ss_2022_25_20 = pd.read_csv('2022-08-06-pdb-intersect-pisces_pc25_r2.0.csv')

# updated data with 25% identity and 2.5 Angstrom cutoffs 
ss_2022_25_25 = pd.read_csv('2022-08-06-pdb-intersect-pisces_pc25_r2.5.csv')

# updated data with 30% identity and 2.5 Angstrom cutoffs 
ss_2022_30_25 = pd.read_csv('2022-08-06-pdb-intersect-pisces_pc30_r2.5.csv')

sst3_data = ProteinData(df = ss_2022_25_20, target = 'sst3', n = 3, maxlen = 300)
sst8_data = ProteinData(df = ss_2022_25_20, target = 'sst8', n = 3, maxlen = 300)



(732, 300, 4)

In [6]:
# Build the model for predicting the SST3 sequence
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Dropout


model = Sequential([
    # the embedding layer converts the input sequences into higher-dimensional embeddings
    # that will be fed to the LSTM layer 
    Embedding(input_dim = sst3_data.n_words, output_dim = 256, input_length = sst3_data.maxlen),
    # the LSTM layer processes the sequences, retaining relevant information about the
    # context of each input that it processes from both earlier and later in the sequence 
    Bidirectional(LSTM(units = 128, return_sequences = True)),
    # this layer converts the output from the LSTM layer into the output sequence
    TimeDistributed(Dense(sst3_data.n_ssts, activation = 'softmax'))])

model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 256)          2168576   
                                                                 
 bidirectional_1 (Bidirectio  (None, 300, 256)         394240    
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 300, 4)           1028      
 tributed)                                                       
                                                                 
Total params: 2,563,844
Trainable params: 2,563,844
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Train the model
from sklearn.model_selection import train_test_split
from keras import backend as K

# Q3 Accuracy Implementation from https://www.kaggle.com/code/helmehelmuto/secondary-structure-prediction-with-keras/notebook
# "SS prediction is usually evaluated by Q3 or Q8 accuracy, which measures the percent of residues for which 3-state or 8-state 
# secondary structure is correctly predicted"  (doi: 10.1038/srep18962)
def q3_acc(y_true, y_pred):
    y = tf.argmax(y_true, axis=-1)
    y_ = tf.argmax(y_pred, axis=-1)
    mask = tf.greater(y, 0)
    return K.cast(K.equal(tf.boolean_mask(y, mask), tf.boolean_mask(y_, mask)), K.floatx())

model.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["accuracy", q3_acc])
model.fit(sst3_data.train_sequences, 
          sst3_data.y_train_sequences, 
          batch_size = 128, 
          epochs = 10, 
          validation_data = (sst3_data.valid_sequences, 
                             sst3_data.y_valid_sequences), 
          verbose = 1)




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KeyboardInterrupt: 

In [30]:
model2 = Sequential([
    # the embedding layer converts the input sequences into higher-dimensional embeddings
    # that will be fed to the LSTM layer 
    Embedding(input_dim = y['sst8']['n_words'], output_dim = 256, input_length = maxlen),
    # the LSTM layer processes the sequences, retaining relevant information about the
    # context of each input that it processes from both earlier and later in the sequence 
    Bidirectional(LSTM(units = 128, return_sequences = True)),
    TimeDistributed(Dropout(0.1)),
    # this layer converts the output from the LSTM layer into the output sequence
    TimeDistributed(Dense(y['sst3']['n_ssts'], activation = 'softmax'))])

model2.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 2128, 256)         2176512   
                                                                 
 bidirectional_12 (Bidirecti  (None, 2128, 256)        394240    
 onal)                                                           
                                                                 
 time_distributed_14 (TimeDi  (None, 2128, 256)        0         
 stributed)                                                      
                                                                 
 time_distributed_15 (TimeDi  (None, 2128, 4)          1028      
 stributed)                                                      
                                                                 
Total params: 2,571,780
Trainable params: 2,571,780
Non-trainable params: 0
___________________________________________

In [31]:
model2.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["accuracy", q3_acc])
model2.fit(train_sequences, 
          y['sst3']['train_sequences'], 
          batch_size = 128, 
          epochs = 10, 
          validation_data = (valid_sequences, 
                             y['sst3']['valid_sequences']), 
          verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x38a586cb0>

In [34]:
model3 = Sequential([
    # the embedding layer converts the input sequences into higher-dimensional embeddings
    # that will be fed to the LSTM layer 
    Embedding(input_dim = y['sst8']['n_words'], output_dim = 256, input_length = maxlen),
    # the LSTM layer processes the sequences, retaining relevant information about the
    # context of each input that it processes from both earlier and later in the sequence 
    Bidirectional(LSTM(units = 128, return_sequences = True)),
    TimeDistributed(Dropout(0.1)),
    Bidirectional(LSTM(units=56, return_sequences = True)),
    # this layer converts the output from the LSTM layer into the output sequence
    TimeDistributed(Dense(y['sst3']['n_ssts'], activation = 'softmax'))])

model3.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 2128, 256)         2176512   
                                                                 
 bidirectional_14 (Bidirecti  (None, 2128, 256)        394240    
 onal)                                                           
                                                                 
 time_distributed_17 (TimeDi  (None, 2128, 256)        0         
 stributed)                                                      
                                                                 
 bidirectional_15 (Bidirecti  (None, 2128, 112)        140224    
 onal)                                                           
                                                                 
 time_distributed_18 (TimeDi  (None, 2128, 4)          452       
 stributed)                                          

In [35]:
model3.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["accuracy", q3_acc])
model3.fit(train_sequences, 
          y['sst3']['train_sequences'], 
          batch_size = 128, 
          epochs = 10, 
          validation_data = (valid_sequences, 
                             y['sst3']['valid_sequences']), 
          verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x324d59870>

In [37]:
model4 = Sequential([
    # the embedding layer converts the input sequences into higher-dimensional embeddings
    # that will be fed to the LSTM layer 
    Embedding(input_dim = y['sst8']['n_words'], output_dim = 256, input_length = maxlen),
    # the LSTM layer processes the sequences, retaining relevant information about the
    # context of each input that it processes from both earlier and later in the sequence 
    Bidirectional(LSTM(units = 128, return_sequences = True)),
    # this layer converts the output from the LSTM layer into the output sequence
    TimeDistributed(Dense(y['sst8']['n_ssts'], activation = 'softmax'))])

model4.summary()

model4.compile(optimizer = "rmsprop", loss = "categorical_crossentropy", metrics = ["accuracy", q3_acc])
model4.fit(train_sequences, 
          y['sst8']['train_sequences'], 
          batch_size = 128, 
          epochs = 10, 
          validation_data = (valid_sequences, 
                             y['sst8']['valid_sequences']), 
          verbose = 1)

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 2128, 256)         2176512   
                                                                 
 bidirectional_17 (Bidirecti  (None, 2128, 256)        394240    
 onal)                                                           
                                                                 
 time_distributed_20 (TimeDi  (None, 2128, 9)          2313      
 stributed)                                                      
                                                                 
Total params: 2,573,065
Trainable params: 2,573,065
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x32e5f78e0>