In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
!pip install tensorflow==1.14.0

Collecting tensorflow==1.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl (109.2MB)
[K     |████████████████████████████████| 109.2MB 91kB/s 
[?25hCollecting tensorboard<1.15.0,>=1.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.2MB 64.3MB/s 
Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488kB)
[K     |████████████████████████████████| 491kB 74.0MB/s 
Installing collected packages: tensorboard, tensorflow-estimator, tensorflow
  Found existing installation: tensorboard 2.2.0
    Uninstalling tensorbo

In [0]:
%tensorflow_version 1.14
import tensorflow as tf
import keras

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.14`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.


Using TensorFlow backend.


In [0]:
# Show the library versions actually loaded by the kernel.
# The recorded output reports TensorFlow 1.15.2 rather than the 1.14.0 pinned in
# the pip cell; the `%tensorflow_version` magic stated that it only switches the
# major version (1.x / 2.x).
print(tf.__version__)
print(keras.__version__)

1.15.2
2.2.5


In [0]:
import warnings 
warnings.filterwarnings("ignore")

In [0]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [0]:
from google.colab import drive

In [0]:
drive.mount('/content/drive')

In [0]:
import zipfile

In [0]:
# Unzip the filtered CB6133 archive stored on Google Drive.
# The extraction target is a *directory* whose name ends in `.npy.gz`; the
# array file is later read from inside that directory by the preprocess call.
with zipfile.ZipFile("drive/My Drive/Major Project/cullpdb+profile_6133_filtered.npy.gz.zip", 'r') as zip_ref:
    zip_ref.extractall("drive/My Drive/Major Project/cullpdb+profile_6133_filtered.npy.gz")

In [0]:
# Unzip the unfiltered CB6133 archive stored on Google Drive.
# The extraction target is a *directory* whose name ends in `.npy.gz`; the
# array file is later read from inside that directory by the preprocess call.
with zipfile.ZipFile("drive/My Drive/Major Project/cullpdb+profile_6133.npy.gz.zip", 'r') as zip_ref:
    zip_ref.extractall("drive/My Drive/Major Project/cullpdb+profile_6133.npy.gz")

## Preprocessing the data

In [0]:
def preprocess(npy_file, max_len):
    """Load a flattened protein feature dump and split it into sequences and profiles.

    Each row of the array stored in ``npy_file`` is reshaped to 700 positions.
    Per position, columns 0-21 are read as a one-hot residue, columns 22-30 as
    a one-hot Q8 secondary-structure label and columns 35-56 as the profile.

    Parameters
    ----------
    npy_file : str
        Path handed directly to ``np.load``.
        NOTE(review): callers pass paths ending in ``.npy.gz``; ``np.load`` is
        not asked to decompress anything here, so presumably those files are
        plain ``.npy`` despite the suffix — confirm.
    max_len : int
        Number of positions in the returned profile array. Must be at least
        700; extra positions are zero-filled.

    Returns
    -------
    protein_dataset : pandas.DataFrame
        One row per protein with columns ``id`` (1-based), ``len``,
        ``primary structure`` and ``secondary structure`` (strings with the
        ``NoSeq`` padding positions removed).
    profile_padded : numpy.ndarray
        Array of shape ``(n_proteins, max_len, 22)``.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than the 700 positions stored per protein.
    """
    seq_len = 700  # positions per protein in the flattened rows

    residue_list = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y',
                    'X', 'NoSeq']
    q8_list = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T', 'NoSeq']

    # Fail early with a readable message instead of the "negative dimensions"
    # error np.zeros would raise further down.
    if max_len < seq_len:
        raise ValueError(
            "max_len (%d) must be >= %d, the number of positions stored per protein"
            % (max_len, seq_len)
        )

    data = np.load(npy_file)
    data = data.reshape(data.shape[0], seq_len, -1)

    residue_onehot = data[:, :, 0:22]
    q8_onehot = data[:, :, 22:31]
    profile = data[:, :, 35:57]

    # Zero-fill the profile along the position axis up to max_len.
    zero_arr = np.zeros((profile.shape[0], max_len - profile.shape[1], profile.shape[2]))
    profile_padded = np.concatenate([profile, zero_arr], axis=1)

    # Decode the one-hot blocks back to symbols, one symbol per position.
    residue_str = np.array(residue_list)[residue_onehot.argmax(2)]
    q8_str = np.array(q8_list)[q8_onehot.argmax(2)]

    # Drop the 'NoSeq' padding symbol and join the rest into plain strings.
    residue_array = [''.join(vec[vec != 'NoSeq']) for vec in residue_str]
    q8_array = [''.join(vec[vec != 'NoSeq']) for vec in q8_str]

    id_list = np.arange(1, len(q8_array) + 1)
    len_list = np.array([len(x) for x in residue_array])

    protein_dataset = pd.DataFrame({'id': id_list, 'len': len_list, 'primary structure': residue_array,
                                    'secondary structure': q8_array})

    return protein_dataset, profile_padded

In [0]:
cb6133, cb6133_profile = preprocess("drive/My Drive/Major Project/cullpdb+profile_6133.npy.gz/cullpdb+profile_6133.npy.gz", 700)

In [0]:
cb6133_filtered, cb6133_filtered_profile = preprocess("drive/My Drive/Major Project/cullpdb+profile_6133_filtered.npy.gz/cullpdb+profile_6133_filtered.npy.gz", 700)

In [0]:
cb513, cb513_profile = preprocess("drive/My Drive/Major Project/cb513+profile_split1.npy.gz", 700)

In [0]:
# Flatten every per-protein profile matrix into a single row so the profiles
# can be stored as 2-D tables. Each array supplies its own row count and the
# column count is inferred, so this keeps working if preprocess() is called
# with a max_len other than 700.
cb6133_profile_shape = cb6133_profile.reshape(cb6133_profile.shape[0], -1)
cb6133_filtered_profile_shape = cb6133_filtered_profile.reshape(cb6133_filtered_profile.shape[0], -1)
cb513_profile_shape = cb513_profile.reshape(cb513_profile.shape[0], -1)

## Exporting the cb6133, cb6133_filtered and cb513 datasets to CSV files

In [0]:
# Wrap the flattened profile arrays in DataFrames (one row per protein) so they
# can be written out with to_csv in the cells below.
cb6133_profile_df = pd.DataFrame(cb6133_profile_shape)
cb6133_filtered_profile_df = pd.DataFrame(cb6133_filtered_profile_shape)
cb513_profile_df = pd.DataFrame(cb513_profile_shape)

NameError: ignored

In [0]:
# Write the CB6133 sequence table to Drive as CSV, without the index column.
# `cb6133_csv` only captures to_csv's return value, not the table itself.
cb6133_csv = cb6133.to_csv("drive/My Drive/Major Project/cb6133.csv", sep=",", encoding='utf-8', index=False)

In [0]:
# Write the flattened CB6133 profiles to Drive as CSV, without the index column.
# `cb6133_profile_csv` only captures to_csv's return value, not the table itself.
cb6133_profile_csv = cb6133_profile_df.to_csv("drive/My Drive/Major Project/cb6133_profile.csv", index=False)

In [0]:
# Write the filtered CB6133 sequence table to Drive as CSV, without the index column.
# `cb6133_filtered_csv` only captures to_csv's return value, not the table itself.
cb6133_filtered_csv = cb6133_filtered.to_csv("drive/My Drive/Major Project/cb6133_filtered.csv", sep=",", encoding='utf-8', index=False)

In [0]:
# Write the flattened filtered-CB6133 profiles to Drive as CSV, without the index column.
# `cb6133_filtered_profile_csv` only captures to_csv's return value, not the table itself.
cb6133_filtered_profile_csv = cb6133_filtered_profile_df.to_csv("drive/My Drive/Major Project/cb6133_filtered_profile.csv", index=False)

In [0]:
# Write the CB513 sequence table to Drive as CSV, without the index column.
# `cb513_csv` only captures to_csv's return value, not the table itself.
cb513_csv = cb513.to_csv("drive/My Drive/Major Project/cb513.csv", sep=",", encoding='utf-8', index=False)

In [0]:
# Write the flattened CB513 profiles to Drive as CSV, without the index column.
# `cb513_profile_csv` only captures to_csv's return value, not the table itself.
cb513_profile_csv = cb513_profile_df.to_csv("drive/My Drive/Major Project/cb513_profile.csv", index=False)

## Splitting the data into training, validation and test sets

In [0]:
# Training uses the filtered CB6133 table, validation uses rows 5877-6132 of the
# unfiltered CB6133 table, and testing uses CB513.
# NOTE(review): the validation rows come from a different file than the
# training rows — confirm the two sets do not share proteins.
train_df, val_df, test_df = cb6133_filtered, cb6133[5877:6133], cb513

In [0]:
# Same split applied to the profile arrays returned by preprocess().
# Despite the `_df` suffix these names hold the 3-D NumPy arrays, not the
# flattened `*_profile_df` DataFrames built for the CSV export.
train_profile_df, val_profile_df, test_profile_df = cb6133_filtered_profile, cb6133_profile[5877:6133], cb513_profile

In [0]:
# Pull the residue strings (inputs) and Q8 strings (targets) out of the frame.
# NOTE(review): the `len <= 700` mask is applied only here and not to
# `train_profile_df`; rows stay aligned only while the mask keeps every row — confirm.
X_train, y_train = train_df[['primary structure', 'secondary structure']][(train_df.len <= 700)].values.T

In [0]:
# Pull the residue strings (inputs) and Q8 strings (targets) out of the frame.
# NOTE(review): the `len <= 700` mask is applied only here and not to
# `val_profile_df`; rows stay aligned only while the mask keeps every row — confirm.
X_val, y_val = val_df[['primary structure', 'secondary structure']][(val_df.len <= 700)].values.T

In [0]:
# Pull the residue strings (inputs) and Q8 strings (targets) out of the frame.
# NOTE(review): the `len <= 700` mask is applied only here and not to
# `test_profile_df`; rows stay aligned only while the mask keeps every row — confirm.
X_test, y_test = test_df[['primary structure', 'secondary structure']][(test_df.len <= 700)].values.T

## Converting text to integers, i.e., tokenizing the dataset

In [0]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [0]:
# Learn a character-level vocabulary from the training residue strings, encode
# each string as a list of integer ids, then right-pad every list to 700 ids.
X_train_tokenizer = Tokenizer(char_level=True)
X_train_tokenizer.fit_on_texts(X_train)
X_train_seqs = X_train_tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train_seqs, maxlen=700, padding="post")

In [0]:
# Encode the validation residues with the vocabulary learned on the training
# set. Fitting a separate tokenizer here would assign ids from this split's own
# character counts, so the same residue could get a different id than the one
# the Embedding layer is trained on.
# NOTE(review): characters absent from the training vocabulary are presumably
# dropped by texts_to_sequences (no oov_token is set) — confirm none occur.
X_val_tokenizer = X_train_tokenizer  # name kept for backward compatibility
X_val_seqs = X_val_tokenizer.texts_to_sequences(X_val)
X_val = sequence.pad_sequences(X_val_seqs, padding="post", maxlen=700)

In [0]:
# Encode the test residues with the vocabulary learned on the training set.
# Fitting a separate tokenizer here would assign ids from this split's own
# character counts, so the same residue could get a different id than the one
# the Embedding layer is trained on.
# NOTE(review): characters absent from the training vocabulary are presumably
# dropped by texts_to_sequences (no oov_token is set) — confirm none occur.
X_test_tokenizer = X_train_tokenizer  # name kept for backward compatibility
X_test_seqs = X_test_tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(X_test_seqs, padding="post", maxlen=700)

In [0]:
# Learn a character-level vocabulary from the training Q8 strings, encode each
# string as integer ids, right-pad to 700 ids and one-hot encode the result.
y_train_tokenizer = Tokenizer(char_level=True)
y_train_tokenizer.fit_on_texts(y_train)
y_train_seqs = y_train_tokenizer.texts_to_sequences(y_train)
y_train = to_categorical(pad_sequences(y_train_seqs, maxlen=700, padding="post"))

In [0]:
# Encode the validation labels with the label vocabulary learned on the
# training set, so class ids mean the same thing as in y_train. The class count
# is pinned as well; otherwise to_categorical sizes the one-hot axis from the
# largest id present in this split.
y_val_tokenizer = y_train_tokenizer  # name kept for backward compatibility
y_val_seqs = y_val_tokenizer.texts_to_sequences(y_val)
y_val = sequence.pad_sequences(y_val_seqs, padding="post", maxlen=700)
y_val = to_categorical(y_val, num_classes=len(y_train_tokenizer.word_index) + 1)

In [0]:
# Encode the test labels with the label vocabulary learned on the training set,
# so class ids mean the same thing as in y_train. The class count is pinned as
# well; otherwise to_categorical sizes the one-hot axis from the largest id
# present in this split.
y_test_tokenizer = y_train_tokenizer  # name kept for backward compatibility
y_test_seqs = y_test_tokenizer.texts_to_sequences(y_test)
y_test = sequence.pad_sequences(y_test_seqs, padding="post", maxlen=700)
y_test = to_categorical(y_test, num_classes=len(y_train_tokenizer.word_index) + 1)

In [0]:
y_train.shape

(5534, 700, 9)

In [0]:
# Despite the names, these are vocabulary *sizes* (number of distinct symbols
# plus one), used as the Embedding input_dim and the output-layer width.
x_word_index = len(X_train_tokenizer.word_index) + 1
y_word_index = len(y_train_tokenizer.word_index) + 1

## Training the model and saving the best weights to an .hdf5 file

In [0]:
from keras.models import Input, Model
from keras.layers import Embedding
from keras.layers import Conv1D, LSTM, Bidirectional, dot, concatenate
from keras.layers import TimeDistributed, Dense, Activation

In [0]:
# Reset Keras' global graph/session state before building the model.
# The model below is built with the standalone `keras` package, so the reset
# goes through that same package's backend rather than `tf.keras`.
keras.backend.clear_session()

In [0]:
# Two inputs per protein: 700 integer residue tokens (fed to the Embedding
# layer) and a 700 x 22 profile matrix.
input1 = Input(shape=(700,))
input2 = Input(shape=(700, 22))





In [0]:
# Embed the residue tokens (128 channels) and append the 22 profile channels,
# giving 150 channels per position.
input_embedd = Embedding(input_dim = x_word_index, output_dim = 128, input_length = 700)(input1)
input_embedding = concatenate([input_embedd, input2], axis = 2)

# Convolution stage 1: two parallel 64-filter convolutions (kernel sizes 17 and
# 15) over the embedded input; their outputs are appended to that input.
conv1d_1 = Conv1D(64, 17, strides = 1, padding="same")(input_embedding)
conv1d_2 = Conv1D(64, 15, strides = 1, padding="same")(input_embedding)
conv1d_inner_concat_1 = concatenate([input_embedding, conv1d_1], axis = 2)
conv1d_concat_1 = concatenate([conv1d_inner_concat_1, conv1d_2], axis = 2)

# Convolution stage 2: same pattern with kernel sizes 11 and 7, reading the
# stage-1 concatenation.
conv1d_3 = Conv1D(64, 11, strides = 1, padding="same")(conv1d_concat_1)
conv1d_4 = Conv1D(64, 7, strides = 1, padding="same")(conv1d_concat_1)
conv1d_inner_concat_2 = concatenate([conv1d_concat_1, conv1d_3], axis = 2)
conv1d_concat_2 = concatenate([conv1d_inner_concat_2, conv1d_4], axis = 2)

# Convolution stage 3: same pattern with kernel sizes 5 and 2, reading the
# stage-2 concatenation.
conv1d_5 = Conv1D(64, 5, strides = 1, padding="same")(conv1d_concat_2)
conv1d_6 = Conv1D(64, 2, strides = 1, padding="same")(conv1d_concat_2)
conv1d_inner_concat_3 = concatenate([conv1d_concat_2, conv1d_5], axis = 2)
conv1d_concat_3 = concatenate([conv1d_inner_concat_3, conv1d_6], axis = 2)

# Three stacked 64-unit LSTMs. Note that this branch reads `input_embedding`
# directly, not the convolutional features.
lstm_1 = LSTM(64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid', use_bias = True,
              dropout = 0.3, recurrent_dropout = 0.1, implementation = 1)(input_embedding)
lstm_2 = LSTM(64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
             dropout = 0.3, recurrent_dropout = 0.1, implementation = 1)(lstm_1)
lstm_3 = LSTM(64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
             dropout = 0.3, recurrent_dropout = 0.1, implementation = 1)(lstm_2)

# Combine the two branches: contract the LSTM and conv outputs over the
# position axis (axis 1), softmax the result, then contract it with the conv
# features over the channel axis (axis 2) and append that "context" to the
# conv features.
dot_1 = dot([lstm_3, conv1d_concat_3], axes=[1,1])
dot_1 = Activation("softmax")(dot_1)
context = dot([conv1d_concat_3, dot_1], axes=[2,2])
lstm_conv1d_concat = concatenate([context, conv1d_concat_3])

# Two bidirectional 64-unit LSTMs; forward and backward outputs are concatenated.
bilstm_1 = Bidirectional(LSTM(64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid', use_bias = True, 
                              dropout = 0.2, implementation = 1), merge_mode='concat')(lstm_conv1d_concat)
bilstm_2 = Bidirectional(LSTM(64, return_sequences=True, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
                             dropout = 0.2, implementation = 1), merge_mode='concat')(bilstm_1)

# Per-position classifier head: 150 -> 75 -> y_word_index softmax classes.
dense_1 = TimeDistributed(Dense(150, activation='relu'))(bilstm_2)
dense_2 = TimeDistributed(Dense(75, activation='relu'))(dense_1)
output_dense = TimeDistributed(Dense(y_word_index, activation='softmax'))(dense_2)



Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
model = Model([input1, input2], output_dense)

In [0]:
# Per-position categorical cross-entropy, optimised with RMSprop.
# NOTE(review): the targets are one-hot encoded after padding, so padded
# positions presumably count towards both the loss and the reported accuracy —
# confirm before quoting the accuracy as Q8 accuracy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])





In [0]:
from keras.callbacks import ModelCheckpoint 

In [0]:
# Save to Drive whenever the validation loss improves on the best value so far
# (save_best_only=True, monitor='val_loss'); verbose=1 logs each save.
checkpointer = ModelCheckpoint("drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hdf5", monitor='val_loss', 
                            save_best_only = True, verbose = 1)

In [0]:
# Train for 60 epochs with batches of 64. Each sample is fed as a pair:
# the padded token matrix and the matching profile array. The validation pair
# drives the checkpoint callback.
model_fit = model.fit([X_train, train_profile_df], y_train, batch_size = 64, epochs = 60, validation_data = ([X_val, val_profile_df], y_val), 
                      callbacks = [checkpointer], verbose = 1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 5534 samples, validate on 256 samples
Epoch 1/60






Epoch 00001: val_loss improved from inf to 0.48419, saving model to drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hdf5
Epoch 2/60

Epoch 00002: val_loss improved from 0.48419 to 0.48138, saving model to drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hdf5
Epoch 3/60

Epoch 00003: val_loss improved from 0.48138 to 0.40231, saving model to drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hdf5
Epoch 4/60

Epoch 00004: val_loss improved from 0.40231 to 0.33107, saving model to drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hdf5
Epoch 5/60

Epoch 00005: val_loss did not improve from 0.33107
Epoch 6/60

Epoch 00006: val_loss improved from 0.33107 to 0.30532, saving model to drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hd

In [0]:
# Restore the checkpoint written by the ModelCheckpoint callback (same path).
# `weights` only captures load_weights' return value; the weights themselves
# are loaded into `model` in place.
weights = model.load_weights("drive/My Drive/Major Project/weights.project_secondary_structure_tanh.hdf5")

In [0]:
y_pred = model.predict([X_test, test_profile_df])

## Predicting for a single amino-acid sequence (plain string or FASTA sequence)
## Useful for the web application

In [0]:
def amino_preprocess(amino_acid, tokenizer=None):
    """Encode amino-acid sequence(s) into the padded integer matrix the model expects.

    Parameters
    ----------
    amino_acid : str or sequence of str
        One primary-structure string, or a collection of them. A single string
        is treated as one sequence, not as a list of one-character texts.
    tokenizer : keras Tokenizer, optional
        Fitted character-level tokenizer. Defaults to ``X_train_tokenizer`` so
        residue ids match the ones the model was trained on. Fitting a new
        tokenizer on the query would assign ids from the query's own character
        counts.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_sequences, 700)``, zero-padded on the right.
    """
    if tokenizer is None:
        tokenizer = X_train_tokenizer

    # A bare string would otherwise be iterated character by character.
    if isinstance(amino_acid, str):
        amino_acid = [amino_acid]

    amino_acid_seqs = tokenizer.texts_to_sequences(amino_acid)
    amino_acid_seqs = sequence.pad_sequences(amino_acid_seqs, padding="post", maxlen=700)

    return amino_acid_seqs

In [0]:
def secondary_preprocess(secondary_sequence, tokenizer=None):
    """Encode Q8 secondary-structure string(s) into padded one-hot targets.

    Parameters
    ----------
    secondary_sequence : str or sequence of str
        One secondary-structure string, or a collection of them. A single
        string is treated as one sequence.
    tokenizer : keras Tokenizer, optional
        Fitted character-level label tokenizer. Defaults to
        ``y_train_tokenizer`` so class ids match the model's output units.

    Returns
    -------
    numpy.ndarray
        One-hot array of shape ``(n_sequences, 700, n_classes)`` where
        ``n_classes`` is the tokenizer's vocabulary size plus one (id 0 is the
        padding value).
    """
    if tokenizer is None:
        tokenizer = y_train_tokenizer

    # A bare string would otherwise be iterated character by character.
    if isinstance(secondary_sequence, str):
        secondary_sequence = [secondary_sequence]

    secondary_sequence_seqs = tokenizer.texts_to_sequences(secondary_sequence)
    secondary_sequence_seqs = sequence.pad_sequences(secondary_sequence_seqs, padding="post", maxlen=700)
    # Pin the class count so the one-hot width does not depend on which labels
    # happen to occur in this particular input.
    secondary_sequence_seqs = to_categorical(secondary_sequence_seqs,
                                             num_classes=len(tokenizer.word_index) + 1)

    return secondary_sequence_seqs

In [0]:
def string_sequence(predicted_sequence, tokenizer=None):
    """Decode per-position class scores back into Q8 strings.

    The model's output units follow the label tokenizer's ids (1..n by the
    tokenizer's own ordering, with 0 used for padding), not the column order
    of the raw dataset, so decoding uses the tokenizer's mapping.

    Parameters
    ----------
    predicted_sequence : array-like, shape (n_sequences, n_positions, n_classes)
        Class scores or one-hot vectors; the arg-max over the last axis is decoded.
    tokenizer : object with a ``word_index`` dict, optional
        Maps label character -> class id. Defaults to ``y_train_tokenizer``.

    Returns
    -------
    list of str
        One upper-case string per sequence. Positions whose class id is not in
        the tokenizer's vocabulary (in particular the padding id 0) are omitted.
    """
    if tokenizer is None:
        tokenizer = y_train_tokenizer

    # Invert char -> id. Labels are upper-cased on the way out because the
    # tokenizer is presumably storing them lower-cased (Keras default) while
    # the dataset's Q8 alphabet is upper-case.
    index_to_label = {idx: ch.upper() for ch, idx in tokenizer.word_index.items()}

    class_ids = np.asarray(predicted_sequence).argmax(2)

    return [''.join(index_to_label.get(int(i), '') for i in row) for row in class_ids]

In [0]:
amino_acid_sequence, secondary_acid_sequence = test_df[['primary structure', 'secondary structure']][test_df.len<=700].values.T

In [0]:
predict_preprocess = amino_preprocess(amino_acid_sequence)
true_preprocess = secondary_preprocess(secondary_acid_sequence)
print(true_preprocess.shape[2])

9


In [0]:
# Score the model on the test set. Despite its name, `predict` holds the value
# returned by model.evaluate (presumably loss and accuracy, per the compile
# call), not predictions.
predict = model.evaluate([predict_preprocess, test_profile_df], true_preprocess)



In [0]:
predict

NameError: ignored