In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.backend as K
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import *


In [None]:
# Choose the tf.distribute strategy used to build the model further down:
# a TPU strategy when a TPU cluster can be resolved, otherwise whatever
# strategy is currently active in this process.
try:
    # ValueError is the signal this cell treats as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is deprecated in favour of
    # `tf.distribute.TPUStrategy`; use the stable symbol when this TensorFlow
    # build has it and fall back to the experimental one otherwise.
    _tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None)
    if _tpu_strategy_cls is None:
        _tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
    strategy = _tpu_strategy_cls(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Dataset locations, relative to the notebook's working directory.
trainDataPath = 'Data/Secondary_Structure_Train_Dataset.npz'
testDataPath = 'Data/Secondary_Structure_Test_Dataset.npz'

# Column index -> one-letter symbol tables for amino acids and for the
# 8 secondary-structure states.
num_aminoAcids = {0:'A', 1:'C', 2:'E', 3:'D', 4:'G', 5:'F', 6:'I', 7:'H', 8:'K', 9:'M', 10:'L',
            11:'N', 12:'Q', 13:'P', 14:'S', 15:'R', 16:'T', 17:'W', 18:'V', 19:'Y', 20:'X'}
num_ss = {0:'L',1:'B',2:'E',3:'G',4:'I',5:'H',6:'S',7:'T'}

# Symbol -> model index. Real symbols are shifted up by one so index 0 stays
# free for padding; the start/end markers currently share index 0 as well.
aminoAcid_I = {j:i+1 for i,j in num_aminoAcids.items()}
aminoAcid_I['<pad>'] = 0
aminoAcid_I['<SOS>'] = 0
aminoAcid_I['<EOS>'] = 0

# Same scheme for the structure labels, except that the markers get their own
# ids: the dict holds 9 entries when '<SOS>' is added (-> 9) and 10 entries
# when '<EOS>' is added (-> 10).
ss_I = {j:i+1 for i,j in num_ss.items()}
ss_I['<pad>'] = 0
ss_I['<SOS>'] = len(ss_I)
ss_I['<EOS>'] = len(ss_I)

# Motif / anti-motif tables used by get_inputs. The archive is opened in a
# `with` block so the underlying file handle is released as soon as both
# arrays have been read (np.load on an .npz otherwise keeps it open).
with np.load('Data/Secondary_Structure_Motif_Antimotif.npz') as tmp:
    motifs = tmp['motifs']
    antiMotifs = tmp['antimotifs']
len(motifs),len(antiMotifs)

In [None]:
def get_inputs(seq_, pssm_, label, winSize):
    """Build the windowed model inputs and per-position target ids for one example.

    The residues are wrapped into the string
    ``'-'*winApp + 'O' + residues + 'J' + '-'*winApp`` ('O' and 'J' act as
    start/end markers, '-' as padding) and a window of ``winSize`` characters
    is taken at every offset. Window 0 therefore has 'O' at index
    ``winSize // 2``, window k has residue k-1 there.

    Relies on the module-level tables ``num_aminoAcids``, ``aminoAcid_I``,
    ``ss_I``, ``motifs`` and ``antiMotifs``.

    Args:
        seq_: 2-D array, one row per residue; a row summing to 0 ends the
            sequence, otherwise the arg-max column is looked up in
            ``num_aminoAcids``. (assumes one-hot rows — TODO confirm)
        pssm_: 2-D array indexed like ``seq_``; each row is written into a
            22-wide slot, so rows must have 22 values.
        label: 2-D array indexed like ``seq_``; the arg-max column + 1 is the
            class id, an all-zero row is stored as class id 9.
        winSize: number of characters per window.

    Returns:
        Tuple, in this order (note that ``input5`` comes third):
        ``input1`` (702, winSize) amino-acid ids per window,
        ``input2`` (702, winSize, 22) PSSM rows per window,
        ``input5`` (702, winSize) position id repeated across the window,
        ``input3`` (702, winSize, len(motifs)) motif hit counts,
        ``input4`` (702, winSize, len(antiMotifs)) anti-motif hit counts,
        ``input6`` (702, 1) mask with 1 on rows written by this function,
        ``output`` (702,) class id per row (0 where nothing was written).
    """
    # Number of '-' characters added on each side of the sequence string.
    winApp = winSize // 2
    
    # Every array is allocated with a fixed 702 rows; rows that are never
    # written keep their zero fill.
    input1 = np.zeros((702, winSize))
    input2 = np.zeros((702, winSize, 22))
    input3 = np.zeros((702, winSize, len(motifs)))
    input4 = np.zeros((702, winSize, len(antiMotifs)))
    input5 = np.zeros((702, winSize))
    input6 = np.zeros((702, 1))
    output = np.zeros((702))
    
    seq = '-' * winApp
    pssm = []
    # Row 0 is the start marker: class id '<SOS>', mask on, position id 1.
    seq += 'O'
    output[0] = ss_I['<SOS>']
    input6[0,0] = 1
    input5[0,:] = 1
    for j in range(seq_.shape[0]):
        # An all-zero row marks the end of the real sequence.
        if np.sum(seq_[j, :]) == 0:
            break
        seq += num_aminoAcids[np.argmax(seq_[j, :])]
        pssm += [pssm_[j, :]]
        # Residue j is stored on row j+1 because row 0 holds the start marker.
        output[j+1] = np.argmax(label[j, :]) + 1
        input6[j+1,0] = 1
        # Residues without any label get class id 9.
        if np.sum(label[j]) == 0:
            output[j+1] = 9
        input5[j+1,:] = j+2
    # NOTE(review): when the loop above leaves through `break`, j is the index
    # of the first all-zero row, so the last residue sits on row j and row j+1
    # stays untouched (mask 0, position id 0) while the end marker is written
    # to row j+2. The window that has 'J' at its centre is built on row j+1
    # below. Only when the loop runs to completion do both land on row j+2.
    # This looks like an off-by-one — confirm before changing it, since saved
    # weights were presumably produced with this layout.
    # NOTE(review): if seq_ has zero rows, `j` is unbound here (NameError).
    output[j+2] = ss_I['<EOS>']
    input6[j+2,0] = 1
    input5[j+2,:] = j+2
    seq += 'J'
    seq += '-'*winApp
    # Pad the PSSM list with winApp+1 zero rows on each side so it lines up
    # with `seq` (winApp '-' characters plus one marker per side).
    for _ in range(winApp+1):
        pssm = [np.zeros((22,))] + pssm
        pssm += [np.zeros((22,))]
    # One window per offset; window j fills row j of the windowed inputs.
    for j in range(len(seq) - winSize + 1):
        a = seq[j : j + winSize]
        c = pssm[j : j + winSize]
        for t in range(winSize):
            # '-' keeps the zero fill; markers and residues go through aminoAcid_I.
            if a[t] == 'O':
                input1[j, t] = aminoAcid_I['<SOS>']
            elif a[t] == 'J':
                input1[j, t] = aminoAcid_I['<EOS>']
            elif a[t] != '-':
                input1[j, t] = aminoAcid_I[a[t]]
            input2[j, t] = c[t]
            # A motif entry m matches at t when a[t] == m[0] and the character
            # int(m[3]) places further on equals m[1]; every window slot from
            # t to t+k_ inclusive is then incremented in that motif's channel.
            for p,m in enumerate(motifs):
                k_ = int(m[3])
                # Only test pairs whose second character is inside the window.
                if winSize-t > k_:
                    if a[t] == m[0] and a[t+k_] == m[1]:
                        input3[j, t:t+k_+1, p] += 1.
            # Same matching rule for the anti-motif table.
            for p,m in enumerate(antiMotifs):
                k_ = int(m[3])
                if winSize-t > k_:
                    if a[t] == m[0] and a[t+k_] == m[1]:
                        input4[j, t:t+k_+1, p] += 1.
    return input1, input2, input5, input3, input4, input6, output

In [None]:
class dataGenerator(tfk.utils.Sequence):
    """Keras ``Sequence`` that serves model-ready batches from an .npz dataset.

    The file must provide the arrays ``sequences``, ``pssms`` and
    ``secondaryStrucs``. Each example is expanded with ``get_inputs`` when a
    batch is requested.

    Args:
        file_path: path of the .npz file to read.
        batch_size: number of examples per batch. ``len(self)`` uses floor
            division, so a trailing partial batch is never served.
        win_size: window width forwarded to ``get_inputs``.
        seq_len: number of rows per example in the batch arrays.
            ``get_inputs`` always returns 702 rows, so other values fail on
            assignment.
        shuffle: reshuffle the example order at construction and after each epoch.
        valData: selects the slice of the file that is served.
            ``1`` -> examples from index ``VAL_START_INDEX`` to the end,
            ``0`` -> all but the last ``TRAIN_EXCLUDED_TAIL`` examples,
            any other value -> every example.
    """

    # First example index served when valData == 1.
    VAL_START_INDEX = 12000
    # Number of trailing examples left out when valData == 0.
    TRAIN_EXCLUDED_TAIL = 448

    def __init__(self, file_path, batch_size=8, win_size=11, seq_len=702, shuffle=False, valData=0):
        # Let the Keras base class set up its own state (newer Keras versions
        # warn when a Sequence subclass skips this call).
        super().__init__()
        self.__getData__(file_path)
        self.__seq_len__ = seq_len
        self.__win_size__ = win_size
        self.__batch_size__ = batch_size
        self.__shuffle__ = shuffle
        self.__valData__ = valData
        self.__n_examples__ = self.__sequences.shape[0]
        if self.__valData__ == 1:
            self.__n_examples__ -= self.VAL_START_INDEX
        elif self.__valData__ == 0:
            self.__n_examples__ -= self.TRAIN_EXCLUDED_TAIL
        # Seed NumPy's global generator BEFORE the first shuffle so the order
        # of the very first epoch is reproducible too.
        np.random.seed(42)
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch."""
        return self.__n_examples__ // self.__batch_size__

    def on_epoch_end(self):
        """Rebuild the example index list and reshuffle it when requested."""
        self.__indexes__ = np.arange(self.__n_examples__)
        if self.__valData__ == 1:
            # Validation examples start at VAL_START_INDEX in the file.
            self.__indexes__ += self.VAL_START_INDEX
        if self.__shuffle__:
            np.random.shuffle(self.__indexes__)

    def __getData__(self, file_path):
        """Read the three dataset arrays from ``file_path``."""
        data = np.load(file_path)
        self.__sequences = data['sequences']
        self.__pssms = data['pssms']
        self.__secondary_structure = data['secondaryStrucs']

    def __getitem__(self, index):
        """Return batch ``index`` as ``([X1, X2, X3, X4, X5, X6], y)``."""
        start = index * self.__batch_size__
        indexes = self.__indexes__[start:start + self.__batch_size__]
        X1, X2, X3, X4, X5, X6, y = self.__data_generation(indexes)
        return [X1, X2, X3, X4, X5, X6], y

    def __data_generation(self, indexes):
        """Expand the examples in ``indexes`` into stacked input/target arrays.

        Returns the six input arrays in the order produced by ``get_inputs``
        (ids, PSSM, position ids, motifs, anti-motifs, mask) followed by the
        one-hot targets restricted to channels 1..8.
        """
        batch = self.__batch_size__
        rows = self.__seq_len__
        win = self.__win_size__
        x1 = np.empty((batch, rows, win))
        x2 = np.empty((batch, rows, win, 22))
        x3 = np.empty((batch, rows, win))
        x4 = np.empty((batch, rows, win, len(motifs)))
        x5 = np.empty((batch, rows, win, len(antiMotifs)))
        x6 = np.empty((batch, rows, 1))
        y = np.empty((batch, rows))
        for k, ind in enumerate(indexes):
            i1,i2,i3,i4,i5,i6,i7 = get_inputs(self.__sequences[ind], self.__pssms[ind],
                                              self.__secondary_structure[ind], self.__win_size__)
            x1[k,] = i1
            x2[k,] = i2
            x3[k,] = i3
            x4[k,] = i4
            x5[k,] = i5
            x6[k,] = i6
            y[k,] = i7
        y = tfk.utils.to_categorical(y,num_classes=12)
        # Keep only channels 1..8: class id 0 and ids >= 9 become all-zero
        # target rows.
        return x1, x2, x3, x4, x5, x6, y[:,:,1:9]


In [None]:
def shape_list(x):
    """Return the static shape of ``x`` as a list with the first entry set to -1."""
    static_shape = K.int_shape(x)
    dims = list(static_shape)
    # The leading dimension is replaced by -1 regardless of its static value.
    dims[0] = -1
    return dims

def selfAttention(V, mask):
    """Apply a single self-attention block to the 3-D Keras tensor ``V``.

    Args:
        V: tensor whose axis 2 is the feature axis (its size is used as the
            projection width).
        mask: tensor multiplied element-wise into the attention weights.
            The caller in this file passes a (702, 1) input, which would
            broadcast over the last axis — TODO confirm intended axis.

    Returns:
        ``(SA, SoftAtten)``: the attended tensor and the masked attention weights.
    """
    units = int(V.shape[2])
    # Bias-free linear projections applied independently at every position.
    Q = tfk.layers.TimeDistributed( tfk.layers.Dense(units, activation=None, use_bias=False))(V)
    # NOTE(review): the key projection takes Q as input, not V — confirm this
    # is intended; changing it would alter what previously saved weights mean.
    K_ = tfk.layers.TimeDistributed( tfk.layers.Dense(units, activation=None, use_bias=False))(Q)
    # Pairwise dot products between every Q row and every K_ row.
    SoftAtten = tfk.layers.Dot(axes=-1, normalize=False)([Q, K_])
    # Scale the scores by sqrt of V's last static dimension.
    SoftAtten = tfk.layers.Lambda(lambda inp: inp[0]/K.sqrt(K.cast(shape_list(inp[1])[-1], K.floatx())))([SoftAtten, V])
    SoftAtten = tfk.layers.Softmax(axis=-1)(SoftAtten)
    # The mask is applied after the softmax, so the remaining weights are not
    # renormalised.
    SoftAtten = tfk.layers.Multiply()([SoftAtten, mask])
    
    # Swap V's last two axes so the Dot below contracts over positions.
    V = tfk.layers.Permute([2,1])(V)
    SA = tfk.layers.Dot(axes=-1, normalize=False)([SoftAtten, V])
    return SA,SoftAtten

def _getPosEncodingMat(length, dim):
    posEnc = np.array([[pos/np.power(10000, 2*(j//2)/dim) for j in range(dim)]
                        if pos!=0 else np.zeros(dim) for pos in range(length)], dtype=np.float32)
    posEnc[1:, 0::2] = np.sin(posEnc[1:, 0::2])
    posEnc[1:, 1::2] = np.cos(posEnc[1:, 1::2])
    return posEnc

def to_q3(x):
    """Collapse 8-state class indices into three groups.

    Indices 0, 6 and 7 map to 1, indices 1 and 2 map to 2, and every other
    value maps to 3. (Presumably the indices follow ``num_ss``, i.e.
    L/S/T -> 1, B/E -> 2, G/I/H -> 3 — verify against the caller.)

    Returns a list with one group id per element of ``x``.
    """
    group_one = [0, 6, 7]
    group_two = [1, 2]
    return [
        1 if state in group_one else (2 if state in group_two else 3)
        for state in x
    ]

In [None]:
# Build and compile the network inside the distribution-strategy scope.
# NOTE(review): layer creation order is kept exactly as is because the weight
# file loaded later in the notebook was presumably saved from this very graph.
with strategy.scope():
    # The six inputs are passed to the Model in the same order in which
    # dataGenerator.__getitem__ returns them ([X1, ..., X6]).
    input1 = tfk.layers.Input(shape=(702,11 ), name='sequence_input')
    input2 = tfk.layers.Input(shape=(702,11,22, ), name='pssm_input')
    input3 = tfk.layers.Input(shape=(702,11, ), name='pids_input')
    # 527 / 710 are hard-coded channel counts; presumably they equal
    # len(motifs) / len(antiMotifs) — verify against the loaded .npz.
    input4 = tfk.layers.Input(shape=(702,11,527, ), name='motif_input')
    input5 = tfk.layers.Input(shape=(702,11,710, ), name='antimotif_input')
    input6 = tfk.layers.Input(shape=(702,1, ), name='mask_input')
    
    # Frozen embedding initialised from the sinusoidal table, looked up with
    # the position ids.
    pidsEmbd = tfk.layers.Embedding(input_dim=702, output_dim=100, trainable=False,
                                  weights=[_getPosEncodingMat(702, 100)], name='pids_embds')(input3)
    # Trainable embedding for the amino-acid ids; the two are summed.
    emb = tfk.layers.Embedding(input_dim=22, output_dim=100, name='embds')(input1)
    seq_embd = tfk.layers.Add(name='seq_embdAdd')([emb, pidsEmbd])
    
    # Branch 1: convolve each window of embedded residues, then max-pool over
    # the window so every sequence position is reduced to one 100-d vector.
    x1 = tfk.layers.TimeDistributed(tfk.layers.Conv1D(100, 11, strides=1, padding='same', activation='relu'),
                                    name='conv1')(seq_embd)
    x1 = tfk.layers.TimeDistributed(tfk.layers.GlobalMaxPooling1D())(x1)
    # Branch 2: same conv + pooling on the concatenated PSSM, motif and
    # anti-motif channels.
    x2 = tfk.layers.concatenate([input2, input4, input5], axis=-1, name='con1')
    x2 = tfk.layers.TimeDistributed(tfk.layers.Conv1D(100, 11, strides=1, padding='same', activation='relu'),
                                    name='conv2')(x2)
    x2 = tfk.layers.TimeDistributed(tfk.layers.GlobalMaxPooling1D())(x2)
    # Only branch 1 goes through the first LSTM.
    x1 = tfk.layers.LSTM(units=100, return_sequences=True, name='lstm1')(x1)
    
    # `model` first names the intermediate tensor and is rebound to the
    # tfk Model object further down.
    model = tfk.layers.concatenate([x1, x2], axis=-1, name='con2')
    # The attention weights returned by selfAttention are discarded here.
    model,_ = selfAttention(model, input6)
    model = tfk.layers.Bidirectional( tfk.layers.LSTM(units=200, return_sequences=True), name='lstm2')(model)
    model = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output1')(model)
    # Per-position softmax over 8 classes, matching the 8 target channels the
    # generator keeps.
    output = tfk.layers.TimeDistributed( tfk.layers.Dense(8, activation='softmax') ,name='output')(model)
    
    model = tfk.models.Model([input1, input2, input3, input4, input5, input6], output)
    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

    model.summary()

In [None]:
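# Training generator. Disabled because this notebook evaluates weights loaded
# from disk; uncomment this line (and the fit cell below) to retrain.
# train_gen = dataGenerator(trainDataPath, batch_size=4, seq_len=702)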

In [None]:
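# Training run. Disabled; uncomment (together with the train_gen cell above)
# to retrain and overwrite the weight file that the next cell loads.
# model.fit(train_gen, verbose=1, epochs=10)
# model.save_weights('Weights/WB-ALSTM.h5')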

In [None]:
# Load previously saved weights into the model built above (the commented-out
# training cell writes to this same path).
model.load_weights('Weights/WB-ALSTM.h5')

In [None]:
# Validation batches: valData=1 makes the generator serve the examples of the
# training file from index 12000 onwards.
val_gen = dataGenerator(trainDataPath, batch_size=8, valData=1)

In [None]:
# Run the model over every validation batch and keep the one-hot targets next
# to the model outputs.
true_batches = []
pred_batches = []
for i in tqdm(range(len(val_gen))):
    inputs, outputs = val_gen[i]
    true_batches.append(outputs)
    # verbose=0: tqdm already reports progress, so suppress the per-batch
    # Keras progress bar.
    pred_batches.append(model.predict(inputs, verbose=0))
# Concatenate once at the end; appending inside the loop would re-copy the
# accumulated arrays on every batch. An empty generator leaves both as None.
trues = np.concatenate(true_batches, axis=0) if true_batches else None
preds = np.concatenate(pred_batches, axis=0) if pred_batches else None
np.savez_compressed('wb-alstm-vals', val_tr=trues, val_pr=preds)

In [None]:
# Score only the positions whose one-hot target row sums to exactly 1; rows
# that are all zero are left out.
m = np.sum(trues, axis=-1)
is_labelled = (m == 1)
y_t = np.argmax(trues[is_labelled], axis=-1)
y_p = np.argmax(preds[is_labelled], axis=-1)
print(classification_report(y_t, y_p))
# Printed in this order: 3-group accuracy, 8-class accuracy, then weighted
# precision, recall and F1 on the 8 classes.
q3_accuracy = accuracy_score(to_q3(y_t), to_q3(y_p))
q8_accuracy = accuracy_score(y_t, y_p)
weighted_precision = precision_score(y_t, y_p, average='weighted')
weighted_recall = recall_score(y_t, y_p, average='weighted')
weighted_f1 = f1_score(y_t, y_p, average='weighted')
print(q3_accuracy, q8_accuracy, weighted_precision, weighted_recall, weighted_f1)

In [None]:
# Test batches: valData=-1 matches neither the 0 nor the 1 branch of the
# generator, so no examples are sliced off the test file. With batch_size=4 a
# trailing partial batch (if any) is still dropped by the generator.
test_gen = dataGenerator(testDataPath, batch_size=4, valData=-1)

In [None]:
# Run the model over every test batch and keep the one-hot targets next to
# the model outputs.
true_batches = []
pred_batches = []
for i in tqdm(range(len(test_gen))):
    inputs, outputs = test_gen[i]
    true_batches.append(outputs)
    # verbose=0: tqdm already reports progress, so suppress the per-batch
    # Keras progress bar.
    pred_batches.append(model.predict(inputs, verbose=0))
# Concatenate once at the end; appending inside the loop would re-copy the
# accumulated arrays on every batch. An empty generator leaves both as None.
trues = np.concatenate(true_batches, axis=0) if true_batches else None
preds = np.concatenate(pred_batches, axis=0) if pred_batches else None
np.savez_compressed('wb-alstm-test', te_tr=trues, te_pr=preds)

In [None]:
# Score only the positions whose one-hot target row sums to exactly 1; rows
# that are all zero are left out.
m = np.sum(trues, axis=-1)
is_labelled = (m == 1)
y_t = np.argmax(trues[is_labelled], axis=-1)
y_p = np.argmax(preds[is_labelled], axis=-1)
print(classification_report(y_t, y_p))
# Printed in this order: 3-group accuracy, 8-class accuracy, then weighted
# precision, recall and F1 on the 8 classes.
q3_accuracy = accuracy_score(to_q3(y_t), to_q3(y_p))
q8_accuracy = accuracy_score(y_t, y_p)
weighted_precision = precision_score(y_t, y_p, average='weighted')
weighted_recall = recall_score(y_t, y_p, average='weighted')
weighted_f1 = f1_score(y_t, y_p, average='weighted')
print(q3_accuracy, q8_accuracy, weighted_precision, weighted_recall, weighted_f1)