In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.backend as K
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import *


In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise whatever strategy TensorFlow currently has active.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Treated here as "no TPU available".
    tpu = None
if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is deprecated; use the stable
    # class when this TensorFlow build exposes it and fall back otherwise.
    tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None)
    if tpu_strategy_cls is None:
        tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Archives read by `dataGenerator` (train/validation share the first one).
trainDataPath = 'Data/Disordered_Regions_Train_Dataset.npz'
testDataPath = 'Data/Disordered_Regions_Test_Dataset.npz'

# Residue vocabulary: symbol -> integer id. Id 0 is not assigned to any
# symbol; `getRowData` leaves window padding ('-') at 0.
residues = {'A':1, 'C':2, 'E':3, 'D':4, 'G':5, 'F':6, 'I':7, 'H':8, 'K':9, 
            'M':10, 'L':11, 'N':12, 'Q':13, 'P':14, 'S':15, 'R':16, 'T':17, 
            'W':18, 'V':19, 'Y':20, 'X':21, '<SOS>':22, '<EOS>':23}
# Inverse lookup: integer id -> symbol.
residues_i = {token_id: symbol for symbol, token_id in residues.items()}

# Motif / antimotif tables used by `getRowData`. The context manager closes
# the underlying .npz file handle once both arrays have been read into memory.
with np.load('Data/Secondary_Structure_Motif_Antimotif.npz') as tmp:
    motifs = tmp['motifs']
    antimotifs = tmp['antimotifs']
len(motifs),len(antimotifs)

In [None]:
def _markMotifHits(window, specs, hits):
    """Add 1 to `hits[t:t+gap+1, p]` for each occurrence of spec `p` in `window`.

    A spec `(first, second, gap)` occurs at offset `t` when
    `window[t] == first` and `window[t + gap] == second`, with `t + gap` still
    inside the window. `hits` is updated in place.
    """
    width = len(window)
    for t in range(width):
        for p, (first, second, gap) in enumerate(specs):
            if width - t > gap and window[t] == first and window[t + gap] == second:
                hits[t:t + gap + 1, p] += 1.


def getRowData(seq_, pssm, winSize):
    """Build the per-position sliding-window features for one sequence.

    Parameters
    ----------
    seq_ : array
        Token scores per position; `argmax` over the last axis yields the
        token ids (presumably a one-hot `(seq_len, vocab)` array — confirm
        against the dataset). Position 0 is treated as the start token without
        being inspected; scanning stops at the first `<EOS>` id at index >= 1.
    pssm : array
        Per-position profile rows. Assumed to be `(seq_len, 20)`, matching the
        20-wide feature buffer — TODO confirm.
    winSize : int
        Window width; `winSize // 2` padding symbols are added on each side.

    Returns
    -------
    tuple, in this order (note that the position ids come third):
        input1 : (seq_len, winSize)      token ids in each window, 0 = padding
        input2 : (seq_len, winSize, 20)  PSSM rows in each window
        input5 : (seq_len, winSize)      1-based position id of the window's
                                         row, 0 for rows after `<EOS>`
        input3 : (seq_len, winSize, len(motifs))      motif hit counts
        input4 : (seq_len, winSize, len(antimotifs))  antimotif hit counts
        input6 : (seq_len, 1)            1 for rows up to and including `<EOS>`

    Notes
    -----
    Reads the module-level `residues`, `residues_i`, `motifs` and
    `antimotifs`. Each motif row is used as `(m[0], m[1], int(m[3]))`:
    first symbol, second symbol and the offset between them.
    NOTE(review): if no `<EOS>` id is present, the window loop (for odd
    `winSize`) runs one row past the output buffers and raises IndexError.
    """
    seq_ = np.argmax(seq_, axis=-1)
    winApp = winSize // 2
    max_len = seq_.shape[0]

    input1 = np.zeros((max_len, winSize))
    input2 = np.zeros((max_len, winSize, 20))
    input3 = np.zeros((max_len, winSize, len(motifs)))
    input4 = np.zeros((max_len, winSize, len(antimotifs)))
    input5 = np.zeros((max_len, winSize))
    input6 = np.zeros((max_len, 1))

    # Spell the sequence out as text: 'O' stands for <SOS>, 'J' for <EOS> and
    # '-' for window padding. Mask and position ids are filled along the way.
    eos_id = residues['<EOS>']
    symbols = ['O']
    input6[0, 0] = 1
    input5[0, :] = 1
    for j in range(1, max_len):
        if seq_[j] == eos_id:
            break
        symbols.append(residues_i[seq_[j]])
        input6[j, 0] = 1
        input5[j, :] = j + 1
    # `j` is now the <EOS> row (or the last row); it counts as a valid position.
    input6[j, 0] = 1
    input5[j, :] = j + 1
    symbols.append('J')
    padding = '-' * winApp
    seq = padding + ''.join(symbols) + padding

    # Zero rows on both ends so that window j of the PSSM lines up with
    # window j of `seq`. One np.pad call replaces repeated np.append copies.
    pssm = np.pad(pssm, ((winApp, winApp), (0, 0)), mode='constant')

    # Parse the motif tables once instead of once per window and offset.
    motif_specs = [(m[0], m[1], int(m[3])) for m in motifs]
    antimotif_specs = [(m[0], m[1], int(m[3])) for m in antimotifs]

    for j in range(len(seq) - winSize + 1):
        a = seq[j : j + winSize]
        c = pssm[j : j + winSize]
        for t in range(winSize):
            symbol = a[t]
            if symbol == 'O':
                input1[j, t] = residues['<SOS>']
            elif symbol == 'J':
                input1[j, t] = residues['<EOS>']
            elif symbol != '-':
                input1[j, t] = residues[symbol]
            input2[j, t] = c[t]
        # input3[j] / input4[j] are views, so the helper writes in place.
        _markMotifHits(a, motif_specs, input3[j])
        _markMotifHits(a, antimotif_specs, input4[j])
    return input1, input2, input5, input3, input4, input6

In [None]:
class dataGenerator(tfk.utils.Sequence):
    """Keras `Sequence` that serves windowed features for a subset of an archive.

    The archive at `file_path` must provide the arrays `sequences`, `pssms`,
    `regions` (labels) and `seq_ids`. The subset is chosen by `valData`:

    * `0`  -> ids listed in `Data/DM3000_id.txt`
    * `1`  -> ids listed in `Data/DM1229_id.txt`
    * else -> ids listed in `Data/SL329_id.txt`

    Each batch is `([X1, X2, X3, X4, X5, X6], y)` where the X arrays are the
    outputs of `getRowData` in its return order (tokens, PSSM, position ids,
    motif hits, antimotif hits, mask) and `y` holds the label rows.

    Parameters
    ----------
    file_path : str
        Path of the `.npz` archive.
    batch_size : int
        Number of sequences per batch.
    win_size : int
        Window width forwarded to `getRowData`.
    seq_len : int
        Padded sequence length of the stored arrays.
    valData : int
        Subset selector, see above.

    Notes
    -----
    `__len__` floors, so a trailing partial batch is not served during
    iteration. Construction reseeds NumPy's global RNG with 42.
    """

    def __init__(self, file_path, batch_size=4, win_size=11, seq_len=1864, valData=0):
        # Lets the Keras base class set up its own state where it has any.
        super().__init__()
        self.__getData__(file_path)
        self.__seq_len__ = seq_len
        self.__win_size__ = win_size
        self.__batch_size__ = batch_size
        self.__valData__ = valData
        if self.__valData__ == 0:
            wanted_ids = self._read_ids('Data/DM3000_id.txt')
        elif self.__valData__ == 1:
            wanted_ids = self._read_ids('Data/DM1229_id.txt')
        else:
            # Every line of the test list uses the 'xxx|<id> ...' layout.
            wanted_ids = self._read_ids('Data/SL329_id.txt', pipe_only=True)
        # Archive row numbers whose id is in the chosen list (set lookup).
        self.__indexes__ = [k for k, i in enumerate(self.__a_id__) if i in wanted_ids]
        self.__n_examples__ = len(self.__indexes__)
        np.random.seed(42)

    @staticmethod
    def _read_ids(path, pipe_only=False):
        """Return the set of sequence ids listed in the text file `path`.

        Lines starting with 'Dis' (or every line when `pipe_only` is true)
        carry the id as the first whitespace-separated token after the first
        '|'; any other line carries it as its first token. Blank lines are
        skipped.
        """
        ids = set()
        with open(path) as f:
            for line in f:
                if not line.strip():
                    continue
                if pipe_only or line[:3] == 'Dis':
                    ids.add(line.split('|')[1].split()[0])
                else:
                    ids.add(line.split()[0])
        return ids

    def __len__(self):
        """Number of full batches available."""
        return self.__n_examples__ // self.__batch_size__

    def __getData__(self, file_path):
        """Load sequences, PSSMs, labels and ids from the archive."""
        data = np.load(file_path)
        self.__sequences__ = data['sequences']
        self.__pssms__ = data['pssms']
        self.__labels__ = data['regions']
        self.__a_id__ = data['seq_ids']

    def __getitem__(self, index):
        """Return batch number `index` as `([X1..X6], y)`."""
        indexes = self.__indexes__[index*self.__batch_size__:(index+1)*self.__batch_size__]
        X1, X2, X3, X4, X5, X6, y = self.__data_generation(indexes)
        return [X1, X2, X3, X4, X5, X6], y

    def __data_generation(self, indexes):
        """Assemble the feature and label arrays for the given archive rows."""
        # Size buffers by the rows actually requested, so a short slice never
        # exposes uninitialised memory from np.empty.
        n_rows = len(indexes)
        seq_len = self.__seq_len__
        win = self.__win_size__
        x1 = np.empty((n_rows, seq_len, win))
        x2 = np.empty((n_rows, seq_len, win, 20))
        x3 = np.empty((n_rows, seq_len, win))
        x4 = np.empty((n_rows, seq_len, win, len(motifs)))
        x5 = np.empty((n_rows, seq_len, win, len(antimotifs)))
        x6 = np.empty((n_rows, seq_len, 1))
        y = np.empty((n_rows, seq_len, 2))
        for k, ind in enumerate(indexes):
            i1,i2,i3,i4,i5,i6 = getRowData(self.__sequences__[ind],self.__pssms__[ind],self.__win_size__)
            x1[k,] = i1
            x2[k,] = i2
            x3[k,] = i3
            x4[k,] = i4
            x5[k,] = i5
            x6[k,] = i6
            y[k,] = self.__labels__[ind]
        return x1, x2, x3, x4, x5, x6, y


In [None]:
def shape_list(x):
    """Return the static Keras shape of `x` as a list with the first entry set to -1."""
    dims = [*K.int_shape(x)]
    dims[0] = -1
    return dims

def selfAttention(V, mask):
    """Apply a scaled dot-product self-attention block to `V`.

    `V` is indexed as a rank-3 tensor (batch, steps, features). Returns the
    pair `(SA, SoftAtten)`: the attended sequence and the masked attention
    weights.

    `mask` is multiplied element-wise into the attention weights; the caller
    in this notebook passes a (batch, steps, 1) tensor — presumably broadcast
    across the last axis, TODO confirm.
    """
    units = int(V.shape[2])
    # Bias-free linear projections with the same width as V.
    Q = tfk.layers.TimeDistributed( tfk.layers.Dense(units, activation=None, use_bias=False))(V)
    # NOTE(review): the key projection is computed from Q, not from V. Confirm
    # this is intended; changing it would no longer match the saved weights' behaviour.
    K_ = tfk.layers.TimeDistributed( tfk.layers.Dense(units, activation=None, use_bias=False))(Q)
    # Pairwise scores between all steps: contracts the feature axis.
    SoftAtten = tfk.layers.Dot(axes=-1, normalize=False)([Q, K_])
    # Scale by sqrt(feature width of V); V is passed only to read its static shape.
    SoftAtten = tfk.layers.Lambda(lambda inp: inp[0]/K.sqrt(K.cast(shape_list(inp[1])[-1], K.floatx())))([SoftAtten, V])
    SoftAtten = tfk.layers.Softmax(axis=-1)(SoftAtten)
    # The mask is applied after the softmax, so the weights are not renormalised.
    SoftAtten = tfk.layers.Multiply()([SoftAtten, mask])
    # Swap steps/features so the Dot below contracts over the steps axis.
    V = tfk.layers.Permute([2,1])(V)
    SA = tfk.layers.Dot(axes=-1, normalize=False)([SoftAtten, V])
    return SA,SoftAtten

def _getPosEncodingMat(length, dim):
    posEnc = np.array([[pos/np.power(10000, 2*(j//2)/dim) for j in range(dim)]
                        if pos!=0 else np.zeros(dim) for pos in range(length)], dtype=np.float32)
    posEnc[1:, 0::2] = np.sin(posEnc[1:, 0::2])
    posEnc[1:, 1::2] = np.cos(posEnc[1:, 1::2])
    return posEnc


In [None]:
# Build and compile the network inside the distribution-strategy scope.
# The six inputs follow the order of the arrays returned by `dataGenerator`:
# window tokens, window PSSM, position ids, motif hits, antimotif hits, mask.
# 1864 is the padded sequence length and 11 the window width used throughout.
with strategy.scope():
    input1 = tfk.layers.Input(shape=(1864,11 ), name='sequence_input')
    input2 = tfk.layers.Input(shape=(1864,11,20, ), name='pssm_input')
    input3 = tfk.layers.Input(shape=(1864,11, ), name='pids_input')
    input4 = tfk.layers.Input(shape=(1864,11,len(motifs), ), name='motif_input')
    input5 = tfk.layers.Input(shape=(1864,11,len(antimotifs), ), name='antimotif_input')
    input6 = tfk.layers.Input(shape=(1864,1, ), name='mask_input')
    
    # Frozen position embedding initialised from the sinusoidal table.
    # NOTE(review): getRowData emits 1-based position ids (j+1), so a sequence
    # that fills all 1864 rows would look up index 1864, outside input_dim=1864
    # — confirm no sequence reaches that length.
    pidsEmbd = tfk.layers.Embedding(input_dim=1864, output_dim=100, trainable=False,
                                  weights=[_getPosEncodingMat(1864, 100)], name='pids_embds')(input3)
    # Trainable token embedding; 24 ids = padding 0 plus the 23 vocabulary ids.
    emb = tfk.layers.Embedding(input_dim=24, output_dim=100, name='embds')(input1)
    seq_embd = tfk.layers.Add(name='seq_embdAdd')([emb, pidsEmbd])
    
    # Branch 1: convolve each residue's window of embeddings, then max-pool
    # over the window axis to get one 100-d vector per residue.
    x1 = tfk.layers.TimeDistributed(tfk.layers.Conv1D(100, 11, strides=1, padding='same', activation='relu'), name='conv1')(seq_embd)
    x1 = tfk.layers.TimeDistributed(tfk.layers.GlobalMaxPooling1D())(x1)
    # Branch 2: same conv + pool over the concatenated PSSM / motif / antimotif features.
    x2 = tfk.layers.concatenate([input2, input4, input5], axis=-1, name='con1')
    x2 = tfk.layers.TimeDistributed(tfk.layers.Conv1D(100, 11, strides=1, padding='same', activation='relu'), name='conv2')(x2)
    x2 = tfk.layers.TimeDistributed(tfk.layers.GlobalMaxPooling1D())(x2)

    # Only the embedding branch goes through the first LSTM before the merge.
    x1 = tfk.layers.LSTM(units=100, return_sequences=True, name='lstm1')(x1)
    # `model` first names the running tensor and is rebound to the Model below.
    model = tfk.layers.concatenate([x1, x2], axis=-1, name='con2')
    
    # Self-attention over the sequence; the attention weights are discarded here.
    model,_ = selfAttention(model, input6)
    
    model = tfk.layers.Bidirectional( tfk.layers.LSTM(units=200, return_sequences=True), name='lstm2')(model)

    # Per-residue classifier head ending in a 2-way softmax.
    model = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output1')(model)
    model = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output2')(model)
    output = tfk.layers.TimeDistributed( tfk.layers.Dense(2, activation='softmax') ,name='output')(model)

    model = tfk.models.Model([input1, input2, input3, input4, input5, input6], output)
    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

    model.summary()

In [None]:
# train_gen = dataGenerator(trainDataPath, batch_size=4)

In [None]:
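# Training is disabled in this notebook: the next cell loads previously saved
# weights from 'Weights/WB-ALSTM.h5' instead. To retrain, uncomment the two
# lines below together with the `train_gen` cell above.
# model.fit(train_gen, verbose=1, epochs=50)
# model.save_weights('Weights/WB-ALSTM.h5')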

In [None]:
# Restore the parameters saved in 'Weights/WB-ALSTM.h5' into the model built above
# (the architecture must match the one the file was saved from).
model.load_weights('Weights/WB-ALSTM.h5')

In [None]:
# Validation split: valData=1 keeps the entries of the training archive whose
# ids are listed in Data/DM1229_id.txt; one sequence per batch.
val_gen = dataGenerator(trainDataPath, batch_size=1, valData=1)

In [None]:
# Predict every validation batch and keep labels and predictions side by side.
# Batches are collected in lists and joined once; calling np.append inside the
# loop would re-copy everything gathered so far on each iteration.
true_batches = []
pred_batches = []
for i in tqdm(range(len(val_gen)), total=len(val_gen)):
    inputs, outputs = val_gen[i]
    preds_ = model.predict(inputs)
    true_batches.append(outputs)
    pred_batches.append(preds_)
# Stay None when the generator yielded no batches.
trues = np.concatenate(true_batches, axis=0) if true_batches else None
preds = np.concatenate(pred_batches, axis=0) if pred_batches else None
np.savez_compressed('wb-alstm-vals', val_tr=trues, val_pr=preds)

In [None]:
# Keep only positions whose label row sums to 1 (presumably padded positions
# carry all-zero label rows — confirm against the dataset).
m = np.sum(trues, axis=-1)
y_t = np.argmax(trues[m==1],axis=-1).flatten()
y_p = np.argmax(preds[m==1],axis=-1).flatten()
print(classification_report(y_t,y_p))
# labels=[0, 1] forces a 2x2 matrix even if one class is absent.
t = confusion_matrix(y_t, y_p, labels=[0, 1])
print(t,accuracy_score(y_t,y_p))
# scikit-learn layout: rows are true labels, columns are predictions, so with
# class 1 as the positive class the matrix reads [[tn, fp], [fn, tp]].
tn, fp, fn, tp = t.astype(np.float64).ravel()
sn = tp/(tp+fn)  # sensitivity: recall of class 1
sp = tn/(tn+fp)  # specificity: recall of class 0
bacc = (sp+sn)/2
mcc = ((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
sp,sn,bacc,mcc

In [None]:
# Test split: any valData other than 0 or 1 keeps the entries of the test
# archive whose ids are listed in Data/SL329_id.txt; one sequence per batch.
test_gen = dataGenerator(testDataPath, batch_size=1, valData=-1)

In [None]:
# Predict every test batch and keep labels and predictions side by side.
# Batches are collected in lists and joined once; calling np.append inside the
# loop would re-copy everything gathered so far on each iteration.
true_batches = []
pred_batches = []
for i in tqdm(range(len(test_gen)), total=len(test_gen)):
    inputs, outputs = test_gen[i]
    preds_ = model.predict(inputs)
    true_batches.append(outputs)
    pred_batches.append(preds_)
# Stay None when the generator yielded no batches.
trues = np.concatenate(true_batches, axis=0) if true_batches else None
preds = np.concatenate(pred_batches, axis=0) if pred_batches else None
np.savez_compressed('wb-alstm-test', te_tr=trues, te_pr=preds)

In [None]:
# Keep only positions whose label row sums to 1 (presumably padded positions
# carry all-zero label rows — confirm against the dataset).
m = np.sum(trues, axis=-1)
y_t = np.argmax(trues[m==1],axis=-1).flatten()
y_p = np.argmax(preds[m==1],axis=-1).flatten()
print(classification_report(y_t,y_p))
# labels=[0, 1] forces a 2x2 matrix even if one class is absent.
t = confusion_matrix(y_t, y_p, labels=[0, 1])
print(t,accuracy_score(y_t,y_p))
# scikit-learn layout: rows are true labels, columns are predictions, so with
# class 1 as the positive class the matrix reads [[tn, fp], [fn, tp]].
tn, fp, fn, tp = t.astype(np.float64).ravel()
sn = tp/(tp+fn)  # sensitivity: recall of class 1
sp = tn/(tn+fp)  # specificity: recall of class 0
bacc = (sp+sn)/2
mcc = ((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
sp,sn,bacc,mcc,f1_score(y_t,y_p,average='weighted')