In [None]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "1"

import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.backend as K
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import *


In [None]:
def _read_ids(path):
    """Return the sequence identifiers listed in ``path``, one per non-blank line.

    A line starting with 'Dis' yields the first whitespace-delimited token
    that follows its first '|'; any other line yields its first
    whitespace-delimited token.
    """
    ids = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank / trailing lines instead of raising IndexError.
                continue
            if line[:3] == 'Dis':
                ids.append(line.split('|')[1].split()[0])
            else:
                ids.append(line.split()[0])
    return ids


tmp = np.load('Data/Disordered_Regions_Train_Dataset.npz')
# Collapse the last axis of 'sequences' to integer codes.
# NOTE(review): presumably a one-hot residue encoding where code 0 marks
# padding (it is used as the padding value below) -- confirm against the dataset.
X = np.argmax(tmp['sequences'], axis=-1)
X_p = tmp['pssms']
Y = tmp['regions']
# 1 where the residue code is non-zero, 0 elsewhere.
X_m = (X != 0).astype(np.int32)
# 1-based position index per column, reset to 0 wherever the residue code is 0.
X_i = np.repeat(np.arange(1, X.shape[1] + 1)[None, :], X.shape[0], axis=0)
X_i[X == 0] = 0
max_len = X.shape[1]

train_ids = _read_ids('Data/DM3000_id.txt')
val_ids = _read_ids('Data/DM1229_id.txt')

a = tmp['seq_ids']
# Set membership keeps the mask construction linear in the number of sequences.
_train_id_set = set(train_ids)
_val_id_set = set(val_ids)
tr_ids_m = [i in _train_id_set for i in a]
val_ids_m = [i in _val_id_set for i in a]
len(val_ids),len(train_ids)

In [None]:
# Carve the validation rows out first, then narrow the training arrays.
# The source tuple is built before any name is rebound, so both selections
# read the same, still-unfiltered arrays.
X_val, X_m_val, X_p_val, X_i_val, Y_val = (
    arr[val_ids_m] for arr in (X, X_m, X_p, X_i, Y)
)
# NOTE: this rebinds X, X_m, X_p, X_i and Y to their training subsets, so the
# cell is not safe to re-run without re-running the loading cell first.
X, X_m, X_p, X_i, Y = (
    arr[tr_ids_m] for arr in (X, X_m, X_p, X_i, Y)
)


In [None]:
# Pick a distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise whatever strategy TensorFlow currently has active.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError; treat that as "no TPU available".
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Prefer the stable TPUStrategy; the experimental alias is deprecated and
    # is only used as a fallback on TensorFlow builds that lack the stable name.
    _tpu_strategy_cls = (getattr(tf.distribute, 'TPUStrategy', None)
                         or tf.distribute.experimental.TPUStrategy)
    strategy = _tpu_strategy_cls(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def shape_list(x):
    """Return the static shape of ``x`` as a list with its first entry set to -1."""
    static_shape = K.int_shape(x)
    dims = list(static_shape)
    # Overwrite the leading dimension with the -1 placeholder.
    dims[0] = -1
    return dims

def selfAttention(V, mask):
    """Build a masked self-attention sub-graph over ``V`` out of Keras layers.

    ``V`` is indexed as a rank-3 tensor; its size along axis 2 sets the width
    of the two dense projections. ``mask`` is multiplied element-wise into the
    score tensor, so it must be broadcast-compatible with it.
    Returns the softmax-weighted combination of ``V``.

    NOTE(review): the key projection is applied to the query tensor rather
    than to ``V``, and zero scores are replaced by 1e-6 (not a large negative
    value) before the softmax, so masked positions still receive attention
    weight -- confirm that both are intended.
    """
    feature_dim = int(V.shape[2])
    query = tfk.layers.TimeDistributed(
        tfk.layers.Dense(feature_dim, activation=None, use_bias=True))(V)
    key = tfk.layers.TimeDistributed(
        tfk.layers.Dense(feature_dim, activation=None, use_bias=True))(query)

    # Pairwise scores, masked, then scaled by sqrt of V's last static dimension.
    scores = tfk.layers.Dot(axes=-1, normalize=False)([query, key])
    scores = tfk.layers.Multiply()([scores, mask])
    scores = tfk.layers.Lambda(
        lambda pair: pair[0]/K.sqrt(K.cast(shape_list(pair[1])[-1], K.floatx()))
    )([scores, V])
    scores = tf.where(scores==0, 1e-6, scores)
    weights = tfk.layers.Softmax(axis=-1)(scores)

    # Swap the last two axes of V so the final Dot contracts over positions.
    V_t = tfk.layers.Permute([2,1])(V)
    return tfk.layers.Dot(axes=-1, normalize=False)([weights, V_t])

def _getPosEncodingMat(length, dim):
    posEnc = np.array([[pos/np.power(10000, 2*(j//2)/dim) for j in range(dim)]
                        if pos!=0 else np.zeros(dim) for pos in range(length)], dtype=np.float32)
    posEnc[1:, 0::2] = np.sin(posEnc[1:, 0::2])
    posEnc[1:, 1::2] = np.cos(posEnc[1:, 1::2])
    return posEnc

In [None]:
# Build and compile the network inside the distribution strategy's scope.
# Intermediate tensors are kept in `h`; `model` is bound only once, to the
# final tfk.models.Model that later cells use.
with strategy.scope():
    input1 = tfk.layers.Input(shape=(max_len, ), name='sequence_input')
    input2 = tfk.layers.Input(shape=(max_len, 20, ), name='pssm_input')
    input3 = tfk.layers.Input(shape=(max_len, ), name='pid_input')
    input4 = tfk.layers.Input(shape=(max_len, ), name='mask_input')

    # Learned residue embedding plus a frozen sinusoidal position embedding.
    emb = tfk.layers.Embedding(input_dim=24, output_dim=100, name='embds')(input1)
    # NOTE(review): the position ids built elsewhere in this notebook run from
    # 1 to the sequence width, while this table has only max_len rows (indices
    # 0..max_len-1); a full-length sequence would index one past the end.
    # Growing the table would change the weight shape stored in
    # Weights/CGRN.h5, so it is left unchanged here -- confirm.
    pidsEmbd = tfk.layers.Embedding(input_dim=max_len, output_dim=100, trainable=False,
                                    weights=[_getPosEncodingMat(max_len, 100)], name='pids_embds')(input3)
    emb = tfk.layers.Add(name='seq_embdAdd')([emb, pidsEmbd])

    x = tfk.layers.concatenate([emb, input2], axis=-1, name='con1')

    # Four parallel convolutions with kernel widths 11, 7, 3 and 1.
    x1 = tfk.layers.Conv1D( 100, 11, strides=1, padding='same', activation='relu', name='conv1')(x)
    x2 = tfk.layers.Conv1D( 100, 7, strides=1, padding='same', activation='relu', name='conv2')(x)
    x3 = tfk.layers.Conv1D( 100, 3, strides=1, padding='same', activation='relu', name='conv3')(x)
    x4 = tfk.layers.Conv1D( 100, 1, strides=1, padding='same', activation='relu', name='conv4')(x)

    x = tfk.layers.concatenate([x1, x2, x3,x4], axis=-1, name='con2')
    # Multiply the convolution features by the mask input (reshaped to a trailing axis of 1).
    xm = tfk.layers.Reshape((-1,1))(input4)
    x = tfk.layers.Multiply()([x, xm])

    # Three stacked bidirectional GRU layers over the masked features.
    h = tfk.layers.Bidirectional( tfk.layers.GRU(units=300, return_sequences=True), name='bigru1')(x)
    h = tfk.layers.Bidirectional( tfk.layers.GRU(units=300, return_sequences=True), name='bigru2')(h)
    h = tfk.layers.Bidirectional( tfk.layers.GRU(units=300, return_sequences=True), name='bigru3')(h)

    # Skip connection: concatenate the recurrent output with the conv features.
    h = tfk.layers.concatenate([h, x], axis=-1, name='con3')

    h = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output1')(h)
    h = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output2')(h)

    # Per-position 2-way softmax.
    output_5 = tfk.layers.TimeDistributed( tfk.layers.Dense(2, activation='softmax') ,name='output_3')(h)

    model = tfk.models.Model([input1, input2, input3, input4], output_5 )
    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

    model.summary()

In [None]:
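# Training run (disabled): uncomment to retrain the model and overwrite the
# weights file that the next cell loads.
# model.fit([X,X_p,X_i,X_m],Y,verbose=1,batch_size=8,epochs=10)
# model.save_weights('Weights/CGRN.h5')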


In [None]:
# Restore the saved parameters, then score the validation split.
model.load_weights('Weights/CGRN.h5')

preds = model.predict(
    [X_val, X_p_val, X_i_val, X_m_val],
    batch_size=16,
    verbose=1,
)
# Store the labels and the predictions side by side in one compressed archive.
np.savez_compressed('cgrn-vals', val_tr=Y_val, val_pr=preds)

In [None]:
# Evaluate only positions whose label vector sums to exactly 1.
# NOTE(review): presumably this drops padded positions -- confirm.
m = np.sum(Y_val, axis=-1)
y_t = np.argmax(Y_val[m==1],axis=-1).flatten()
y_p = np.argmax(preds[m==1],axis=-1).flatten()
print(classification_report(y_t,y_p))
# labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs.
t = confusion_matrix(y_t, y_p, labels=[0, 1])
print(t,accuracy_score(y_t,y_p))
# scikit-learn layout: rows are true classes, columns are predicted classes,
# so the flattened matrix reads tn, fp, fn, tp. (The previous code read fp and
# fn from each other's cells, which turned the ratios below into precision and
# negative predictive value.)
tn, fp, fn, tp = t.ravel().astype(np.float64)
sn = tp/(tp+fn)   # sensitivity (recall of class 1)
sp = tn/(tn+fp)   # specificity (recall of class 0)
bacc = (sp+sn)/2
mcc = ((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
# Displayed as (specificity, sensitivity, balanced accuracy, MCC).
sp,sn,bacc,mcc

In [None]:
tmp = np.load('Data/Disordered_Regions_Test_Dataset.npz')
# Each ID line is parsed as '<prefix>|<id> ...': keep the first
# whitespace-delimited token after the first '|'.
test_ids = []
with open('Data/SL329_id.txt') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank / trailing lines instead of raising IndexError.
            continue
        test_ids.append(line.split('|')[1].split()[0])
# Set membership keeps the mask construction linear in the number of sequences.
_test_id_set = set(test_ids)
te_ids_m = [i in _test_id_set for i in tmp['seq_ids']]
X_te = np.argmax(tmp['sequences'],axis=-1)[te_ids_m]
X_p_te = tmp['pssms'][te_ids_m]
Y_te = tmp['regions'][te_ids_m]
# 1 where the residue code is non-zero, 0 elsewhere.
X_m_te = (X_te != 0).astype(np.int32)
# 1-based position index per column, reset to 0 wherever the residue code is 0.
# NOTE(review): the model inputs are declared with width max_len, so
# X_te.shape[1] presumably has to equal max_len -- confirm.
X_i_te = np.repeat(np.arange(1,X_te.shape[1]+1)[None,:], X_te.shape[0], axis=0)
X_i_te[X_te==0] = 0

In [None]:
# Score the held-out test split. This rebinds `preds`, replacing the
# validation predictions.
preds = model.predict(
    [X_te, X_p_te, X_i_te, X_m_te],
    batch_size=32,
    verbose=1,
)
# Store the labels and the predictions side by side in one compressed archive.
np.savez_compressed('cgrn-test', te_tr=Y_te, te_pr=preds)

In [None]:
# Evaluate only positions whose label vector sums to exactly 1.
# NOTE(review): presumably this drops padded positions -- confirm.
m = np.sum(Y_te, axis=-1)
y_t = np.argmax(Y_te[m==1],axis=-1).flatten()
y_p = np.argmax(preds[m==1],axis=-1).flatten()
print(classification_report(y_t,y_p))
# labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs.
t = confusion_matrix(y_t, y_p, labels=[0, 1])
print(t,accuracy_score(y_t,y_p))
# scikit-learn layout: rows are true classes, columns are predicted classes,
# so the flattened matrix reads tn, fp, fn, tp. (The previous code read fp and
# fn from each other's cells, which turned the ratios below into precision and
# negative predictive value.)
tn, fp, fn, tp = t.ravel().astype(np.float64)
sn = tp/(tp+fn)   # sensitivity (recall of class 1)
sp = tn/(tn+fp)   # specificity (recall of class 0)
bacc = (sp+sn)/2
mcc = ((tp*tn)-(fp*fn))/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
# Displayed as (specificity, sensitivity, balanced accuracy, MCC, weighted F1).
sp,sn,bacc,mcc,f1_score(y_t,y_p,average='weighted')