In [None]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "1"

import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.backend as K
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import *


In [None]:
# Index -> one-letter code tables. load_data() applies them to the np.argmax of
# each per-residue channel vector in the dataset.
num_aminoAcids = {0:'A', 1:'C', 2:'E', 3:'D', 4:'G', 5:'F', 6:'I', 7:'H', 8:'K', 9:'M', 10:'L',
            11:'N', 12:'Q', 13:'P', 14:'S', 15:'R', 16:'T', 17:'W', 18:'V', 19:'Y', 20:'X'}
num_ss = {0:'L',1:'B',2:'E',3:'G',4:'I',5:'H',6:'S',7:'T'}

# Symbol -> token id tables. Real symbols are shifted to start at 1 so that id 0
# can be reserved for '<pad>'; each special token is then appended with
# len(...), i.e. it receives the next unused id.
aminoAcid_I = {symbol: idx + 1 for idx, symbol in num_aminoAcids.items()}
aminoAcid_I['<pad>'] = 0
aminoAcid_I['<S>'] = len(aminoAcid_I)    # -> 22
aminoAcid_I['<EOS>'] = len(aminoAcid_I)  # -> 23
ss_I = {symbol: idx + 1 for idx, symbol in num_ss.items()}
ss_I['<pad>'] = 0
ss_I['X'] = len(ss_I)      # -> 9, used by load_data() for residues without a label
ss_I['<S>'] = len(ss_I)    # -> 10
ss_I['<EOS>'] = len(ss_I)  # -> 11

trainDataPath = 'Data/Secondary_Structure_Train_Dataset.npz'
testDataPath = 'Data/Secondary_Structure_Test_Dataset.npz'

# np.load() on an .npz returns a lazy NpzFile that keeps the archive open.
# Indexing it reads the array into memory, so the file can be closed as soon as
# both arrays have been pulled out.
with np.load('Data/Secondary_Structure_Motif_Antimotif.npz') as tmp:
    motifs = tmp['motifs']
    antiMotifs = tmp['antimotifs']
len(motifs),len(antiMotifs)

In [None]:
# Choose the distribution strategy used to build the model: a TPU strategy when
# a TPU cluster can be resolved, otherwise TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of
    # tf.distribute.TPUStrategy; fall back to the old location only on
    # TensorFlow versions that do not provide the new one yet.
    tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None)
    if tpu_strategy_cls is None:
        tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def load_data(file_path, padded_len=702):
    """Load an .npz dataset and encode it as padded, token-id model inputs.

    The archive must provide the arrays ``sequences``, ``pssms`` and
    ``secondaryStrucs``, each indexed as ``[protein, residue_slot, channel]``.
    The first residue slot whose ``sequences`` channels sum to zero marks the
    end of that protein.

    Every output row is laid out as
    ``<S>, residue_0, ..., residue_k, <EOS>, <pad>, ...``, i.e. residue ``j`` of
    the archive is stored at column ``j + 1``.

    Note: a protein that fills every residue slot never hits the end marker and
    therefore gets no ``<EOS>`` token.

    Args:
        file_path: Path of the .npz archive.
        padded_len: Width of every returned array. Defaults to 702, the input
            length the model in this notebook is built with. It must be large
            enough to hold the longest protein plus ``<S>`` and ``<EOS>``,
            otherwise indexing raises ``IndexError``.

    Returns:
        Tuple ``(in1, in2, in3, in4, out)`` where ``n`` is the number of proteins:
            in1: int32 ``(n, padded_len)`` amino-acid token ids (``aminoAcid_I``).
            in2: float32 ``(n, padded_len, 22)`` PSSM rows; zero at ``<S>``,
                ``<EOS>`` and padding columns.
            in3: int32 ``(n, padded_len)`` position ids, ``column + 1`` for every
                used column and 0 on padding.
            in4: integer mask ``(n, padded_len, 1)``, 1 where ``in1`` is non-zero.
            out: int32 ``(n, padded_len)`` secondary-structure token ids
                (``ss_I``); residues whose label channels sum to zero get
                ``ss_I['X']``.
    """
    # The NpzFile keeps the archive open; indexing reads each array into memory,
    # after which the context manager can safely close the file.
    with np.load(file_path) as data:
        sequences = data['sequences']
        pssms = data['pssms']
        secondary_structure = data['secondaryStrucs']

    n_proteins, n_slots = sequences.shape[0], sequences.shape[1]
    in1 = np.zeros((n_proteins, padded_len), dtype=np.int32)
    in2 = np.zeros((n_proteins, padded_len, 22), dtype=np.float32)
    in3 = np.zeros((n_proteins, padded_len), dtype=np.int32)
    out = np.zeros((n_proteins, padded_len), dtype=np.int32)

    for i in tqdm(range(n_proteins)):
        # Column 0 always holds the start token, with position id 1.
        in1[i, 0] = aminoAcid_I['<S>']
        in3[i, 0] = 1
        out[i, 0] = ss_I['<S>']
        for j in range(n_slots):
            col = j + 1  # shifted right by one to make room for <S>
            if np.sum(sequences[i, j, :]) == 0:
                # First empty slot: close the row with <EOS> and stop.
                in1[i, col] = aminoAcid_I['<EOS>']
                in3[i, col] = col + 1
                out[i, col] = ss_I['<EOS>']
                break
            in3[i, col] = col + 1
            in1[i, col] = aminoAcid_I[num_aminoAcids[np.argmax(sequences[i, j, :])]]
            if np.sum(secondary_structure[i, j, :]) == 0:
                # Residue present but no structure label recorded.
                out[i, col] = ss_I['X']
            else:
                out[i, col] = ss_I[num_ss[np.argmax(secondary_structure[i, j, :])]]
            in2[i, col] = pssms[i, j]

    in4 = np.where(in1 != 0, 1, 0)[:, :, None]
    return in1, in2, in3, in4, out


In [None]:
# The first N_TRAIN proteins are used for training; everything after them is
# held out for validation.
N_TRAIN = 12000
# Label ids 0..8 are '<pad>' plus the 8 structure classes. Ids above that
# ('X', '<S>', '<EOS>') fall outside the one-hot depth and become all-zero rows;
# slicing off channel 0 afterwards zeroes out '<pad>' as well, so only real
# labelled residues keep a 1 somewhere in their row.
SS_ONEHOT_DEPTH = len(num_ss) + 1

X1, X2, X3, X4, Y = load_data(trainDataPath)
X1_val = X1[N_TRAIN:]
X2_val = X2[N_TRAIN:]
X3_val = X3[N_TRAIN:]
X4_val = X4[N_TRAIN:]
Y_val = tf.one_hot(Y[N_TRAIN:], SS_ONEHOT_DEPTH)[:,:,1:]

X1 = X1[:N_TRAIN]
X2 = X2[:N_TRAIN]
X3 = X3[:N_TRAIN]
X4 = X4[:N_TRAIN]
Y = tf.one_hot(Y[:N_TRAIN], SS_ONEHOT_DEPTH)[:,:,1:]


In [None]:
def shape_list(x):
    """Return the Keras static shape of `x` as a list whose first entry is -1.

    The remaining entries are whatever `K.int_shape` reports for `x`.
    """
    dims = [dim for dim in K.int_shape(x)]
    dims[0] = -1
    return dims

def _getPosEncodingMat(length, dim):
    posEnc = np.array([[pos/np.power(10000, 2*(j//2)/dim) for j in range(dim)]
                        if pos!=0 else np.zeros(dim) for pos in range(length)], dtype=np.float32)
    posEnc[1:, 0::2] = np.sin(posEnc[1:, 0::2])
    posEnc[1:, 1::2] = np.cos(posEnc[1:, 1::2])
    return posEnc

In [None]:
# Number of time steps of every model input. It has to equal the width of the
# arrays produced by load_data() (702 by default).
SEQ_LEN = 702

with strategy.scope():
    input1_ = tfk.layers.Input(shape=(SEQ_LEN, ), name='sequence_input')
    input2_ = tfk.layers.Input(shape=(SEQ_LEN, 22, ), name='pssm_input')
    input3_ = tfk.layers.Input(shape=(SEQ_LEN, ), name='pid_input')
    input4_ = tfk.layers.Input(shape=(SEQ_LEN, 1, ), name='mask_input')

    # Learned token embedding plus a frozen sinusoidal position embedding.
    # input_length is set to the real input width; it used to say 700, which
    # contradicted the 702-step 'sequence_input' feeding this layer.
    emb = tfk.layers.Embedding(input_dim=24, output_dim=100, input_length=SEQ_LEN, name='embds')(input1_)
    pidsEmbd = tfk.layers.Embedding(input_dim=SEQ_LEN, output_dim=100, trainable=False,
                                    weights=[_getPosEncodingMat(SEQ_LEN, 100)], name='pids_embds')(input3_)
    emb = tfk.layers.Add(name='seq_embdAdd')([emb, pidsEmbd])
    x = tfk.layers.concatenate([emb, input2_], axis=-1, name='con1')

    # Three parallel convolutions with different kernel sizes, concatenated and
    # then multiplied by the mask so padded steps are zeroed.
    x1 = tfk.layers.Conv1D( 100, 11, strides=1, padding='same', activation='relu', name='conv1')(x)
    x2 = tfk.layers.Conv1D( 100, 7, strides=1, padding='same', activation='relu', name='conv2')(x)
    x3 = tfk.layers.Conv1D( 100, 3, strides=1, padding='same', activation='relu', name='conv3')(x)
    x = tfk.layers.concatenate([x1, x2, x3], axis=-1, name='con2')
    x = tfk.layers.Multiply()([x, input4_])

    # Stack of three bidirectional GRUs; the mask is re-applied after the first two.
    # NOTE: layer creation order and names are kept exactly as before so that
    # previously saved weights still line up with this graph.
    h = tfk.layers.Bidirectional( tfk.layers.GRU(units=300, return_sequences=True), name='bigru1')(x)
    h = tfk.layers.Multiply()([h, input4_])
    h = tfk.layers.Bidirectional( tfk.layers.GRU(units=300, return_sequences=True), name='bigru2')(h)
    h = tfk.layers.Multiply()([h, input4_])
    h = tfk.layers.Bidirectional( tfk.layers.GRU(units=300, return_sequences=True), name='bigru3')(h)

    # Skip connection: the masked convolution features are concatenated onto the
    # recurrent output before the per-step dense classifier.
    h = tfk.layers.concatenate([h, x], axis=-1, name='con3')
    h = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output1')(h)
    h = tfk.layers.Multiply()([h, input4_])
    h = tfk.layers.TimeDistributed( tfk.layers.Dense(200, activation='relu'), name='output2')(h)
    output_ = tfk.layers.TimeDistributed( tfk.layers.Dense(8, activation='softmax') ,name='output')(h)

    model = tfk.models.Model([input1_, input2_, input3_, input4_], output_)
    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

    model.summary()

In [None]:
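# Training is commented out; the next cell loads previously saved weights instead.
# Uncomment both lines to retrain and overwrite 'Weights/CGRN.h5'.
# model.fit([X1, X2, X3, X4], Y, verbose=1, batch_size=8, epochs=50)
# model.save_weights('Weights/CGRN.h5')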


In [None]:
# Restore trained parameters from the same file the commented-out
# save_weights() call above writes to; raises if the file is missing.
model.load_weights('Weights/CGRN.h5')

In [None]:
# Predict on the validation split, then store the one-hot labels and the
# predicted class probabilities together in one compressed archive.
preds = model.predict(
    x=[X1_val, X2_val, X3_val, X4_val],
    batch_size=8,
    verbose=1,
)
np.savez_compressed('cgrn-vals', val_tr=Y_val, val_pr=preds)

In [None]:
def to_q3(x):
    """Collapse 8-class label indices into 3 classes.

    Indices 0, 6 and 7 map to 1, indices 1 and 2 map to 2, and every other
    value maps to 3. With labels ordered as in `num_ss` that groups
    L/S/T, B/E and G/I/H respectively.

    Args:
        x: Iterable of class indices.

    Returns:
        A list with one entry (1, 2 or 3) per element of `x`.
    """
    return [
        1 if label in (0, 6, 7) else 2 if label in (1, 2) else 3
        for label in x
    ]

# A one-hot row sums to 1 only for residues that carry a real label, so `m`
# filters out padding, start/end tokens and unlabeled residues.
m = np.sum(Y_val, axis=-1)
y_t = np.argmax(Y_val[m == 1], axis=-1)
y_p = np.argmax(preds[m == 1], axis=-1)
print(classification_report(y_t, y_p))
# Printed in order: 3-class accuracy, 8-class accuracy, weighted precision,
# weighted recall, weighted F1.
print(
    accuracy_score(to_q3(y_t), to_q3(y_p)),
    accuracy_score(y_t, y_p),
    *(score(y_t, y_p, average='weighted')
      for score in (precision_score, recall_score, f1_score)),
)

In [None]:
# Encode the test set the same way as the training data, then one-hot the
# labels with depth 9 and drop channel 0 so 8 class channels remain.
X1_te, X2_te, X3_te, X4_te, Y_te = load_data(testDataPath)
Y_te = tf.one_hot(Y_te, depth=9)[..., 1:]


In [None]:
# Predict on the test set, then store the one-hot labels and the predicted
# class probabilities together in one compressed archive.
preds = model.predict(
    x=[X1_te, X2_te, X3_te, X4_te],
    batch_size=8,
    verbose=1,
)
np.savez_compressed('cgrn-tests', te_tr=Y_te, te_pr=preds)

In [None]:
# A one-hot row sums to 1 only for residues that carry a real label, so `m`
# filters out padding, start/end tokens and unlabeled residues.
m = np.sum(Y_te, axis=-1)
y_t = np.argmax(Y_te[m == 1], axis=-1)
y_p = np.argmax(preds[m == 1], axis=-1)
print(classification_report(y_t, y_p))
# Printed in order: 3-class accuracy, 8-class accuracy, weighted precision,
# weighted recall, weighted F1.
print(
    accuracy_score(to_q3(y_t), to_q3(y_p)),
    accuracy_score(y_t, y_p),
    *(score(y_t, y_p, average='weighted')
      for score in (precision_score, recall_score, f1_score)),
)