In [1]:
import h5py
import sys
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import tensorflow as tf
from tensorflow import keras
import sys
sys.path.append('/home/ztang/multitask_RNA/evaluation/')
from sklearn import model_selection
import scipy.stats as stats
import pandas as pd

# Location of the HDF5 dataset; the loading cell opens it read-only and reads the 'x' and 'y' entries.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
data_file = '/home/ztang/multitask_RNA/data/RNA_loc/RNAloc.h5'

2023-06-01 13:00:15.091809: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the full 'x' and 'y' datasets into memory. `[()]` materialises each
# dataset as an in-memory array, so the arrays remain valid after the file is
# closed; the `with` block guarantees the HDF5 handle is released even if a
# read fails.
with h5py.File(data_file, 'r') as f:
    x = f['x'][()]
    y = f['y'][()]

# Hold out 10% of the samples as the test set; the fixed random_state makes
# the split reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.1, random_state=42
)

In [3]:
def rna_loc_cnn(input_shape):
    """Assemble an uncompiled Keras CNN that ends in a single sigmoid unit.

    Two Conv1D blocks (196 filters of width 11 with an exponential
    activation, then 256 filters of width 7 with ReLU) feed two dense blocks
    (512 and 128 units) and a one-unit sigmoid output layer.

    Args:
        input_shape: Shape handed to ``keras.Input`` (batch axis excluded).

    Returns:
        A ``keras.Model`` mapping the 'sequence' input to the sigmoid output;
        compiling is left to the caller.
    """
    layers = keras.layers
    # A single initializer object is reused by every Conv1D/Dense layer.
    small_normal = keras.initializers.RandomNormal(mean=0.0, stddev=0.005)

    def dense_block(tensor, units):
        """Dense -> BatchNormalization -> ReLU -> Dropout(0.5)."""
        tensor = layers.Dense(units, kernel_initializer=small_normal)(tensor)
        tensor = layers.BatchNormalization()(tensor)
        tensor = layers.Activation('relu')(tensor)
        return layers.Dropout(0.5)(tensor)

    sequence = keras.Input(shape=input_shape, name='sequence')

    # Convolutional block 1: dropout comes before pooling in this block.
    hidden = layers.Conv1D(
        filters=196,
        kernel_size=11,
        padding='same',
        kernel_initializer=small_normal,
    )(sequence)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Activation('exponential', name='filter_activation')(hidden)
    hidden = layers.Dropout(0.2)(hidden)
    hidden = layers.MaxPooling1D(pool_size=5)(hidden)

    # Convolutional block 2: pooling comes before dropout in this block.
    hidden = layers.Conv1D(
        filters=256,
        kernel_size=7,
        padding='same',
        kernel_initializer=small_normal,
    )(hidden)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Activation('relu')(hidden)
    hidden = layers.MaxPool1D(pool_size=4)(hidden)
    hidden = layers.Dropout(0.2)(hidden)

    # Fully connected head.
    hidden = layers.Flatten()(hidden)
    hidden = dense_block(hidden, 512)
    hidden = dense_block(hidden, 128)

    probability = layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=small_normal,
    )(hidden)
    return keras.Model(inputs=sequence, outputs=probability)

# Callbacks, metrics and loss that both model-fitting cells pass to compile()/fit().

# Stop once the monitored quantity (Keras default) has not improved for
# 10 epochs, then roll the model back to its best weights.
earlyStopping_callback = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
)

# Scale the learning rate by 0.2 after 5 epochs without val_loss improvement,
# never dropping below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    patience=5,
    factor=0.2,
    min_lr=1e-6,
)

# Area under the ROC and precision-recall curves, reported as 'auroc' / 'aupr'.
auroc = tf.keras.metrics.AUC(name='auroc', curve='ROC')
aupr = tf.keras.metrics.AUC(name='aupr', curve='PR')

# Binary cross-entropy on probabilities (not logits), with no label smoothing.
loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0, from_logits=False)


2023-06-01 13:00:17.736685: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78865 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:07:00.0, compute capability: 8.0


In [4]:
# Neurite model: same architecture as the soma model, fitted to label column 0.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_neurite = rna_loc_cnn((150, 4))
model_neurite.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy', auroc, aupr],
)

# Silent training run (verbose=0); validation_split reserves 10% of the
# training arrays for validation, which the callbacks rely on.
result = model_neurite.fit(
    x=x_train,
    y=y_train[:, 0],
    epochs=100,
    batch_size=128,
    shuffle=True,
    validation_split=0.1,
    callbacks=[earlyStopping_callback, reduce_lr],
    verbose=0,
)

# Optional training-curve plot (left disabled):
# import matplotlib.pyplot as plt
# plt.plot(result.history['accuracy'])
# plt.plot(result.history['val_accuracy'])

2023-06-01 13:00:20.461902: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-06-01 13:00:20.932033: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-06-01 13:00:20.935258: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fe6080182b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-06-01 13:00:20.935274: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2023-06-01 13:00:20.938831: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-06-01 13:00:21.047603: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the pro

In [5]:
# Soma model: same architecture as the neurite model, fitted to label column 1.
# A fresh Adam optimizer is created so no optimizer state carries over.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_soma = rna_loc_cnn((150, 4))
model_soma.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy', auroc, aupr],
)

# Silent training run (verbose=0); validation_split reserves 10% of the
# training arrays for validation, which the callbacks rely on.
# Note: `result` is rebound here, replacing any earlier value of that name.
result = model_soma.fit(
    x=x_train,
    y=y_train[:, 1],
    epochs=100,
    batch_size=128,
    shuffle=True,
    validation_split=0.1,
    callbacks=[earlyStopping_callback, reduce_lr],
    verbose=0,
)

# Optional training-curve plot (left disabled):
# import matplotlib.pyplot as plt
# plt.plot(result.history['accuracy'])
# plt.plot(result.history['val_accuracy'])

In [6]:
# Score the held-out sequences with each single-task model.
nurite_pred = model_neurite.predict(x_test)
soma_pred = model_soma.predict(x_test)

# Combined score: absolute gap between the two models' sigmoid outputs.
local_p = np.absolute(nurite_pred - soma_pred)

# Target: row-wise sum of y_test (one value per test sample).
target_p = np.sum(y_test, axis=1)



In [7]:
from sklearn import metrics

# ROC curve of the combined score against the summed labels, with label
# value 1 treated as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(target_p, local_p, pos_label=1)

# Each metric is printed as a header line followed by its value.
print('AUROC', metrics.auc(fpr, tpr), sep='\n')
print('AUPR', metrics.average_precision_score(target_p, local_p), sep='\n')

AUROC
0.7682299701738147
AUPR
0.1837449304559851


In [8]:
# Despite its name, `y_pred` receives what Model.evaluate() returns (the test
# loss and the compiled metric values), not predictions.
y_pred = model_neurite.evaluate(x=x_test, y=y_test[:, 0])



In [9]:
# Despite its name, `y_pred` receives what Model.evaluate() returns (the test
# loss and the compiled metric values), not predictions. It rebinds `y_pred`,
# replacing any earlier value of that name.
y_pred = model_soma.evaluate(x=x_test, y=y_test[:, 1])

