In [1]:
import h5py 
import numpy as np
import pandas as pd
import scipy
import glob
import tensorflow as tf
from tensorflow import keras
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import itertools
import os
os.environ['CUDA_VISIBLE_DEVICES'] ='1'
file_list = glob.glob('../data/eclip/*_200.h5')

2024-02-13 18:08:36.191373: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def di_nuc_frequency(seq_list):
    key_list = list(itertools.product([0,1,2,3], [0,1,2,3]))
    freq_array = []
    pos_seq = np.argmax(seq_list,axis=1)
    for seq in pos_seq:
        count_dict =  dict(zip(key_list, [0]*len(key_list)))
        for i in range(len(seq)-1):
            entry = (seq[i],seq[i+1])
            count_dict[entry] += 1
        freq_array.append(list(count_dict.values()))
    return np.array(freq_array)

def rep_mlp(input_shape,output_shape = 1):
     #initializer
    initializer = keras.initializers.RandomNormal(mean=0.0, stddev=0.005)
    #input layer
    inputs = keras.Input(shape=input_shape, name='sequence')
    nn = keras.layers.Dense(512,kernel_initializer=initializer)(inputs)
    nn = keras.layers.BatchNormalization()(nn)
    nn = keras.layers.Activation('relu')(nn)
    nn = keras.layers.Dropout(0.5)(nn)

    nn = keras.layers.Dense(256,kernel_initializer=initializer)(nn)
    nn = keras.layers.BatchNormalization()(nn)
    nn = keras.layers.Activation('relu')(nn)
    nn = keras.layers.Dropout(0.5)(nn)

    outputs = keras.layers.Dense(output_shape,activation = 'linear',kernel_initializer=initializer)(nn)

    model =  keras.Model(inputs=inputs, outputs=outputs)
    return model

earlyStopping_callback = tf.keras.callbacks.EarlyStopping(
            patience=10, restore_best_weights=True
        )
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.2,
            patience=5, min_lr=1e-6)
auroc = tf.keras.metrics.AUC(curve='ROC', name='auroc')
aupr = tf.keras.metrics.AUC(curve='PR', name='aupr')
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False, label_smoothing=0)

2024-02-13 18:08:38.470782: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78973 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:47:00.0, compute capability: 8.0


In [3]:
## Nucleotide Logistic Regression
exp = []
test_accuracy = []
test_auroc = []
test_aupr = []
model_list = []


for file in file_list:
    tf_name = file.split('/')[-1][:-7]
    data = h5py.File(file,'r')
    x_train = data['x_train'][:,:4,:]
    y_train = data['y_train'][:]
    x_valid = data['x_valid'][:,:4,:]
    y_valid = data['y_valid'][:]
    x_test = data['x_test'][:,:4,:]
    y_test = data['y_test'][:]
    #Train Regression Model
    mean_train = np.mean(np.concatenate((x_train,x_valid)),axis=-1)
    target_train = np.concatenate((y_train,y_valid))
    mean_model = LogisticRegression(random_state=0).fit(mean_train,np.squeeze(target_train))
    #Predict + Eval
    mean_predict = mean_model.predict(np.mean(x_test,axis=-1))
    test_accuracy.append(metrics.accuracy_score(y_test,mean_predict))
    test_auroc.append(metrics.roc_auc_score(y_test,mean_predict))
    test_aupr.append(metrics.average_precision_score(y_test,mean_predict))
    model_list.append('Mean One-hot logistic regression')
    exp.append(tf_name)

perf = pd.DataFrame({'TF':exp,'Accuracy':test_accuracy,'AUROC':test_auroc,'AUPR':test_aupr,'Model':model_list})
perf.to_csv('./result/chip_result/seq_perf_logistic.csv')

In [4]:
## Nucleotide MLP
exp = []
test_accuracy = []
test_auroc = []
test_aupr = []
model_list = []
for file in file_list:
    tf_name = file.split('/')[-1][:-7]
    data = h5py.File(file,'r')
    x_train = data['x_train'][:,:4,:]
    y_train = data['y_train'][:]
    x_valid = data['x_valid'][:,:4,:]
    y_valid = data['y_valid'][:]
    x_test = data['x_test'][:,:4,:]
    y_test = data['y_test'][:]

    mean_train = np.mean(x_train,axis=-1)
    mean_valid = np.mean(x_valid,axis=-1)
    mean_test = np.mean(x_test,axis=-1)

    #Train MLP Model
    for i in range(5):
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        model = rep_mlp((4),1)
        model.compile(loss = loss,
                    metrics=['accuracy',auroc,aupr],
                    optimizer=optimizer)
        
        result = model.fit(mean_train,y_train,
            batch_size=256,
            validation_data=(mean_valid,y_valid),
            epochs=100,
            verbose=0,
            callbacks=[earlyStopping_callback,reduce_lr]
        )
        _, acc, roc, pr = model.evaluate(mean_test,y_test)
        exp.append(tf_name)
        model_list.append('One-hot MLP')
        test_accuracy.append(acc)
        test_auroc.append(roc)
        test_aupr.append(pr)

perf = pd.DataFrame({'TF':exp,'Accuracy':test_accuracy,'AUROC':test_auroc,'AUPR':test_aupr,'Model':model_list})
perf.to_csv('./result/chip_result/seq_perf_MLP.csv')

2024-02-13 17:08:18.948059: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-02-13 17:08:18.951258: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x556cb2737480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-13 17:08:18.951278: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-02-13 17:08:18.954606: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-13 17:08:19.054147: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8800
2024-02-13 17:08:19.150135: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the pro



In [3]:
## Nucleotide Logistic Regression
exp = []
test_accuracy = []
test_auroc = []
test_aupr = []
model_list = []

for file in file_list:
    tf_name = file.split('/')[-1][:-12]
    data = h5py.File(file,'r')
    x_train = data['X_train'][:,:4,:]
    y_train = data['Y_train'][:]
    x_valid = data['X_valid'][:,:4,:]
    y_valid = data['Y_valid'][:]
    x_test = data['X_test'][:,:4,:]
    y_test = data['Y_test'][:]
    #Train Regression Model
    x_train = np.concatenate((x_train,x_valid))
    y_train = np.concatenate((y_train,y_valid))
    x_freq = di_nuc_frequency(x_train)
    dinuc_model = LogisticRegression(random_state=0).fit(x_freq,np.squeeze(y_train))
    #Predict + Eval
    mean_predict = dinuc_model.predict(di_nuc_frequency(x_test))
    test_accuracy.append(metrics.accuracy_score(y_test,mean_predict))
    test_auroc.append(metrics.roc_auc_score(y_test,mean_predict))
    test_aupr.append(metrics.average_precision_score(y_test,mean_predict))
    model_list.append('Dinucleotide logistic regression')
    exp.append(tf_name)

perf = pd.DataFrame({'TF':exp,'Accuracy':test_accuracy,'AUROC':test_auroc,'AUPR':test_aupr,'Model':model_list})
perf.to_csv('./result/eclip_result/dinuc_perf_logistic.csv')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
## Nucleotide MLP
exp = []
test_accuracy = []
test_auroc = []
test_aupr = []
model_list = []

for file in file_list:
    tf_name = file.split('/')[-1][:-7]
    data = h5py.File(file,'r')
    x_train = data['X_train'][:,:4,:]
    y_train = data['Y_train'][:]
    x_valid = data['X_valid'][:,:4,:]
    y_valid = data['Y_valid'][:]
    x_test = data['X_test'][:,:4,:]
    y_test = data['Y_test'][:]
    #Train Regression Model
    x_train = di_nuc_frequency(x_train)
    x_valid = di_nuc_frequency(x_valid)
    x_test = di_nuc_frequency(x_test)
    #Train MLP Model
    for i in range(5):
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        model = rep_mlp((16),1)
        model.compile(loss = loss,
                    metrics=['accuracy',auroc,aupr],
                    optimizer=optimizer)
        
        result = model.fit(x_train,y_train,
            batch_size=256,
            validation_data=(x_valid,y_valid),
            epochs=100,
            verbose=0,
            callbacks=[earlyStopping_callback,reduce_lr]
        )
        _, acc, roc, pr = model.evaluate(x_test,y_test)
        exp.append(tf_name)
        model_list.append('Dinucletodie MLP')
        test_accuracy.append(acc)
        test_auroc.append(roc)
        test_aupr.append(pr)

perf = pd.DataFrame({'TF':exp,'Accuracy':test_accuracy,'AUROC':test_auroc,'AUPR':test_aupr,'Model':model_list})
perf.to_csv('./result/eclip_result/dinuc_perf_MLP.csv')

2024-02-13 18:10:31.734455: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-02-13 18:10:31.737773: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f374806e5f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-13 18:10:31.737793: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-02-13 18:10:31.741319: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-13 18:10:31.840507: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8800
2024-02-13 18:10:31.937264: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the pro

