In [17]:
import importlib
import tensorflow as tf
import pandas as pd
import mpra_model
import h5py
importlib.reload(mpra_model)
import numpy as np
from tqdm import tqdm
import sklearn
from sklearn import model_selection
import os
# Cell line this run targets; it is spliced into the input .h5 path and the
# checkpoint path further down.
cell_type = 'K562'
# Restrict CUDA to device 0 for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [18]:
# Hyperparameters passed as the second argument to mpra_model.rep_cnn; 'l_rate'
# is also read directly when the Adam optimizer is created.
# NOTE(review): 'input_shape' is (41, 2560) but the model below is built with
# (230, 512) -- this entry looks stale for the current embeddings; confirm.
cnn_config = dict(
    input_shape=(41, 2560),
    activation='exponential',
    reduce_dim=128,
    # first convolution
    conv1_filter=196,
    conv1_kernel=7,
    dropout1=0.2,
    # residual section
    res_filter=5,
    res_layers=3,
    res_pool=5,
    res_dropout=0.2,
    # second convolution
    conv2_filter=256,
    conv2_kernel=7,
    pool2_size=4,
    dropout2=0.2,
    # dense head
    dense=512,
    dense2=256,
    # optimizer learning rate
    l_rate=0.0001,
)

In [3]:
# Reset Keras' global state before (re)building datasets and models.
tf.keras.backend.clear_session()

# NOTE(review): this directory is HepG2 data while `cell_type` is 'K562', and
# trainset/validset/testset are reassigned by the .h5-based cell further down --
# confirm whether this cell is still needed.
data_dir = '/home/ztang/multitask_RNA/data/lenti_MPRA_embed/HepG2_seq_2B5_1000G/'

# load_stats was called once per split with the same argument; call it once and
# share the result across the three splits.
dataset_stats = mpra_model.load_stats(data_dir)

# One dataset per split, all built with identical batching options.
trainset, validset, testset = (
    mpra_model.make_dataset(data_dir, split, dataset_stats,
                            batch_size=128, seqs=False)
    for split in ('train', 'valid', 'test')
)

2023-05-16 13:02:20.344832: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78865 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:07:00.0, compute capability: 8.0


In [19]:
# Read the 'seq' and 'mean' datasets fully into memory ([()] materialises each
# one as an in-memory array). The context manager closes the HDF5 handle as soon
# as both reads finish; the original left the file open for the kernel's lifetime.
with h5py.File('/home/ztang/multitask_RNA/data/lenti_MPRA_embed/gpn_'+cell_type+'.h5','r') as file:
    seq = file['seq'][()]
    target = file['mean'][()]

# Hold out 10% as the test set, then 10% of the remainder as the validation set.
# Fixed random_state keeps the splits reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    seq, target, random_state=42, test_size=0.1)
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x_train, y_train, random_state=42, test_size=0.1)

# Build the input pipelines on the CPU device.
with tf.device("CPU"):
    # Only the training stream is shuffled (buffer of 4 batches).
    trainset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).shuffle(256*4).batch(256)
    # Evaluation order is irrelevant to the metrics, so validation/test are
    # batched without a shuffle buffer.
    validset = tf.data.Dataset.from_tensor_slices((x_valid,y_valid)).batch(256)
    testset = tf.data.Dataset.from_tensor_slices((x_test,y_test)).batch(256)

2023-05-18 13:49:04.084304: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 86325145600 exceeds 10% of free system memory.


In [20]:
# Build the CNN. The input shape is taken from the loaded training array (all
# axes after the sample axis) instead of a hard-coded (230, 512), so the model
# always matches the embeddings that were actually read.
model = mpra_model.rep_cnn(x_train.shape[1:], cnn_config)
# Alternative constructions kept for reference:
#model = mpra_model.rep_cnn(cnn_config['input_shape'],cnn_config)
#model = mpra_model.rep_onehot(onehot_config['input_shape'],onehot_config)
#model = mpra_model.ResNet((230,512),1)

In [21]:
# Mean-squared-error regression trained with Adam at the configured learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=cnn_config['l_rate'])
loss = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['mse'])

# Stop after 10 epochs without improvement and roll back to the best weights.
earlyStopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=10, restore_best_weights=True
)
# Multiply the learning rate by 0.2 after 5 stagnant epochs, down to 1e-8.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2,
    patience=5, min_lr=1e-8)
# Persist the best model (lowest val_loss) seen so far, checked every epoch.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    '/home/ztang/multitask_RNA/model/lenti_MPRA_embed/'+cell_type+'/gpn.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    save_freq='epoch',
)

# `trainset` is a tf.data.Dataset that is already shuffled and batched, so the
# former `batch_size=512` / `shuffle=True` arguments were dropped: they do not
# apply to dataset inputs and misstated the real batch size (256).
# The History object is kept so the loss curves can be inspected afterwards.
history = model.fit(
    trainset,
    epochs=100,
    validation_data=validset,
    callbacks=[
        earlyStopping_callback,
        reduce_lr,
        checkpoint,
        #,TuneReportCallback({"loss": "loss","val_loss":'val_loss'})
    ],
)

2023-05-18 13:50:34.222827: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [183265,230,512]
	 [[{{node Placeholder/_0}}]]
2023-05-18 13:50:34.223130: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype double and shape [183265]
	 [[{{node Placeholder/_1}}]]


Epoch 1/100

2023-05-18 13:51:04.380090: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [20363,230,512]
	 [[{{node Placeholder/_0}}]]
2023-05-18 13:51:04.380329: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype double and shape [20363]
	 [[{{node Placeholder/_1}}]]


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.callbacks.History at 0x7f92680942e0>

In [14]:
#model.evaluate(testset)

In [8]:
# Predict on the held-out array in a single call.
# NOTE(review): `pred_y` is reassigned by the batched loop over `testset` in a
# later cell, so this result does not feed the correlation computed below --
# confirm whether this cell is still needed.
pred_y = model.predict(x_test)



In [22]:
# Collect predictions together with the targets of the same batch, so the two
# stay aligned regardless of the order in which `testset` yields samples.
pred_batches = []
target_batches = []
for x, y in testset:
    # predict_on_batch runs one forward pass on an already-formed batch;
    # calling model.predict per batch inside a loop adds per-call overhead.
    pred_batches.append(model.predict_on_batch(x))
    target_batches.append(y.numpy())

# Single arrays instead of Python lists of per-sample tensors.
# NOTE: this rebinds `y_test` (previously the array from train_test_split).
pred_y = np.concatenate(pred_batches)
y_test = np.concatenate(target_batches)

2023-05-18 13:56:20.876397: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [22626,230,512]
	 [[{{node Placeholder/_0}}]]
2023-05-18 13:56:20.876665: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype double and shape [22626]
	 [[{{node Placeholder/_1}}]]




In [23]:
import scipy.stats

# Pearson correlation between test-set predictions and targets; both are
# squeezed first so singleton axes are dropped before the comparison.
predicted = np.squeeze(pred_y)
observed = np.squeeze(y_test)
scipy.stats.pearsonr(predicted, observed)

PearsonRResult(statistic=0.7128779753912131, pvalue=0.0)

## CAGI test?