In [4]:
import importlib
import os

# Restrict the process to GPU 3. This is done before TensorFlow (and any module
# that imports it) is loaded so the restriction is guaranteed to be in effect
# before any CUDA initialisation can happen.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import tensorflow as tf
import pandas as pd
import mpra_model
import h5py
# Re-import the local model module so on-disk edits are picked up without
# restarting the kernel.
importlib.reload(mpra_model)
import numpy as np
from tqdm import tqdm
import sklearn
from sklearn import model_selection
import scipy.stats

In [5]:
# Locations used by the K562 training run.
# data_file  -> HDF5 file opened for reading in the training cell.
# model_file -> path handed to the ModelCheckpoint callback.
data_file = "../data/lenti_MPRA_embed/sei_K562.h5"
model_file = "../model/lenti_MPRA/lenti_MPRA_embed/K562/sei.h5"

In [10]:
# Hyper-parameters passed to mpra_model.rep_cnn; 'l_rate' is also used as the
# Adam learning rate. The meaning of the remaining keys is defined by
# mpra_model (not visible here) -- the grouping below is only by key name.
cnn_config = {
    "activation": "exponential",
    "reduce_dim": 128,
    # first convolution
    "conv1_filter": 196,
    "conv1_kernel": 7,
    "dropout1": 0.2,
    # residual section
    "res_filter": 5,
    "res_layers": 3,
    "res_pool": 5,
    "res_dropout": 0.2,
    # second convolution
    "conv2_filter": 256,
    "conv2_kernel": 7,
    "pool2_size": 4,
    "dropout2": 0.2,
    # dense head
    "dense": 512,
    "dense2": 256,
    # optimiser
    "l_rate": 0.0001,
}

# Read the inputs ('seq') and regression targets ('mean') fully into memory.
# `[()]` materialises each dataset as an in-memory array, so the HDF5 handle
# can be released as soon as both reads are done.
with h5py.File(data_file, 'r') as file:
    seq = file['seq'][()]
    target = file['mean'][()]

# Hold out 10% of the samples as a test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    seq, target, random_state=42, test_size=0.1
)

# Build the network for a single-sample input shape and compile it for
# mean-squared-error regression with Adam.
model = mpra_model.rep_cnn(seq[0].shape, cnn_config)
optimizer = tf.keras.optimizers.Adam(learning_rate=cnn_config["l_rate"])
loss = tf.keras.losses.MeanSquaredError()
model.compile(loss=loss, optimizer=optimizer, metrics=["mse"])

# Stop after 10 epochs without improvement and roll back to the best weights.
earlyStopping_callback = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
)

# Multiply the learning rate by 0.2 after 5 stagnant epochs, never below 1e-8.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=5,
    min_lr=1e-8,
)

# At the end of each epoch, write the model to `model_file` if val_loss improved.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_file,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_freq="epoch",
)
# Train silently for at most 100 epochs; 10% of the training data is held out
# by Keras for validation, which is what the callbacks monitor.
fit_callbacks = [earlyStopping_callback, reduce_lr, checkpoint]
model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=100,
    validation_split=0.1,
    shuffle=True,
    verbose=0,
    callbacks=fit_callbacks,
)

# Score the held-out test split: Pearson correlation of predictions vs. targets.
y_pred = model.predict(x_test)
test_pearson = scipy.stats.pearsonr(np.squeeze(y_pred), np.squeeze(y_test))
print(test_pearson)

PearsonRResult(statistic=0.7715274284093354, pvalue=0.0)


## Testing

In [4]:
# Evaluation inputs for the HepG2 run: the HDF5 data file and a previously
# saved model that is loaded from disk instead of being retrained here.
data_file = "../data/lenti_MPRA_embed/sei_HepG2.h5"
model = tf.keras.models.load_model(
    "../model/lenti_MPRA/lenti_MPRA_embed/HepG2/sei.h5"
)

In [5]:
# Read inputs ('seq') and targets ('mean') fully into memory; `[()]`
# materialises each dataset, so the HDF5 handle can be closed right after.
with h5py.File(data_file, 'r') as file:
    seq = file['seq'][()]
    target = file['mean'][()]

# 90/10 train/test split, then a further 90/10 train/validation split.
# Both use a fixed seed so the partitions are repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    seq, target, random_state=42, test_size=0.1
)
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x_train, y_train, random_state=42, test_size=0.1
)

batch_size = 256
shuffle_buffer = batch_size * 4

# Build the input pipelines on the CPU device.
with tf.device("CPU"):
    # Only the training stream is shuffled; validation and test batches keep a
    # fixed order so evaluation is deterministic from run to run.
    trainset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(shuffle_buffer)
        .batch(batch_size)
    )
    validset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)).batch(batch_size)
    testset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)

In [6]:
# Run the model over the test dataset batch by batch. Predictions and labels
# are collected from the same iteration so they stay paired regardless of the
# order in which `testset` yields its batches.
batch_preds = []
batch_targets = []
for x, y in testset:
    # predict_on_batch avoids the per-call setup cost of model.predict(),
    # which is intended for whole datasets rather than use inside a loop.
    batch_preds.append(model.predict_on_batch(x))
    batch_targets.append(y.numpy())

# One array per side instead of Python lists of per-sample elements.
# `y_test` is rebound to the labels in iteration order, matching `pred_y`.
pred_y = np.concatenate(batch_preds, axis=0)
y_test = np.concatenate(batch_targets, axis=0)

print(scipy.stats.pearsonr(np.squeeze(pred_y), np.squeeze(y_test)))

2024-02-05 17:41:43.377782: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [13988,960,16]
	 [[{{node Placeholder/_0}}]]
2024-02-05 17:41:43.378090: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype double and shape [13988]
	 [[{{node Placeholder/_1}}]]


PearsonRResult(statistic=0.7516205692678941, pvalue=0.0)
