In [2]:
import importlib
import tensorflow as tf
import pandas as pd
import mpra_model
import h5py
importlib.reload(mpra_model)
import numpy as np
from tqdm import tqdm
import sklearn
from sklearn import model_selection
import scipy.stats
from sklearn.linear_model import Ridge
import os
os.environ['CUDA_VISIBLE_DEVICES']='2'

## Ridge/MLP Model Training

In [2]:
ct_list = ['HepG2','K562']
# Parallel result columns: one entry is appended to each list per evaluated model.
model_list = []
LLM_list = []
perf_list = []
celltype_list = []


def _record_result(llm_name, head_name, score, cell_type):
    """Append one evaluation row to the four parallel result lists."""
    LLM_list.append(llm_name)
    model_list.append(head_name)
    perf_list.append(score)
    celltype_list.append(cell_type)


def _pool_embeddings(dataset, has_cls):
    """Reduce per-position embeddings to one feature vector per sample.

    dataset : h5py dataset or ndarray indexed as [sample, position, feature].
    has_cls : if True, position 0 is returned separately as the CLS embedding
              and excluded from the mean; if False, all positions are averaged.

    Returns (mean_embed, cls_embed); cls_embed is None when has_cls is False.
    """
    if not has_cls:
        return np.mean(dataset[...],axis=1), None
    # Integer-index position 0 instead of np.squeeze(x[:, :1, :]): squeeze would
    # also drop the sample (or feature) axis whenever that axis has length 1.
    cls_embed = dataset[:,0,:]
    mean_embed = np.mean(dataset[:,1:,:],axis=1)
    return mean_embed, cls_embed


def _mlp_test_pearson(feat_train, feat_valid, feat_test, target_train, target_valid, target_test, factor):
    """Train mpra_model.rep_mlp on one feature set and return the test Pearson r.

    Training uses Adam (lr 1e-4), MSE loss, early stopping (patience 10, best
    weights restored) and ReduceLROnPlateau on val_loss, for at most 100 epochs.
    `factor` is forwarded unchanged to mpra_model.rep_mlp.
    """
    model = mpra_model.rep_mlp(feat_train.shape[1],factor=factor)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    earlyStopping_callback = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=5, min_lr=1e-8)
    model.compile(optimizer=optimizer,
                    loss='mean_squared_error',
                    metrics=['mse'])
    model.fit(
            feat_train,target_train,
            epochs=100,
            batch_size=512,
            shuffle=True,
            validation_data=(feat_valid,target_valid),
            callbacks=[earlyStopping_callback,reduce_lr],
            verbose=0,)
    y_pred = model.predict(feat_test)
    return scipy.stats.pearsonr(np.squeeze(y_pred),target_test)[0]


for model_name in ['hyena','gpn','dnabert','randbert']:
    # CLS token embedding doesn't represent a summary representation for conv based methods
    has_cls = model_name not in ('gpn','hyena')
    for ct in ct_list:
        print('#######Starting with ' + model_name + ' embeddings for ' + ct +'########')
        # Context manager closes the HDF5 handle once the pooled features and
        # targets are in memory; previously every iteration leaked an open file.
        with h5py.File('../data/lenti_MPRA_embed/'+model_name+'_'+ct+'.h5', 'r') as f:
            mean_train, cls_train = _pool_embeddings(f['x_train'], has_cls)
            mean_valid, cls_valid = _pool_embeddings(f['x_valid'], has_cls)
            mean_test, cls_test = _pool_embeddings(f['x_test'], has_cls)
            y_train = f['y_train'][...]
            y_valid = f['y_valid'][...]
            y_test = f['y_test'][...]

        # Ridge regression
        print('Ridge regression for CLS and Mean Embed')
        embed_model = Ridge(0.001).fit(mean_train, y_train)
        _record_result(model_name, 'Mean-embed-Ridge',
                       scipy.stats.pearsonr(embed_model.predict(mean_test)[:,0],y_test[:,0])[0],
                       ct)

        if cls_train is not None:
            embed_model = Ridge(0.001).fit(cls_train, y_train)
            _record_result(model_name, 'CLS-Ridge',
                           scipy.stats.pearsonr(embed_model.predict(cls_test)[:,0],y_test[:,0])[0],
                           ct)

        ## MLP model
        for factor in [0.5,1,2]:
            print('MLP for mean embedding training...')
            mlp_score = _mlp_test_pearson(mean_train, mean_valid, mean_test,
                                          y_train[:,0], y_valid[:,0], y_test[:,0], factor)
            _record_result(model_name, 'Mean-embed-MLP%1.1f'%factor, mlp_score, ct)

            if cls_train is not None:
                print('MLP for cls training...')
                mlp_score = _mlp_test_pearson(cls_train, cls_valid, cls_test,
                                              y_train[:,0], y_valid[:,0], y_test[:,0], factor)
                _record_result(model_name, 'CLS-MLP%1.1f'%factor, mlp_score, ct)

            # Models are local to the helper and already released; drop the Keras
            # global state so memory does not grow across the sweep.
            tf.keras.backend.clear_session()
        

#######Starting with hyena embeddings for HepG2########
MLP for mean embedding training...


2024-05-28 15:01:00.427457: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79078 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:85:00.0, compute capability: 8.0
2024-05-28 15:01:01.582622: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-05-28 15:01:01.602813: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f52e452ef20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-28 15:01:01.602849: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-05-28 15:01:01.606237: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-28 15:01:01.62200

#######Starting with hyena embeddings for K562########
MLP for mean embedding training...
#######Starting with gpn embeddings for HepG2########
MLP for mean embedding training...
#######Starting with gpn embeddings for K562########
MLP for mean embedding training...
#######Starting with dnabert embeddings for HepG2########
MLP for mean embedding training...
MLP for cls training...
MLP for cls training...
MLP for cls training...
#######Starting with dnabert embeddings for K562########
MLP for mean embedding training...
MLP for cls training...
MLP for cls training...
MLP for cls training...
#######Starting with randbert embeddings for HepG2########
MLP for mean embedding training...
MLP for cls training...
MLP for cls training...
MLP for cls training...
#######Starting with randbert embeddings for K562########
MLP for mean embedding training...
MLP for cls training...
MLP for cls training...
MLP for cls training...


In [4]:
# One row per (LLM, downstream model, cell type) evaluation from the Ridge/MLP sweep.
perf_df = pd.DataFrame({'LLM':LLM_list,'Model':model_list,'Performance':perf_list,'Cell Type':celltype_list})
# to_csv does not create missing parent directories; make sure ./results exists
# so a long training sweep is not lost to an OSError at save time.
os.makedirs('./results', exist_ok=True)
perf_df.to_csv('./results/LLM_regression.csv',index=False)

## CNN Model Training

In [3]:
## CNN structure hyperparameters
# Passed as-is to mpra_model.rep_cnn; 'l_rate' is additionally read by
# CNN_downstream_training as the Adam learning rate.
cnn_config = dict(
    activation='exponential',
    reduce_dim=128,
    conv1_filter=196,
    conv1_kernel=7,
    dropout1=0.2,
    res_filter=5,
    res_layers=3,
    res_pool=5,
    res_dropout=0.2,
    conv2_filter=256,
    conv2_kernel=7,
    pool2_size=4,
    dropout2=0.2,
    dense=512,
    dense2=256,
    l_rate=0.0001,
)

In [4]:
def CNN_downstream_training(data_file,cnn_config,model_save_path,factor=1):
    """Train the downstream CNN on one embedding file and score it on the test split.

    Parameters
    ----------
    data_file : str
        Path to an HDF5 file containing the datasets ``x_train``, ``x_valid``,
        ``x_test``, ``y_train``, ``y_valid`` and ``y_test``.
    cnn_config : dict
        Hyperparameters forwarded to ``mpra_model.rep_cnn``; ``cnn_config['l_rate']``
        is also used as the Adam learning rate.
    model_save_path : str
        Path where ``ModelCheckpoint`` stores the model with the lowest ``val_loss``.
    factor : number, default 1
        Forwarded unchanged to ``mpra_model.rep_cnn``.

    Returns
    -------
    The result of ``scipy.stats.pearsonr`` between the squeezed test predictions
    and the squeezed ``y_test``; element 0 is the correlation coefficient.
    """
    # Keep the file open only for the duration of training/prediction; the
    # context manager closes it even if fit/predict raises.
    with h5py.File(data_file,'r') as f:
        #Load dataset (datasets are passed to Keras as-is, not copied into memory here)
        x_train = f['x_train']
        x_valid = f['x_valid']
        x_test = f['x_test']
        y_train = f['y_train']
        y_valid = f['y_valid']

        #Construct model and training choices
        model = mpra_model.rep_cnn(x_train[0].shape,cnn_config,factor = factor)
        optimizer = tf.keras.optimizers.Adam(learning_rate=cnn_config['l_rate'])
        loss = tf.keras.losses.MeanSquaredError()
        model.compile(optimizer=optimizer,loss=loss,metrics=['mse'])
        earlyStopping_callback = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
                    monitor='val_loss', factor=0.2,
                    patience=5, min_lr=1e-8)
        #save trained model
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
                                        model_save_path,
                                        monitor='val_loss',
                                        save_best_only=True,
                                        mode = 'min',
                                        save_freq='epoch',)
        model.fit(
            x_train,y_train,
            epochs=100,
            batch_size=64,
            shuffle=False,
            verbose=0,
            validation_data = (x_valid,y_valid),
            callbacks=[earlyStopping_callback,reduce_lr,checkpoint])

        y_pred = model.predict(x_test)
        # Materialise the test targets before the file handle is closed.
        y_true = f['y_test'][...]

    # This function is called repeatedly in a sweep: release the model and reset
    # Keras global state so memory does not accumulate across runs.
    del model
    tf.keras.backend.clear_session()

    return scipy.stats.pearsonr(np.squeeze(y_pred),np.squeeze(y_true))


In [None]:
# Parallel result columns for the CNN sweep; each run appends one entry to every list.
llm, ds_model, perf, c_t = [], [], [], []

# Flattened sweep in the same order as nesting embedding source > cell type > factor.
cnn_runs = [
    (source_name, cell_type, scale)
    for source_name in ['gpn','dnabert','hyena','randbert','sei']
    for cell_type in ['HepG2','K562']
    for scale in [0.5,1,2]
]

for model, ct, factor in cnn_runs:
    print('Training for %s on %s data'%(model,ct))
    data_file = '../data/lenti_MPRA_embed/%s_%s.h5'%(model,ct)
    model_save = '../model/lenti_MPRA/%s%1.1f_%s.h5'%(model,factor,ct)
    # Element 0 of the pearsonr result is the correlation coefficient.
    pr_perf = CNN_downstream_training(data_file,cnn_config,model_save,factor=factor)[0]
    llm.append(model)
    ds_model.append('CNN%1.1f'%factor)
    perf.append(pr_perf)
    c_t.append(ct)

Training for gpn on HepG2 data


2024-05-28 16:16:21.946214: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79078 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:85:00.0, compute capability: 8.0
2024-05-28 16:16:23.834890: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8800
2024-05-28 16:16:24.150785: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-05-28 16:16:24.160100: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x55f4324f84e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-28 16:16:24.160132: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA A100 80GB PCIe, Compute Capability 8.0
2024-05-28 16:16:24.238514: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:26

Training for gpn on HepG2 data
Training for gpn on HepG2 data


In [6]:
# One row per (LLM, CNN variant, cell type) evaluation from the CNN sweep.
perf_df = pd.DataFrame({'LLM':llm,'Model':ds_model,'Performance':perf,'Cell Type':c_t})
# to_csv does not create missing parent directories; make sure ./results exists
# so a long training sweep is not lost to an OSError at save time.
os.makedirs('./results', exist_ok=True)
perf_df.to_csv('./results/LLM_CNN.csv',index=False)