In [None]:
import glob
import os
import sys

import gpn.model
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer

sys.path.append('../')
import utils
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

model = AutoModel.from_pretrained("../model/GPN_human/checkpoint-2000000").to('cuda')
tokenizer = AutoTokenizer.from_pretrained("../model/GPN_human/checkpoint-2000000")
model.eval();

## Lenti-MPRA

In [None]:
celltype = 'HepG2'

file = h5py.File('../data/lenti_MPRA/'+celltype+'_data.h5','r')
gpn_output = h5py.File('../data/lenti_MPRA_embed/gpn_'+celltype+'.h5','w')
batch_size = 32
output_cache = []
for i in tqdm(range(0,len(file['seq']),batch_size)):

    seq = file['seq'][i:i+batch_size].astype('U230')
    input_ids = tokenizer(seq.tolist(), return_tensors="pt", return_attention_mask=False, return_token_type_ids=False)["input_ids"]
    with torch.no_grad():
        output_seq = model(input_ids.to('cuda')).last_hidden_state.cpu().detach().numpy()
    output_cache.extend(output_seq)
gpn_output.create_dataset(name='seq',data = np.array(output_cache))
gpn_output.create_dataset(name='mean',data = file['mean'][:])
gpn_output.close()

## ChIP / eCLIP

In [None]:
file_list = glob.glob('../data/chip/*.h5')
for file in file_list:
    tf_name = file.split('/')[-1][:-7]
    gpn_output = h5py.File('../data/chip/GPN/'+tf_name+'_200.h5','w')
    batch_size = 32
    file = h5py.File(file,'r')
    for label in ('train','valid','test'):
        output_cache = []  
        for i in tqdm(range(0,len(file['x_'+label]),batch_size)):
            seq = file['x_'+label][i:i+batch_size].astype('int')
            seq = np.transpose(seq,(0,2,1))
            seq = utils.onehot_to_seq(seq)
            input_ids = tokenizer(seq, return_tensors="pt", return_attention_mask=False, return_token_type_ids=False)["input_ids"]
            with torch.no_grad():
                output_seq = model(input_ids.to('cuda')).last_hidden_state.cpu().detach().numpy()
            output_cache.extend(output_seq)
        gpn_output.create_dataset(name='x_'+label,data = np.array(output_cache),dtype = 'float32')
        gpn_output.create_dataset(name='x_'+label,data = file['y_'+label][:],dtype='int') 
    gpn_output.close()

In [None]:
file_list = glob.glob('../data/eclip/*.h5')
for file in file_list:
    tf_name = file.split('/')[-1][:-7]
    gpn_output = h5py.File('../data/eclip/GPN/'+tf_name+'_200.h5','w')
    batch_size = 32
    file = h5py.File(file,'r')
    for label in ('train','valid','test'):
        output_cache = []  
        for i in tqdm(range(0,len(file['X_'+label]),batch_size)):
            seq = file['X_'+label][i:i+batch_size].astype('int')
            seq = np.transpose(seq,(0,2,1))
            seq = utils.onehot_to_seq(seq)
            input_ids = tokenizer(seq, return_tensors="pt", return_attention_mask=False, return_token_type_ids=False)["input_ids"]
            with torch.no_grad():
                output_seq = model(input_ids.to('cuda')).last_hidden_state.cpu().detach().numpy()
            output_cache.extend(output_seq)
        gpn_output.create_dataset(name='x_'+label,data = np.array(output_cache),dtype = 'float32')
        gpn_output.create_dataset(name='y_'+label,data = file['Y_'+label][:],dtype='int') 
    gpn_output.close()

## MTSplice

In [None]:
file = h5py.File('../data/mtsplice/delta_logit.h5','r')
gpn_output = h5py.File('../data/mtsplice/gpn_mt.h5','w')
batch_size = 32
for label in ('valid','test','train'):
    l_cache = []
    r_cache = [] 
    for i in tqdm(range(0,len(file['x_'+label]),batch_size)):
        seq = file['x_'+label][i:i+batch_size].astype('int')
        seq = utils.onehot_to_seq(seq)
        clean_seq = [s if 'N' not in s else s.replace('N','[PAD]') for s in seq ]
        input_ids = tokenizer(clean_seq, return_tensors="pt", return_attention_mask=False, return_token_type_ids=False)["input_ids"]
        l_input = input_ids[:,:400]
        r_input = input_ids[:,400:]
        with torch.no_grad():
            l_output = model(l_input.to('cuda')).last_hidden_state.cpu().detach().numpy()
            r_output = model(r_input.to('cuda')).last_hidden_state.cpu().detach().numpy()
        l_cache.extend(l_output)
        r_cache.extend(r_output)
    gpn_output.create_dataset(name='xl_'+label,data = np.array(l_cache),dtype = 'float32')
    gpn_output.create_dataset(name='xr_'+label,data = np.array(r_cache),dtype = 'float32')
    gpn_output.create_dataset(name='y_'+label,data = file['y_'+label][:],dtype='float32') 
gpn_output.close()

## INSERT-seq

In [None]:
file = h5py.File('../data/rna_stable/insert_dataset.h5','r')
gpn_output = h5py.File('../data/rna_stable/gpn_human_embed.h5','w')
batch_size = 32
for dataset in ['test','train','valid']:
    key = 'X_'+dataset
    onehot = file[key]
    string_seq = utils.onehot_to_seq(onehot)

    token_seq = tokenizer.batch_encode_plus(string_seq, max_length=512,padding = 'max_length')
    output_cache = []
    for seq_i in tqdm(range(0,len(token_seq['input_ids']),batch_size)):
        seq_batch = torch.tensor(token_seq['input_ids'][seq_i:seq_i+batch_size]).to('cuda')
        output_seq = model(seq_batch).last_hidden_state.cpu().detach().numpy()
        output_cache.extend(output_seq[:,:173,:])
    gpn_output.create_dataset(name=key,data = np.array(output_cache))
    gpn_output.create_dataset(name='Y_'+dataset,data = file['Y_'+dataset][:])
    gpn_output.close()