In [1]:
import torch
import os
os.environ['CUDA_VISIBLE_DEVICES'] ='2'
import sys
sys.path.append('../../model/sei_model/')
import sei
from torchinfo import summary
import random
import numpy as np
import h5py
from tqdm import tqdm
import math
import random
import glob

In [2]:
# Load the checkpoint onto the CPU first so that loading does not depend on the
# GPU index the weights were saved from (only one device is exposed through
# CUDA_VISIBLE_DEVICES above); the model is moved to the GPU once it is populated.
file_dict = torch.load('../../model/sei_model/sei.pth', map_location='cpu')
# Drop the first 13 characters of every key so the names line up with the
# parameter names of sei.Sei().
# NOTE(review): presumably this strips a 'module.model.' wrapper prefix
# (13 characters) -- confirm against the checkpoint keys.
clean_dict = {key[13:]: weights for key, weights in file_dict.items()}
model = sei.Sei()
model.load_state_dict(clean_dict)
# Inference only; the trailing ';' keeps the notebook from echoing the module repr.
model.to('cuda').eval();

In [3]:
class embed_extractor():
    """Collects the outputs of hooked modules, grouped under caller-chosen names."""
    def __init__(self):
        # Maps a hook name to the list of arrays gathered so far for that name.
        self.activation = {}
    def get_activation(self,name):
        """Return a forward hook that stores the hooked module's output under `name`.

        Every call of the hook converts the output to a CPU numpy array and
        appends its entries (iterating along the first axis) to
        `self.activation[name]`, creating the list on first use.
        """
        def hook(model, input, output):
            # Look the dict up at call time so reassigning `self.activation`
            # between forward passes starts a fresh collection.
            collected = self.activation.setdefault(name, [])
            collected.extend(output.detach().cpu().numpy())
        return hook

## Lenti-MPRA

In [4]:
# Extract `model.spline_tr` activations for every Lenti-MPRA split and store them,
# together with the matching targets, in a new HDF5 file.
celltype = 'K562'
batch_size = 128
# Context managers close both HDF5 files even if extraction fails part-way;
# the input is opened first so a missing input does not truncate the output.
with h5py.File('../../data/lenti_MPRA/'+celltype+'_data.h5','r') as file, \
     h5py.File('../../data/lenti_MPRA_embed/sei_'+celltype+'.h5','w') as embed_output:
    # Sequences are centred in a 4096-long window; floor/ceil split an odd
    # remainder between the left and right side.
    pad_size = (4096-file['onehot_test'].shape[1])/2
    pad_width = ((0,0),(0,0),(math.floor(pad_size),math.ceil(pad_size)))
    #LentiMPRA
    for dataset in ['onehot_train','onehot_valid','onehot_test']:
        embed = embed_extractor()
        # Keep the handle: without removing the hook, the extractor of every
        # previous split would keep firing and growing on each forward pass.
        hook_handle = model.spline_tr.register_forward_hook(embed.get_activation('s_out'))
        try:
            for i in tqdm(range(0,len(file[dataset]),batch_size)):
                # assumes the stored one-hot layout is (N, length, channels) and the
                # model wants (N, channels, length) -- TODO confirm
                seq = file[dataset][i:i+batch_size].transpose(0,2,1).astype('float32')
                pad_seq = np.pad(seq,pad_width)
                with torch.no_grad():
                    # The return value is not needed; the hook captures the activations.
                    model(torch.from_numpy(pad_seq).to('cuda'))
        finally:
            hook_handle.remove()
        # dataset[6:] drops 'onehot', giving e.g. 'x_train' / 'y_train'.
        embed_output.create_dataset(name='x'+dataset[6:],data = np.array(embed.activation['s_out']))
        embed_output.create_dataset(name='y'+dataset[6:],data = file['y'+dataset[6:]][:])

100%|██████████| 2459/2459 [06:54<00:00,  5.93it/s]
100%|██████████| 308/308 [00:52<00:00,  5.89it/s]
100%|██████████| 308/308 [00:52<00:00,  5.84it/s]


## ChIP-seq/eCLIP

In [None]:
# Extract `model.spline_tr` activations for every ChIP dataset found on disk and
# write one output HDF5 file per input file.
# NOTE(review): this cell reads from '../data' while the Lenti-MPRA cell uses
# '../../data' -- confirm which working directory is intended.
file_list = glob.glob('../data/chip/*.h5')
batch_size = 128
# 200-long inputs are centred in a 4096-long window.
pad_size = (4096-200)/2
pad_width = ((0,0),(0,0),(math.floor(pad_size),math.ceil(pad_size)))
for path in file_list:
    # Strip the last 7 characters of the file name.
    # NOTE(review): presumably a '_200.h5' suffix, which is re-appended below -- confirm.
    tf_name = path.split('/')[-1][:-7]
    # Context managers close both files even if a dataset fails part-way;
    # the input is opened first so a bad input does not truncate the output.
    with h5py.File(path,'r') as file, \
         h5py.File('../data/chip/sei/'+tf_name+'_200.h5','w') as sei_output:
        for label in ('train','valid','test'):
            embed = embed_extractor()
            # Keep the handle: without removing the hook, the extractor of every
            # previous split/file would keep firing and growing on each forward pass.
            hook_handle = model.spline_tr.register_forward_hook(embed.get_activation('s_out'))
            try:
                for i in tqdm(range(0,len(file['x_'+label]),batch_size)):
                    seq = file['x_'+label][i:i+batch_size].astype('float32')
                    pad_seq = np.pad(seq,pad_width)
                    with torch.no_grad():
                        # The return value is not needed; the hook captures the activations.
                        model(torch.from_numpy(pad_seq).to('cuda'))
            finally:
                hook_handle.remove()
            #sanity check: one embedding per label row
            assert len(embed.activation['s_out']) == file['y_'+label].shape[0]
            sei_output.create_dataset(name='x_'+label,data = np.array(embed.activation['s_out']),dtype = 'float32')
            sei_output.create_dataset(name='y_'+label,data = file['y_'+label][:],dtype='int')


In [None]:
# Extract `model.spline_tr` activations for every eCLIP dataset found on disk and
# write one output HDF5 file per input file.
file_list = glob.glob('../data/eclip/*.h5')
batch_size = 128
# 200-long inputs are centred in a 4096-long window.
pad_size = (4096-200)/2
pad_width = ((0,0),(0,0),(math.floor(pad_size),math.ceil(pad_size)))
for path in file_list:
    # Strip the last 7 characters of the file name.
    # NOTE(review): presumably a '_200.h5' suffix, which is re-appended below -- confirm.
    tf_name = path.split('/')[-1][:-7]
    # Context managers close both files even if a dataset fails part-way;
    # the input is opened first so a bad input does not truncate the output.
    with h5py.File(path,'r') as file, \
         h5py.File('../data/eclip/sei/'+tf_name+'_200.h5','w') as sei_output:
        for label in ('train','valid','test'):
            embed = embed_extractor()
            # Keep the handle: without removing the hook, the extractor of every
            # previous split/file would keep firing and growing on each forward pass.
            hook_handle = model.spline_tr.register_forward_hook(embed.get_activation('s_out'))
            try:
                for i in tqdm(range(0,len(file['X_'+label]),batch_size)):
                    # Only the first 4 entries along axis 1 are fed to the model.
                    seq = file['X_'+label][i:i+batch_size][:,:4,:].astype('float32')
                    pad_seq = np.pad(seq,pad_width)
                    with torch.no_grad():
                        # The return value is not needed; the hook captures the activations.
                        model(torch.from_numpy(pad_seq).to('cuda'))
            finally:
                hook_handle.remove()
            #sanity check: one embedding per label row
            assert len(embed.activation['s_out']) == file['Y_'+label].shape[0]
            # Inputs use upper-case 'X_'/'Y_' keys; outputs are written lower-case.
            sei_output.create_dataset(name='x_'+label,data = np.array(embed.activation['s_out']),dtype = 'float32')
            sei_output.create_dataset(name='y_'+label,data = file['Y_'+label][:],dtype='int')


## MT Splice

In [None]:
# Extract `model.spline_tr` activations for the two halves of every splicing
# sequence (first 400 positions / the rest) and store them per split.
batch_size = 32
# 400-long halves are centred in a 4096-long window.
pad_size = (4096-400)/2
pad_width = ((0,0),(0,0),(math.floor(pad_size),math.ceil(pad_size)))
# The output file must stay open for all three splits, so it is closed once by
# the context manager after the loop (closing it inside the loop made every
# split after the first fail on a closed file).
with h5py.File('../data/alternative_splicing/delta_logit.h5','r') as file, \
     h5py.File('../data/alternative_splicing/sei_splice.h5','w') as sei_output:
    for label in ('valid','test','train'):
        embed = embed_extractor()
        # Keep the handle: without removing the hook, extractors from previous
        # splits/cells would keep firing and growing on each forward pass.
        hook_handle = model.spline_tr.register_forward_hook(embed.get_activation('s_out'))
        l_cache = []
        r_cache = []
        try:
            for i in tqdm(range(0,len(file['x_'+label]),batch_size)):
                seq = file['x_'+label][i:i+batch_size].astype('float32')
                seq = np.swapaxes(seq,1,2)
                # Split along the last axis: first 400 positions vs. the remainder.
                l_pad = np.pad(seq[:,:,:400],pad_width)
                r_pad = np.pad(seq[:,:,400:],pad_width)
                with torch.no_grad():
                    # The hook fills embed.activation on each forward pass; move the
                    # result into the matching cache and reset before the next pass
                    # so left and right activations never mix.
                    model(torch.from_numpy(l_pad).to('cuda'))
                    l_cache.extend(embed.activation['s_out'])
                    embed.activation={}
                    model(torch.from_numpy(r_pad).to('cuda'))
                    r_cache.extend(embed.activation['s_out'])
                    embed.activation={}
        finally:
            hook_handle.remove()

        sei_output.create_dataset(name='xl_'+label,data = np.array(l_cache),dtype = 'float32')
        sei_output.create_dataset(name='xr_'+label,data = np.array(r_cache),dtype = 'float32')
        sei_output.create_dataset(name='y_'+label,data = file['y_'+label][:],dtype='float32')


## INSERT-seq

In [4]:
# Extract `model.spline_tr` activations for every INSERT-seq split and store them,
# together with the matching targets, in a new HDF5 file.
batch_size = 32
# Context managers close both HDF5 files even if extraction fails part-way;
# the input is opened first so a missing input does not truncate the output.
with h5py.File('../data/RNAenlong/insert_dataset.h5','r') as file, \
     h5py.File('../data/RNAenlong/sei_embed.h5','w') as sei_output:
    # Sequences are centred in a 4096-long window; floor/ceil split an odd
    # remainder between the left and right side.
    pad_size = (4096-file['X_train'].shape[1])/2
    pad_width = ((0,0),(0,0),(math.floor(pad_size),math.ceil(pad_size)))
    for dataset in ['test','train','valid']:
        key = 'X_'+dataset
        onehot = file[key]
        embed = embed_extractor()
        # Keep the handle: without removing the hook, the extractor of every
        # previous split would keep firing and growing on each forward pass.
        hook_handle = model.spline_tr.register_forward_hook(embed.get_activation('s_out'))
        try:
            for i in tqdm(range(0,len(onehot),batch_size)):
                seq = onehot[i:i+batch_size].astype('float32')
                # assumes the stored layout is (N, length, channels) and the model
                # wants (N, channels, length) -- TODO confirm
                seq = np.swapaxes(seq,1,2)
                pad_seq = np.pad(seq,pad_width)
                with torch.no_grad():
                    # The return value is not needed; the hook captures the activations.
                    model(torch.from_numpy(pad_seq).to('cuda'))
        finally:
            hook_handle.remove()
        sei_output.create_dataset(name=key,data = np.array(embed.activation['s_out']))
        sei_output.create_dataset(name='Y_'+dataset,data = file['Y_'+dataset][:])

  return F.conv1d(input, weight, bias, self.stride,
100%|██████████| 36/36 [00:01<00:00, 22.96it/s]
100%|██████████| 286/286 [00:12<00:00, 23.75it/s]
100%|██████████| 36/36 [00:01<00:00, 23.51it/s]
