In [1]:
import pandas as pd
import numpy as np
import h5py
import sys
sys.path.append('../')
import utils
# Cell-type identifiers iterated over by every loop below; each one is the
# filename prefix of that cell type's input and output files.
cell_type = [
    'K562',
    'HepG2',
    'WTC11',
]
from tqdm import tqdm

## initial data creation

In [2]:
# For each cell type, join the activity table with the sequence table on
# 'name' and store the paired sequence / mean columns in `<ct>_data.h5`.
data_dir = '/home/amber/multitask_RNA/data/lenti_MPRA/'
# Full header of the sequence column in the `<ct>_seq.csv` files.
seq_col = "230nt sequence (15nt 5' adaptor - 200nt element - 15nt 3' adaptor)"
for ct in cell_type:
    data_df = pd.read_csv(data_dir+ct+'_data.csv', usecols=['name','mean'])
    seq_df = pd.read_csv(data_dir+ct+'_seq.csv', usecols=['name', seq_col])
    # The inner join keeps only names present in both tables, so every stored
    # sequence has a matching 'mean' value at the same row.
    select_df = pd.concat(
        [data_df.set_index('name'), seq_df.set_index('name')],
        axis=1,
        join='inner',
    ).rename(columns={seq_col: 'seq'})
    # Open the output only after the inputs were read successfully, so a
    # failed CSV read cannot leave a truncated .h5 file behind; the `with`
    # block closes the handle even if a write raises.
    with h5py.File(data_dir+ct+'_data.h5','w') as out_file:
        out_file.create_dataset('seq', data=select_df['seq'].values)
        out_file.create_dataset('mean', data=select_df['mean'].values)

## onehot conversion

In [3]:
# For each cell type, convert every stored sequence to a one-hot array and
# save it together with the 'mean' values (as 'target') in `<ct>_onehot.h5`.
data_dir = '/home/amber/multitask_RNA/data/lenti_MPRA/'
for ct in cell_type:
    onehot_list = []
    # The input handle was previously never closed; `with` releases it even
    # when the conversion raises part-way through.
    with h5py.File(data_dir+ct+'_data.h5','r') as in_file:
        seq = in_file['seq']
        for i in tqdm(range(len(seq))):
            # Entries are read back as bytes, hence the decode before encoding.
            seq_onehot = utils.seq_to_onehot(seq[i].decode())
            # NOTE(review): the helper's output is transposed before storing;
            # presumably this yields (length, channels) — confirm against
            # utils.seq_to_onehot.
            onehot_list.append(seq_onehot.T)
        target = in_file['mean'][()]
    # Open the output only once all data is ready, so a failure above does
    # not leave a truncated file on disk.
    with h5py.File(data_dir+ct+'_onehot.h5','w') as out_file:
        out_file.create_dataset('onehot', data=np.array(onehot_list))
        out_file.create_dataset('target', data=target)
    

100%|██████████| 226254/226254 [00:29<00:00, 7591.46it/s]
100%|██████████| 139877/139877 [00:18<00:00, 7670.98it/s]
100%|██████████| 55989/55989 [00:07<00:00, 7719.33it/s]


## reverse complement dataset

In [4]:
# For each cell type, build an augmented dataset in `<ct>_onehot_rc.h5` that
# holds, for every input example, its reverse-complemented one-hot array
# followed by the original array, each paired with the same target value.
data_dir = '/home/amber/multitask_RNA/data/lenti_MPRA/'
for ct in cell_type:
    onehot_list = []
    target_list = []
    # The input handle was previously never closed; `with` releases it even
    # if the loop raises.
    with h5py.File(data_dir+ct+'_onehot.h5','r') as in_file:
        seq = in_file['onehot']
        target = in_file['target']
        for i in tqdm(range(len(seq))):
            label = target[i]
            # NOTE(review): rc_range=(15,215) presumably restricts the reverse
            # complement to the 200nt element between the two 15nt adaptors —
            # confirm against utils.onehot_rc.
            rc_onehot = utils.onehot_rc(seq[i],rc_range=(15,215))
            # Order matters: reverse complement first, then the original, and
            # the targets are appended in the same order to stay aligned.
            onehot_list.append(rc_onehot)
            target_list.append(label)
            onehot_list.append(seq[i])
            target_list.append(label)

    with h5py.File(data_dir+ct+'_onehot_rc.h5','w') as out_file:
        out_file.create_dataset('onehot', data=np.array(onehot_list))
        # Bug fix: write the duplicated targets (2N entries). The original
        # wrote the un-augmented input `target` (N entries), so the saved
        # 'onehot' and 'target' datasets had different lengths.
        out_file.create_dataset('target', data=np.array(target_list))
    

100%|██████████| 226254/226254 [00:05<00:00, 41108.95it/s]
100%|██████████| 139877/139877 [00:03<00:00, 42266.88it/s]
100%|██████████| 55989/55989 [00:01<00:00, 42379.05it/s]
