In [1]:
import h5py
import sys
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
sys.path.append('/home/ztang/multitask_RNA/data_generation')
import utils
import numpy as np
import haiku as hk
import jax
import jax.numpy as jnp
from nucleotide_transformer.pretrained import get_pretrained_model
from tqdm import tqdm
import json
import tensorflow as tf
model_name = '2B5_1000G'
if '2B5' in model_name:
    print('2B5_model')
    embed_layer = 32
else:
    print('500M model')
    embed_layer = 24
cell_name = 'HepG2'
include_seq = True



2B5_model


In [2]:
def feature_bytes(values):
  """Convert numpy arrays to bytes features."""
  values = values.flatten().tostring()
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))

def feature_str(values):
  """Convert str to bytes features."""
  # value = np.array(values)
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))

def feature_floats(values):
  """Convert numpy arrays to floats features.
     Requires more space than bytes."""
  values = values.flatten().tolist()
  return tf.train.Feature(float_list=tf.train.FloatList(value=values))

In [3]:
data_file = h5py.File('/home/ztang/multitask_RNA/data/lenti_MPRA/'+cell_name+'_data.h5', 'r')
sequence = data_file['seq'][()]
target = data_file['mean'][()]

In [4]:
parameters, forward_fn, tokenizer, config = get_pretrained_model(
    model_name=model_name,
    mixed_precision=False,
    embeddings_layers_to_save=(embed_layer,),
    attention_maps_to_save=(),
    max_positions=41,
)
forward_fn = hk.transform(forward_fn)

In [5]:
N,  = sequence.shape
seq_pair = []
seq_onehot = []
for i in tqdm(range(N)):
    seq = sequence[i].decode()
    seq_onehot.append(utils.seq_to_onehot(seq))
    token_out = tokenizer.batch_tokenize([seq])
    token_id = [b[1] for b in token_out]
    seq_pair.append(np.squeeze(token_id))
#seq_pair = jnp.asarray(seq_pair,dtype=jnp.int32)


100%|██████████| 139877/139877 [00:11<00:00, 12135.83it/s]


In [6]:
#get embedding per input sequence
batch_size = 1024
lenti_embed = []
random_key = jax.random.PRNGKey(0)
for i in tqdm(range(0, N, batch_size)):
    seq_batch = jnp.asarray(seq_pair[i:i+batch_size],dtype=jnp.int32)
    outs = forward_fn.apply(parameters, random_key, seq_batch)
    lenti_embed.extend(np.array(outs['embeddings_'+str(embed_layer)]))

100%|██████████| 137/137 [08:32<00:00,  3.74s/it]


In [7]:
target = np.array(target)
lenti_embed = np.array(lenti_embed)
seq_onehot = np.array(seq_onehot,dtype=np.float32)

In [10]:
all_index = np.random.permutation(len(target))
train_index = all_index[:int(0.8*len(target))]
valid_index = all_index[int(0.8*len(target)):int(0.9*len(target))]
test_index = all_index[int(0.9*len(target)):]
num_samples = 512
if include_seq:
    tfr_dir = '/home/ztang/multitask_RNA/data/lenti_MPRA_embed/'+cell_name+'_seq_'+model_name+'/'
else:
    tfr_dir = '/home/ztang/multitask_RNA/data/lenti_MPRA_embed/'+cell_name+'_'+model_name+'/'

for dataset in ['test','valid','train']:
    print('saving '+dataset)
    index = globals()[dataset+ '_index'] 
    sub_target = target[index]
    sub_embed = lenti_embed[index]
    if include_seq:
        sub_seq = seq_onehot[index]

    num_tfrecords = len(index) // num_samples
    print('tfr directory: ', tfr_dir)
    if len(index) % num_samples:
        num_tfrecords += 1
    if not os.path.exists(tfr_dir):
        os.makedirs(tfr_dir)
        os.makedirs(tfr_dir+'/tfrecords')
    tfr_file_dir = tfr_dir+'/tfrecords'
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')
    for tfrec_num in tqdm(range(num_tfrecords)):
        end = ((tfrec_num + 1) * num_samples)
        end = end if end < len(index) else len(index)-1
        idx_range = range((tfrec_num * num_samples) , end)

        with tf.io.TFRecordWriter(
            tfr_file_dir+'/'+dataset+ "-%d.tfr" % tfrec_num, tf_opts
        ) as writer:
            for idx in idx_range:
                if include_seq:
                    features_dict = {
                    'sequence': feature_bytes(sub_embed[idx,:,:].astype('float16')),
                    'target': feature_bytes(sub_target[idx].astype('float16')),
                    'onehot' : feature_bytes(sub_seq[idx,:,:].astype('float16'))
                    }
                else:
                    features_dict = {
                    'sequence': feature_bytes(sub_embed[idx,:,:].astype('float16')),
                    'target': feature_bytes(sub_target[idx].astype('float16'))
                    }
                example = tf.train.Example(features=tf.train.Features(feature=features_dict))
                writer.write(example.SerializeToString())

saving test
tfr directory:  /home/ztang/multitask_RNA/data/lenti_MPRA_embed/HepG2_seq_2B5_1000G/


  values = values.flatten().tostring()
100%|██████████| 28/28 [01:29<00:00,  3.21s/it]


saving valid
tfr directory:  /home/ztang/multitask_RNA/data/lenti_MPRA_embed/HepG2_seq_2B5_1000G/


100%|██████████| 28/28 [01:30<00:00,  3.24s/it]


saving train
tfr directory:  /home/ztang/multitask_RNA/data/lenti_MPRA_embed/HepG2_seq_2B5_1000G/


100%|██████████| 219/219 [12:02<00:00,  3.30s/it]


In [11]:
stats_dict = {}
stats_dict['num_targets'] = 1
stats_dict ['onehot_length']=seq_onehot.shape[1]
stats_dict['embed_length'] =lenti_embed.shape[1]
stats_dict['embed_dim'] =lenti_embed.shape[2]
stats_dict['crop_bp'] = 0
stats_dict['train_seqs'] = len(train_index)
stats_dict['valid_seqs'] =len(valid_index)
stats_dict['test_seqs'] = len(test_index)

with open('%s/statistics.json' % tfr_dir, 'w') as stats_json_out:
  json.dump(stats_dict, stats_json_out, indent=4)

# Move to tfr

In [1]:
import tensorflow as tf
import os
import json
import h5py
from tqdm import tqdm

tfr_dir = '/home/ztang/multitask_RNA/data/lenti_MPRA_embed/HepG2_2B_1000G/'
filtered_file = h5py.File('/home/ztang/multitask_RNA/data/lenti_MPRA/HepG2_data_embed.h5','r')


2023-04-28 14:04:30.691429: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def feature_bytes(values):
  """Convert numpy arrays to bytes features."""
  values = values.flatten().tostring()
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))

def feature_str(values):
  """Convert str to bytes features."""
  # value = np.array(values)
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))

def feature_floats(values):
  """Convert numpy arrays to floats features.
     Requires more space than bytes."""
  values = values.flatten().tolist()
  return tf.train.Feature(float_list=tf.train.FloatList(value=values))

In [3]:
def h5_to_tfr(dataset,filtered_file,tfr_dir):
    num_samples = 512
    num_tfrecords = len(filtered_file[dataset+'_x']) // num_samples
    print('dataset size: '+ str(len(filtered_file[dataset+'_x'])))
    if len(filtered_file[dataset+'_x']) % num_samples:
        num_tfrecords += 1
    if not os.path.exists(tfr_dir):
        os.makedirs(tfr_dir)
        os.makedirs(tfr_dir+'/tfrecords')
    tfr_file_dir = tfr_dir+'/tfrecords'
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')
    for tfrec_num in tqdm(range(num_tfrecords)):
        end = ((tfrec_num + 1) * num_samples)
        end = end if end < len(filtered_file[dataset+'_x']) else len(filtered_file[dataset+'_x'])-1
        idx_range = range((tfrec_num * num_samples) , end)

        with tf.io.TFRecordWriter(
            tfr_file_dir+'/'+dataset+ "-%d.tfr" % tfrec_num, tf_opts
        ) as writer:
            for idx in idx_range:
                features_dict = {
                'sequence': feature_bytes(filtered_file[dataset+'_x'][idx,:,:].astype('float16')),
                'target': feature_bytes(filtered_file[dataset+'_y'][idx].astype('float16'))
                }
                example = tf.train.Example(features=tf.train.Features(feature=features_dict))
                writer.write(example.SerializeToString())

In [4]:
h5_to_tfr('test',filtered_file,tfr_dir)

dataset size: 13988


  values = values.flatten().tostring()
  0%|          | 0/28 [00:22<?, ?it/s]


KeyboardInterrupt: 

In [None]:
stats_dict = {}
stats_dict['num_targets'] = filtered_file['train_y'].shape[-1]
stats_dict['seq_length'] = filtered_file['train_x'].shape[1]
stats_dict['pool_width'] = 1
stats_dict['crop_bp'] = 0
stats_dict['target_length'] = filtered_file['train_y'].shape[1]
stats_dict['train_seqs'] = filtered_file['train_x'].shape[0]
stats_dict['valid_seqs'] = filtered_file['valid_x'].shape[0]

with open('%s/statistics.json' % tfr_dir, 'w') as stats_json_out:
  json.dump(stats_dict, stats_json_out, indent=4)