In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

# function to parse tfrecord row
def generate_tfr_parser(raw=False):
    """Build the function that decodes one serialized TFRecord example.

    Args:
        raw: Accepted for call compatibility; it is not read anywhere in
            this function.

    Returns:
        A callable taking a single serialized example and returning the
        tuple (sequence, species, mask, repeat_mask) of decoded tensors.
    """

    def parse_proto(example_protos):
        """Decode the four byte-string fields of one serialized example."""

        # every field is stored as a single scalar byte string
        byte_fields = ('sequence', 'species', 'mask', 'repeat_mask')
        schema = {key: tf.io.FixedLenFeature([], tf.string) for key in byte_fields}

        record = tf.io.parse_single_example(example_protos, features=schema)

        def as_tensor(key, out_type):
            # reinterpret the raw bytes of one field as a flat tensor of out_type
            return tf.io.decode_raw(record[key], out_type)

        # species is decoded as int32; the other three fields as uint8
        return (
            as_tensor('sequence', tf.uint8),
            as_tensor('species', tf.int32),
            as_tensor('mask', tf.uint8),
            as_tensor('repeat_mask', tf.uint8),
        )

    return parse_proto

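# Reference only (not executed): feature layout, presumably as built by the
# script that wrote these TFRecords — TODO confirm against the writer.
# features_dict = {
#     'sequence': feature_bytes(seq_1hot),
#     'mask': feature_bytes(mask),
#     'repeat_mask': feature_bytes(repeat_mask),
#     'species': feature_bytes(species_arr),
# }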

# function to convert tfrecord to numpy arrays
def tfr_to_numpy(dataset, return_inputs=True, return_species=True, return_masks=True, return_repeat_masks=True, dtype='float16', seq_length=16384):
    """Convert a dataset of serialized TFRecord examples to stacked numpy arrays.

    Every record is parsed with ``generate_tfr_parser`` and read one at a time
    (batch size 1); each requested field is stacked along a new leading axis.

    Args:
        dataset: Dataset of serialized examples (e.g. a ``tf.data.TFRecordDataset``).
        return_inputs: Include the sequence array, shape (N, seq_length, 1).
        return_species: Include the species array, shape (N, 1), dtype int32.
        return_masks: Include the mask array, shape (N, seq_length).
        return_repeat_masks: Include the repeat-mask array, shape (N, seq_length).
        dtype: Kept for backward compatibility; it is not applied to any output.
        seq_length: Number of positions stored per record. Defaults to 16384,
            the length that used to be hard-coded.

    Returns:
        A list holding, in the order sequences, species, masks, repeat masks,
        only the arrays whose ``return_*`` flag is True.

    Raises:
        ValueError: If a requested field does not hold ``seq_length`` values
            per record (reshape fails), or if a field is requested but the
            dataset yields no records (nothing to concatenate).
    """
    with tf.name_scope('numpy'):
        dataset = dataset.map(generate_tfr_parser(raw=True))
        dataset = dataset.batch(1)

    # per-record pieces; a list stays empty when its field is not requested
    seqs_1hot = []
    species = []
    masks = []
    repeat_masks = []

    for seq_raw, species_raw, mask_raw, repeat_mask_raw in dataset:

        # sequence -> (1, seq_length, 1)
        if return_inputs:
            seq_1hot = seq_raw.numpy().reshape((seq_length, 1))
            seqs_1hot.append(seq_1hot[None, ...])

        # species -> (1, 1): keep only the first decoded int32 value
        if return_species:
            species1 = species_raw.numpy().astype('int32')
            species1 = np.reshape(species1, (1, -1))
            species.append(species1[None, ..., 0])

        # mask -> (1, seq_length)
        if return_masks:
            mask = mask_raw.numpy().reshape((seq_length,))
            masks.append(mask[None, ...])

        # repeat mask -> (1, seq_length)
        if return_repeat_masks:
            repeat_mask = repeat_mask_raw.numpy().reshape((seq_length,))
            repeat_masks.append(repeat_mask[None, ...])

    # Concatenate only the requested fields: np.concatenate raises on an empty
    # list, which previously made every return_*=False call fail.
    ret_tuple = []
    if return_inputs:
        ret_tuple.append(np.concatenate(seqs_1hot, axis=0))
    if return_species:
        ret_tuple.append(np.concatenate(species, axis=0, dtype='int32'))
    if return_masks:
        ret_tuple.append(np.concatenate(masks, axis=0))
    if return_repeat_masks:
        ret_tuple.append(np.concatenate(repeat_masks, axis=0))

    return ret_tuple



2024-08-09 18:26:30.735273: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-09 18:26:30.735384: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-09 18:26:31.384757: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-09 18:26:32.877643: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read one ZLIB-compressed TFRecord file and unpack it into numpy arrays.
tfr_path = '/scratch4/khc/yeast_ssm/data/yeast/ensembl_fungi_59/test_chrXI_chrXIII_chrXV__valid_chrXII_chrXIV_chrXVI/data_saccharomycetales_gtf/tfrecords/valid-0.tfr'
# Alternative file (disabled):
# tfr_path = '/scratch4/jlinder/seqnn/data/yeast/ensembl_fungi_59_2/data_fungi_sm_gtf/tfrecords/train-101.tfr'
rec0 = tf.data.TFRecordDataset(tfr_path, compression_type='ZLIB')

# x: sequences, y: species, m: mask, rm: repeat mask
x, y, m, rm = tfr_to_numpy(rec0)

print(f"x.shape = {x.shape}")
print(f"y.shape = {y.shape}")
print(f"m.shape = {m.shape}")
print(f"rm.shape = {rm.shape}")


x.shape = (488, 16384, 1)
y.shape = (488, 1)
m.shape = (488, 16384)
rm.shape = (488, 16384)


In [3]:
# Species labels returned by tfr_to_numpy: one int32 row per record.
y

array([[109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
       [109],
      

In [12]:
# Example 0 of TFRecord 'train-0' corresponds to the first row labeled 'train' in sequences.bed.
# Example 10 of TFRecord 'train-1' corresponds to row (512 + 10) labeled 'train' in sequences.bed,
# and so on.
# NOTE(review): the 512 offset presumably assumes 512 examples per TFRecord file — confirm.

# Sequence tokens: despite the "1-hot" name, the stored values are integer codes
# (0-3 in the output below), one per position.
x[0, :10, 0]


array([2, 0, 3, 0, 3, 3, 1, 0, 3, 1], dtype=uint8)

In [13]:
# First 1000 sequence tokens of record 0.
x[0, :1000, 0]

array([2, 0, 3, 0, 3, 3, 1, 0, 3, 1, 3, 3, 1, 2, 2, 0, 1, 0, 1, 3, 2, 2,
       1, 0, 1, 1, 2, 3, 2, 2, 1, 1, 3, 2, 2, 2, 2, 0, 2, 3, 0, 3, 1, 3,
       3, 3, 2, 3, 3, 2, 2, 1, 3, 0, 2, 3, 3, 1, 3, 3, 1, 1, 0, 1, 3, 2,
       3, 2, 0, 0, 0, 0, 3, 3, 2, 0, 3, 0, 2, 1, 3, 2, 0, 3, 3, 3, 1, 1,
       0, 2, 3, 0, 0, 2, 0, 0, 1, 2, 2, 0, 2, 0, 3, 2, 2, 3, 3, 3, 1, 0,
       1, 1, 0, 2, 3, 2, 1, 0, 2, 3, 1, 3, 0, 3, 1, 0, 1, 1, 1, 2, 1, 3,
       1, 3, 0, 1, 2, 2, 0, 0, 3, 1, 1, 0, 3, 3, 0, 3, 0, 0, 1, 1, 2, 0,
       2, 2, 3, 0, 3, 3, 3, 2, 1, 2, 3, 0, 2, 1, 1, 1, 3, 2, 3, 3, 1, 0,
       3, 1, 3, 3, 2, 0, 2, 3, 3, 1, 1, 0, 0, 1, 0, 1, 2, 0, 3, 0, 2, 0,
       2, 0, 2, 3, 2, 3, 1, 2, 1, 1, 2, 3, 1, 3, 0, 2, 1, 0, 1, 1, 0, 0,
       3, 0, 2, 1, 3, 2, 0, 1, 1, 3, 3, 3, 1, 1, 0, 0, 2, 2, 3, 0, 2, 2,
       0, 2, 1, 1, 0, 2, 1, 2, 0, 0, 1, 0, 3, 3, 2, 1, 3, 3, 2, 0, 1, 0,
       1, 3, 3, 1, 3, 3, 1, 3, 2, 2, 1, 0, 0, 1, 3, 3, 2, 3, 2, 0, 1, 2,
       1, 3, 3, 2, 1, 3, 3, 3, 2, 0, 3, 2, 0, 0, 0,

In [14]:
# Species index (for r64 there is just one species).
# NOTE(review): the saved output here (56) differs from the earlier `y` display (109),
# so the two outputs appear to come from different runs or input files — re-run all
# cells from a fresh kernel before relying on either.
y

array([[56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],
       [56],

In [9]:
# Exon mask (0 for non-coding, 1 for coding bps).
# Count the 1s across the whole first row (all positions, not just the first 10).
count_of_ones = np.sum(m[0, :])
print("Number of 1s:", count_of_ones)


Number of 1s: 12522


In [10]:
# Repeat mask of the first record (this is `rm`, not the exon mask).
# NOTE(review): presumably 1 marks repeat-masked bps — confirm against the TFRecord writer.
# Count the 1s across the whole first row.
count_of_ones = np.sum(rm[0, :])
print("Number of 1s:", count_of_ones)

Number of 1s: 274
