<a href="https://colab.research.google.com/github/aaronjoseph/siim-covid19-detection/blob/master/EDA_2_TF_Records.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import tensorflow as tf
print(tf.__version__)

2.5.0


In [6]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    # If the value is an eager tensor BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() 
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [8]:
#strings needs to be converted into bytes.
print(_bytes_feature(b'ome string'))
print(_float_feature(0.5))
print(_int64_feature(True))
print(_int64_feature(1))

bytes_list {
  value: "ome string"
}

float_list {
  value: 0.5
}

int64_list {
  value: 1
}

int64_list {
  value: 1
}



In [9]:
import numpy as np
a = np.random.randn(2,2)
_bytes_feature(tf.io.serialize_tensor(a))

bytes_list {
  value: "\010\002\022\010\022\002\010\002\022\002\010\002\" Y\270\023\031J\367\335\277F\261tt\325\306\344?\233\236\371^{\013\371\277\364QJy\276O\360\277"
}

In [11]:
n_observations = 1000
feature0 = np.random.choice([False,True],n_observations)
feature1 = np.random.randn(n_observations)
strings = np.array([b'cat',b'dog'])
feature2 = np.random.choice(strings,n_observations)
feature3 = np.random.randn(n_observations,2,2)

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((feature0,feature1,feature2,feature3))

In [13]:
def create_example(feature0,feature1,feature2,feature3):
    feature = {
        'feature0': _int64_feature(feature0),
        'feature1': _float_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _bytes_feature(feature3),
    }

    example_proto = tf.train.Example(features = tf.train.Features(feature = feature))
    return example_proto

In [16]:
for feature0, feature1, feature2, feature3 in dataset.take(1):
    example_proto = create_example(feature0,
                                   feature1,
                                   feature2,
                                   tf.io.serialize_tensor(feature3))
    print(example_proto)

features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      float_list {
        value: 0.13842745125293732
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "dog"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      bytes_list {
        value: "\010\002\022\010\022\002\010\002\022\002\010\002\" \242.%\314\241\271\346\277\331\372\215\271m~\362?SDl\267Fe\372\277\263\032v\017E\374\347\277"
      }
    }
  }
}



In [17]:
def serialize_example(feature0, feature1, feature2, feature3):
    feature = {
      'feature0': _int64_feature(feature0),
      'feature1': _float_feature(feature1),
      'feature2': _bytes_feature(feature2),
      'feature3': _bytes_feature(feature3),
    }
    # Create a Features message using tf.train.Example.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [18]:
for feature0, feature1, feature2, feature3 in dataset.take(1):
  serialized_example = serialize_example(feature0, 
                                 feature1, 
                                 feature2, 
                                 tf.io.serialize_tensor(feature3))
  print(serialized_example)

b'\n~\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x14\n\x08feature1\x12\x08\x12\x06\n\x04\xed\xbf\r>\n\x13\n\x08feature2\x12\x07\n\x05\n\x03dog\n>\n\x08feature3\x122\n0\n.\x08\x02\x12\x08\x12\x02\x08\x02\x12\x02\x08\x02" \xa2.%\xcc\xa1\xb9\xe6\xbf\xd9\xfa\x8d\xb9m~\xf2?SDl\xb7Fe\xfa\xbf\xb3\x1av\x0fE\xfc\xe7\xbf'


In [19]:
file_path = 'data.tfrecords'
with tf.io.TFRecordWriter(file_path) as writer:
    for feature0, feature1 , feature2, feature3 in dataset:
        serialized_example = serialize_example(feature0, feature1, feature2, tf.io.serialize_tensor(feature3))
    writer.write(serialized_example)

In [20]:
file_paths = [file_path]
tfrecord_dataset = tf.data.TFRecordDataset(file_paths)

In [21]:
def read_tfrecord(serialized_example):
    feature_description = {
            'feature0': tf.io.FixedLenFeature((), tf.int64),
            'feature1': tf.io.FixedLenFeature((), tf.float32),
            'feature2': tf.io.FixedLenFeature((), tf.string),
            'feature3': tf.io.FixedLenFeature((), tf.string),
    }
    example = tf.io.parse_single_example(serialized_example, feature_description)
  
    feature0 = example['feature0']
    feature1 = example['feature1']
    feature2 = example['feature2']
    feature3 = tf.io.parse_tensor(example['feature3'], out_type = tf.float64)
    
    return feature0, feature1, feature2, feature3

In [22]:
parsed_dataset = tfrecord_dataset.map(read_tfrecord)
for data in parsed_dataset.take(2):
  print(data)

(<tf.Tensor: shape=(), dtype=int64, numpy=1>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0239651>, <tf.Tensor: shape=(), dtype=string, numpy=b'dog'>, <tf.Tensor: shape=(2, 2), dtype=float64, numpy=
array([[-0.27805062,  1.01309051],
       [-0.86768141, -0.59519124]])>)
