In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook ran with,
# so the captured outputs can be tied to a concrete environment.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.17.2
pandas 0.25.1
sklearn 0.21.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [None]:
# Directory whose files are listed (by "train"/"valid"/"test" prefix) and read as CSV below.
source_dir = './generated_csv'

In [None]:
def get_filenames_by_prefix(source_dir,prefix):
    '''
    Collect the paths of entries in `source_dir` whose names start with `prefix`.

    Each returned path is `source_dir` joined with the entry name. Entries are
    reported in whatever order `os.listdir` yields them.
    '''
    matches = []
    for entry in os.listdir(source_dir):
        if entry.startswith(prefix):
            matches.append(os.path.join(source_dir, entry))
    return matches

# One file list per data split, selected by filename prefix inside source_dir.
train_filenames, valid_filenames, test_filenames = (
    get_filenames_by_prefix(source_dir, split_prefix)
    for split_prefix in ("train", "valid", "test")
)

In [None]:
def parse_csv_line(line,n_fileds = 9):
    '''
    Decode one CSV text line into (x, y).

    line: a single CSV record as accepted by `tf.io.decode_csv`.
    n_fileds: number of comma-separated fields expected on the line.

    Every field uses NaN as its record default, so fields are decoded as
    floats. Returns the list of all fields but the last as x, and the last
    field as y.
    '''
    record_defaults = [np.nan for _ in range(n_fileds)]
    columns = tf.io.decode_csv(line, record_defaults)
    return columns[:-1], columns[-1]

In [None]:
def csv_reader_dataset(
    file_names
    , n_readers = 5
    , n_parse_threads = 5
    , batch_size = 32
    , shuffle_buffer_size = 10000
):
    '''
    Build a shuffled, batched, endlessly repeating dataset of parsed CSV lines.

    file_names: file path(s) or pattern(s) handed to `tf.data.Dataset.list_files`.
    n_readers: `cycle_length` of the interleave, i.e. how many files are read in turn.
    n_parse_threads: `num_parallel_calls` used when mapping `parse_csv_line`.
    batch_size: number of parsed lines per batch.
    shuffle_buffer_size: buffer size passed to `Dataset.shuffle`.

    Returns a `tf.data.Dataset` whose elements are batches of whatever
    `parse_csv_line` returns. The file list is repeated, so the dataset never
    ends on its own; consumers must bound it (e.g. with `take`).
    '''
    dataset = tf.data.Dataset.list_files(file_names)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        # skip(1) drops the first line of every file (presumably a header row).
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    # Dataset transformations return a new dataset instead of mutating in
    # place; the result must be reassigned or the shuffle is silently lost.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [None]:
batch_size = 32
# Build the three split datasets with the same batch size.
train_set, valid_set, test_set = (
    csv_reader_dataset(split_filenames, batch_size = batch_size)
    for split_filenames in (train_filenames, valid_filenames, test_filenames)
)

In [None]:
def serialize_example(x,y):
    '''
    Converts x, y to tf.train.Example and serialize

    x: the input feature values of one example (tensor, array or sequence of floats).
    y: the target value(s) of the same example; a scalar is accepted.

    Returns the serialized `tf.train.Example` bytes, holding two float-list
    features: 'input_features' (from x) and 'label' (from y).
    '''
    # FloatList needs an iterable of floats. Flattening to 1-D makes a scalar
    # label (0-d tensor) work the same way as a length-1 vector.
    x_float_list = tf.train.FloatList(value=np.ravel(x))
    y_float_list = tf.train.FloatList(value=np.ravel(y))

    # The Example proto expects a `Features` message whose `feature` map holds
    # `Feature` entries (not `Example` / `FeatureList`).
    features = tf.train.Features(feature = {
        'input_features': tf.train.Feature(float_list = x_float_list)
        ,'label': tf.train.Feature(float_list = y_float_list)
    })
    example = tf.train.Example(features=features)
    return example.SerializeToString()

In [None]:
def csv_dataset_to_tfrecords(
    base_filename
    , dataset
    , n_shards
    , steps_per_shard
    , compression_type = None
):
    '''
    Write batches from `dataset` into `n_shards` TFRecord files.

    base_filename: path prefix; shard i is written to
        '<base_filename>_<i:05d>-of-<n_shards:05d>'.
    dataset: iterable dataset yielding (x_batch, y_batch) pairs.
    n_shards: number of files to write.
    steps_per_shard: number of batches written into each file.
    compression_type: forwarded to `tf.io.TFRecordOptions`.

    Returns the list of all shard file paths, in shard order.
    '''
    options = tf.io.TFRecordOptions(compression_type=compression_type)
    all_filenames = []

    # A single iterator feeds all shards. Restarting the pipeline per shard
    # with skip()/take() re-reads every skipped batch and, because the file
    # order is reshuffled on each restart, lets shards overlap.
    batches = iter(dataset)

    for shard_id in range(n_shards):
        filename_fullpath = '{}_{:05d}-of-{:05d}'.format(
            base_filename, shard_id, n_shards)
        with tf.io.TFRecordWriter(filename_fullpath,options=options) as writer:
            for _ in range(steps_per_shard):
                batch = next(batches, None)
                if batch is None:
                    # A finite dataset ran out; remaining shards stay empty.
                    break
                x_batch, y_batch = batch
                for x_example,y_example in zip(x_batch,y_batch):
                    writer.write(
                        serialize_example(x_example,y_example)
                    )
        all_filenames.append(filename_fullpath)
    # Return only after the loop, so that every shard is written and listed.
    return all_filenames

In [None]:
n_shards = 20
# Batches per shard = example count // batch_size // n_shards.
# NOTE(review): 11610 / 3880 / 5170 look like the train/valid/test example
# counts of the CSV data; confirm against the files that were generated.
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3880 // batch_size // n_shards
test_steps_per_shard = 5170 // batch_size // n_shards

output_dir = "generate_tfrecords"
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(output_dir, exist_ok=True)

train_basename = os.path.join(output_dir, "train")
valid_basename = os.path.join(output_dir, "valid")
test_basename = os.path.join(output_dir, "test")

train_tfrecord_filenames = csv_dataset_to_tfrecords(
    train_basename, train_set, n_shards, train_steps_per_shard, None)
valid_tfrecord_filenames = csv_dataset_to_tfrecords(
    valid_basename, valid_set, n_shards, valid_steps_per_shard, None)
test_tfrecord_filenames = csv_dataset_to_tfrecords(
    test_basename, test_set, n_shards, test_steps_per_shard, None)
# Alias kept for any cell that still uses the original misspelled name.
test_tfrecord_fielnames = test_tfrecord_filenames