In [1]:
import os
import sys
import shutil
import numpy as np
import tensorflow as tf
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [22]:
def read_samples(dataset_path, endswith=".csv"):
    """Collect sample file paths and integer labels from a folder-per-class dataset.

    Every immediate sub-directory of ``dataset_path`` is treated as one class.
    Class names are sorted and numbered from 0 in that order. Files located
    directly inside a class directory whose name ends with ``endswith`` become
    the samples of that class.

    Args:
        dataset_path: Root directory containing one sub-directory per class.
        endswith: Filename suffix a file must have to be kept.

    Returns:
        A tuple ``(datapaths, labels, classes)``: a NumPy array of file paths,
        a NumPy array with the matching integer labels, and the sorted list of
        class directory names (``classes[label]`` is the class name).

    Raises:
        StopIteration: If ``dataset_path`` (or a class directory) cannot be
            walked, e.g. because it does not exist.
    """
    datapaths, labels = [], []
    classes = sorted(next(os.walk(dataset_path))[1])
    for label, class_name in enumerate(classes):
        class_dir = os.path.join(dataset_path, class_name)
        # os.walk lists files in arbitrary filesystem order. Sorting makes the
        # returned arrays deterministic, so a later seeded shuffle/split of
        # them is reproducible.
        for sample in sorted(next(os.walk(class_dir))[2]):
            if sample.endswith(endswith):
                datapaths.append(os.path.join(class_dir, sample))
                labels.append(label)
    return np.array(datapaths), np.array(labels), classes

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def convert_to(data_paths, labels, dest_path, class_name):
    """Converts a dataset to tfrecords.

    Each file in ``data_paths`` is loaded as a comma-delimited float32 array,
    scaled column by column with the module-level ``scalers`` list, and written
    as one ``tf.train.Example`` with two features: ``'label'`` (int64) and
    ``'data'`` (the raw bytes of the scaled float32 array).

    Args:
        data_paths: Sequence of CSV file paths to convert.
        labels: Sequence of integer labels, indexed in step with ``data_paths``.
        dest_path: Directory the ``.tfrecords`` file is written to; created if
            it does not exist.
        class_name: Base name of the output file (``<class_name>.tfrecords``).

    Note:
        Relies on the global ``scalers`` (one fitted scaler per data column)
        being defined before this function is called.
    """
    filename = os.path.join(dest_path, class_name + '.tfrecords')
    # exist_ok avoids the check-then-create race and makes re-runs safe.
    os.makedirs(dest_path, exist_ok=True)

    print('Writing', filename)
    # NOTE(review): tf.python_io is the TensorFlow 1.x location of this writer;
    # TF 2.x exposes it as tf.io.TFRecordWriter - confirm the target version.
    with tf.python_io.TFRecordWriter(filename) as writer:
        for index, data_path in enumerate(data_paths):
            # Pass the path (not an open handle) so loadtxt closes the file itself.
            data_raw = np.loadtxt(data_path, delimiter=",").astype(np.float32)
            for i, scaler in enumerate(scalers):
                # Each scaler receives the whole column as a single (1, n_rows)
                # sample; [0] takes that one transformed row back out.
                data_raw[:, i] = scaler.transform(np.expand_dims(data_raw[:, i], axis=0))[0]
            example = tf.train.Example(
              features=tf.train.Features(
                  feature={
                      'label': _int64_feature(int(labels[index])),
                      # tobytes() replaces the deprecated tostring() alias.
                      'data': _bytes_feature(data_raw.tobytes())
                  }))
            writer.write(example.SerializeToString())

            # Progress is printed as total/done.
            sys.stdout.write("\r{}/{}".format(len(data_paths), index+1))
            sys.stdout.flush()
    print("\n")
            
def read_array(data_path):
    """Load a comma-delimited numeric text file into a NumPy array.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The file contents as returned by ``np.loadtxt`` (float64 by default).
    """
    # Handing loadtxt the path lets it open AND close the file; the previous
    # open(...) handle was never closed.
    return np.loadtxt(data_path, delimiter=",")

def scale_data(data_path):
    """Scale one CSV sample column by column and save it under ``dest_path``.

    The file is loaded as a comma-delimited array, column ``i`` is transformed
    with ``scalers[i]``, and the result is written as float32 CSV to
    ``<dest_path>/<class_name>/<file name>``, where ``class_name`` is the name
    of the directory that contains ``data_path``.

    Args:
        data_path: Path of the CSV file to scale.

    Note:
        Relies on the module-level globals ``scalers`` (one fitted scaler per
        data column) and ``dest_path`` (output root directory).
    """
    # Pass the path (not an open handle) so loadtxt closes the file itself.
    array = np.loadtxt(data_path, delimiter=",")

    for i, scaler in enumerate(scalers):
        # Each scaler receives the whole column as a single (1, n_rows) sample;
        # [0] takes that one transformed row back out.
        array[:, i] = scaler.transform(np.expand_dims(array[:, i], axis=0))[0]

    class_dir, file_name = os.path.split(data_path)
    class_name = os.path.basename(class_dir)

    out_dir = os.path.join(dest_path, class_name)
    # Safe when several workers create the same class directory concurrently.
    os.makedirs(out_dir, exist_ok=True)
    np.savetxt(os.path.join(out_dir, file_name), array.astype(np.float32), delimiter=",")

In [29]:
# Dataset root: read_samples treats every immediate sub-directory as one class.
src_path = "/home/kalvik/Downloads" #"/home/kalvik/shared/CSI_DATA/preprocessed_level2/"
# Output directory for the generated .tfrecords files.
# NOTE(review): this directory sits inside src_path, so once it exists
# read_samples would list it as a class directory on a re-run - confirm.
dest_path = "/home/kalvik/Downloads/tfrecords"#"/home/kalvik/shared/CSI_DATA/tfrecords/"

# X: sample file paths, y: integer labels, classes: sorted class names.
X, y, classes = read_samples(src_path)
# Hold out 15% of the samples for testing; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
num_classes = len(classes)

In [4]:
print("Calculating scalers for training data")

jobs = 16
# One scaler per data column; 540 columns are assumed throughout this notebook.
scalers = [StandardScaler() for _ in range(540)]

# Fit incrementally, one batch of `jobs` files at a time, so the whole training
# set never has to be held in memory at once.
for i in range(0, len(X_train), jobs):
    # Slice ends are exclusive, so i:i+jobs is a full batch. The previous
    # i:i+jobs-1 dropped the last file of every batch from the fit.
    arrays = Parallel(n_jobs=jobs, verbose=0)(delayed(read_array)(addr) for addr in X_train[i:i+jobs])
    # Stacks the batch into (files, rows, columns); this requires every file
    # in the batch to have the same shape.
    arrays = np.array(arrays)
    for j, scaler in enumerate(scalers):
        scaler.partial_fit(arrays[:, :, j])

    # Progress is printed as total/done.
    sys.stdout.write("\r{}/{}".format(len(X_train), i+arrays.shape[0]))
    sys.stdout.flush()

Calculating scalers for training data
5/5

In [30]:
train_dir = os.path.join(dest_path, "train")
test_dir = os.path.join(dest_path, "test")
# Create the output directories once. Without exist_ok, calling makedirs inside
# the loop raised FileExistsError on the second class and on every re-run.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Write one .tfrecords file per class for each split.
# NOTE(review): the range starts at 1, so samples labelled 0 (classes[0]) are
# never written - confirm this is intentional.
for i in range(1, num_classes):
    indices = np.where( y_train == i )
    convert_to(X_train[indices], y_train[indices], train_dir, classes[i])

    indices = np.where( y_test == i )
    convert_to(X_test[indices], y_test[indices], test_dir, classes[i])

In [None]:
jobs = 16

# (label used in the log line, output root, sample paths) for each split.
splits = (
    ("training", "/home/kalvik/shared/CSI_DATA/preprocessed_final/train", X_train),
    ("testing", "/home/kalvik/shared/CSI_DATA/preprocessed_final/test", X_test),
)

for split_label, split_dest, split_paths in splits:
    print("\n\nScaling {} data".format(split_label))

    # scale_data reads the module-level dest_path, so it is rebound per split.
    dest_path = split_dest
    # Reuse the existing `classes` list instead of rebinding it to an unsorted
    # directory listing, which would break the label -> class-name mapping.
    for class_name in classes:
        os.makedirs(os.path.join(dest_path, class_name), exist_ok=True)

    for i in range(0, len(split_paths), jobs):
        # Slice ends are exclusive, so i:i+jobs is a full batch. The previous
        # i:i+jobs-1 silently skipped the last file of every batch.
        Parallel(n_jobs=jobs, verbose=0)(delayed(scale_data)(addr) for addr in split_paths[i:i+jobs])

        # Progress is printed as total/done; clamp so the final partial batch
        # does not report more files than exist.
        sys.stdout.write("\r{}/{}".format(len(split_paths), min(i+jobs, len(split_paths))))
        sys.stdout.flush()