In [1]:
import os
import sys
import shutil
import numpy as np
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
def read_samples(dataset_path, endswith=".csv"):
    datapaths, labels = list(), list()

    label = 0

    classes = sorted(os.walk(dataset_path).__next__()[1])
    
    # List each sub-directory (the classes)
    for c in classes:
        c_dir = os.path.join(dataset_path, c)
        walk = os.walk(c_dir).__next__()
        # Add each image to the training set
        for sample in walk[2]:
            # Only keeps csv samples
            if sample.endswith(endswith):
                datapaths.append(os.path.join(c_dir, sample))
                labels.append(label)
        label += 1

    return datapaths, labels

In [3]:
src_path = "/home/kalvik/shared/CSI_DATA/preprocessed_level2/"

X, y = read_samples(src_path)
X_train, X_test, _, _ = train_test_split(X, y, test_size=0.15, random_state=42)

In [4]:
def read_array(data_path):
    return np.loadtxt(open(data_path, "rb"), delimiter=",")

In [9]:
print("Calculating scalers for training data")

jobs = 16
scalers = []
for i in range(540):
    scalers.append(StandardScaler())

for i in range(0, len(X_train), jobs):
    arrays = Parallel(n_jobs=jobs, verbose=0)(delayed(read_array)(addr) for addr in X_train[i:i+jobs-1])
    arrays = np.array(arrays)
    for j in range(540):
        scalers[j].partial_fit(arrays[:, :, j])
        
    sys.stdout.write("\r{}/{}".format(len(X_train), i+jobs))
    sys.stdout.flush()

Calculating scalers for training data
1096/1104

In [18]:
def scale_data(data_path):
    array = np.loadtxt(open(data_path, "rb"), delimiter=",")
    
    for i in range(540):
        array[:, i] = scalers[i].transform(np.expand_dims(array[:, i], axis=0))
    
    path, file = os.path.split(data_path)
    _, class_name = os.path.split(path)
    
    if not os.path.exists(os.path.join(dest_path, class_name)):
        os.makedirs(os.path.join(dest_path, class_name)) 
        
    np.savetxt((os.path.join(os.path.join(dest_path, class_name), file)), array.astype(np.float32), delimiter=",")

In [19]:
print("\n\nScaling training data")

dest_path = dest_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/train"
jobs = 16
for i in range(0, len(X_train), jobs):
    Parallel(n_jobs=jobs, verbose=0)(delayed(scale_data)(addr) for addr in X_train[i:i+jobs-1])
    
    sys.stdout.write("\r{}/{}".format(len(X_train), i+jobs))
    sys.stdout.flush()
    
print("\n\nScaling testing data")

dest_path = dest_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/test"
jobs = 16
for i in range(0, len(X_test), jobs):
    Parallel(n_jobs=jobs, verbose=0)(delayed(scale_data)(addr) for addr in X_test[i:i+jobs-1])
    
    sys.stdout.write("\r{}/{}".format(len(X_test), i+jobs))
    sys.stdout.flush()



Scaling training data
1096/1104

Scaling testing data
194/208