In [1]:
import os
import sys
import h5py
import shutil
import numpy as np
from sklearn.decomposition import PCA

  from ._conv import register_converters as _register_converters


In [2]:
def read_samples(dataset_path, endswith=".csv"):
    """Collect sample file paths and integer class labels from a dataset folder.

    Every immediate sub-directory of ``dataset_path`` is treated as one class.
    Classes are numbered 0, 1, 2, ... in sorted directory-name order, and only
    files located directly inside a class directory are considered (nested
    sub-directories are not descended into).

    Args:
        dataset_path: Directory whose immediate sub-directories are the classes.
        endswith: Filename suffix a file must have to be kept.

    Returns:
        A ``(datapaths, labels)`` tuple of two equal-length lists, where
        ``labels[i]`` is the class index of ``datapaths[i]``. Within a class
        the paths are sorted by file name so the order is reproducible.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``dataset_path`` or
            for a class directory (e.g. the path is not a readable directory).
    """
    datapaths, labels = [], []

    # The first item produced by os.walk describes the top level only:
    # (dirpath, dirnames, filenames).
    _, class_names, _ = next(os.walk(dataset_path))

    # Sorted class names give every class a stable integer label.
    for label, class_name in enumerate(sorted(class_names)):
        class_dir = os.path.join(dataset_path, class_name)
        _, _, file_names = next(os.walk(class_dir))
        # os.walk lists files in filesystem order; sort for a deterministic result.
        for file_name in sorted(file_names):
            # Keep only files carrying the requested suffix.
            if file_name.endswith(endswith):
                datapaths.append(os.path.join(class_dir, file_name))
                labels.append(label)

    return datapaths, labels

In [3]:
# Folder holding the "train" and "test" sample trees, and the HDF5 file that
# the final cell writes the extracted features to.
src_path = "/home/kalvik/shared/CSI_DATA/preprocessed_final/"
dest_path = "/home/kalvik/shared/preprocessed.h5"

# At this stage X_train / X_test are lists of file paths (not loaded data),
# and y_train / y_test are the matching integer class labels.
X_train, y_train = read_samples(os.path.join(src_path, "train"))
X_test, y_test = read_samples(os.path.join(src_path, "test"))

In [4]:
# Load every training CSV, flatten it to a 1-D vector and keep it together
# with its label.
# NOTE(review): the message below mentions scalers, but this cell only loads
# and filters the samples.
print("Calculating scalers for training data")
dataset = []
n_files = len(X_train)
for i, (path, label) in enumerate(zip(X_train, y_train)):
    # The context manager closes each file handle as soon as it is parsed,
    # instead of leaving one open handle per sample to the garbage collector.
    with open(path, "rb") as csv_file:
        data = np.loadtxt(csv_file, delimiter=",").flatten()

    # Only samples with exactly 4,320,000 values are kept; any other length
    # is silently skipped so that all rows share one width.
    if data.shape[0] == 4320000:
        dataset.append([data, label])

    # In-place percentage progress indicator ("\r" rewinds to line start).
    sys.stdout.write("\r%.2f%%" % (((i+1)/n_files)*100))
    sys.stdout.flush()

Calculating scalers for training data
100.00%

In [8]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (here 95%).
pca = PCA(n_components=0.95)

# Split the kept [sample, label] pairs into a tuple of samples and a tuple of
# labels; from here on X_train holds data rather than file paths.
X_train, y_train = zip(*dataset)

# fit() returns the estimator itself, so fitting and projecting the training
# samples can be chained into one expression.
X_train = pca.fit(X_train).transform(X_train)

In [9]:
# Sanity check: number of rows in the projected features must match the labels.
print(np.shape(X_train), np.shape(y_train))

(849, 333) (849,)


In [10]:
# Drop the list of raw flattened samples to release memory; the features and
# labels have already been extracted from it into X_train / y_train.
del dataset

In [13]:
def preprocess_data(file_paths, pca):
    """Load CSV samples one at a time and project each with a fitted transformer.

    Each file is parsed as comma-separated numbers, flattened to 1-D, given a
    leading axis of length 1 (a single-row batch) and passed to
    ``pca.transform``. A percentage progress indicator is written to stdout.

    Args:
        file_paths: Sequence of paths to comma-separated numeric files.
        pca: Object exposing ``transform(rows)``; it receives one
            ``(1, n_values)`` array per file.

    Returns:
        ``np.array`` built from the list of per-file transform results. When
        ``transform`` returns a ``(1, k)`` array for every file, the result
        has shape ``(len(file_paths), 1, k)``; for an empty ``file_paths`` it
        is an empty array of shape ``(0,)``.
    """
    total = len(file_paths)
    projected = []
    for index, path in enumerate(file_paths):
        # Close the handle deterministically; the original left every opened
        # file to be reclaimed by the garbage collector.
        with open(path, "rb") as csv_file:
            sample = np.loadtxt(csv_file, delimiter=",")

        # Flatten, then add a leading axis so the sample is a one-row batch.
        row = np.expand_dims(sample.flatten(), axis=0)
        projected.append(pca.transform(row))

        # In-place percentage progress indicator ("\r" rewinds to line start).
        sys.stdout.write("\r%.2f%%" % (((index+1)/total)*100))
        sys.stdout.flush()
    return np.array(projected)

In [16]:
# Project the test samples with the PCA fitted on the training data.
# X_test is rebound here from a list of file paths to the projected array, so
# re-running this cell requires re-running the cell that builds the path lists.
print("\ntesting data")
X_test = preprocess_data(X_test, pca)


testing data
100.00%

In [17]:
# Collapse the per-sample batch axis added during preprocessing so X_test is a
# 2-D (n_samples, n_features) matrix. An axis-less np.squeeze would also drop
# the sample axis when there is exactly one test sample (or the feature axis
# when PCA keeps a single component); reshaping on the sample count does not,
# and it is a no-op if the array is already 2-D.
X_test = np.asarray(X_test)
X_test = X_test.reshape(X_test.shape[0], -1)

# Shapes of test features, train features, test labels and train labels.
print(np.shape(X_test), np.shape(X_train), np.shape(y_test), np.shape(y_train))

(150, 333) (849, 333) (150,) (849,)


In [18]:
# Write the features and labels to a single HDF5 file. The context manager
# closes the file even if one of the create_dataset calls raises, whereas a
# trailing hf.close() would be skipped and leave the file open.
with h5py.File(dest_path, 'w') as hf:
    hf.create_dataset('X_train', data=np.array(X_train))
    hf.create_dataset('X_test', data=np.array(X_test))
    hf.create_dataset('y_train', data=np.array(y_train))
    hf.create_dataset('y_test', data=np.array(y_test))