In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pickle
import gzip

# Load files

In [4]:
# Directory holding one .npy array per recording; the class is encoded in the file name.
data_dir = '/mnt/disk1/yunseob/Pytorch/SSM/1_Semblex/05_no_overlap/SemblexData/wt_4040'

# data_config maps a class label ('GOOD' / 'BAD') to a list of per-file records:
#   '0_index': 1-based position of the file within its class
#   '1_file' : file name
#   '2_data' : array loaded from that file
data_config = {'GOOD': [], 'BAD': []}

# os.listdir() returns entries in arbitrary order, so sort the listing to keep
# the per-class order (and therefore '0_index') identical on every run/machine.
for file in sorted(os.listdir(data_dir)):
    # Skip anything that is not an array file (e.g. a previously saved pickle).
    if not file.endswith('.npy'):
        continue
    if 'GOOD' in file:
        cls = 'GOOD'
    elif 'BAD' in file:
        cls = 'BAD'
    else:
        continue
    data = np.load(os.path.join(data_dir, file))
    # Build each record in a single pass instead of overwriting list items
    # (file-name strings -> dicts) while iterating over the same list.
    data_config[cls].append({'0_index': len(data_config[cls]) + 1,
                             '1_file': file,
                             '2_data': data})

# Train / Test (Random split)

- During preprocessing, the clipped signal was segmented with a window size of 1650 (1 s), and consecutive segments overlap by 10 %.
- Randomly splitting the data into training and testing sets should be VALID because each segment overlaps the next one by only 10 %.

In [5]:
# Random 80 / 20 split of every file's samples into training and testing sets.
# A fixed seed makes the split (and the pickle saved from it) reproducible.
SPLIT_SEED = 42
TRAIN_RATIO = 0.8
rng = np.random.RandomState(SPLIT_SEED)

# Iterate the classes in sorted order so the random stream is consumed in the
# same order on every run, even on interpreters whose dict order is not stable.
for cls in sorted(data_config):
    for entry in data_config[cls]:
        data = entry['2_data']
        data_num = len(data)
        # Draw the training indices without replacement; the rest is the test set.
        train_idx = rng.choice(data_num, int(TRAIN_RATIO * data_num), replace=False)
        test_idx = np.setdiff1d(np.arange(data_num), train_idx)
        train_data, test_data = data[train_idx], data[test_idx]
        entry['3_train_data'], entry['4_test_data'] = train_data, test_data

In [6]:
# Echo the whole configuration dict; the repr includes every stored array, so the output is very long.
data_config

{'BAD': [{'0_index': 1,
   '1_file': 'SEMBLEX_200211__BAD_CONDITIONS__HEAVY_OIL__WT_1s_1650hz__232_40_40.npy',
   '2_data': array([[[[2.73868076e-01, 5.60269834e-02],
            [2.82058503e-01, 5.79557324e-02],
            [2.90057805e-01, 5.98646863e-02],
            ...,
            [2.98858206e-01, 7.51236113e-02],
            [2.91117624e-01, 7.36789371e-02],
            [2.83152587e-01, 7.21583197e-02]],
   
           [[3.86514126e-01, 5.03509060e-02],
            [4.02268927e-01, 5.25377081e-02],
            [4.17808853e-01, 5.47153917e-02],
            ...,
            [4.31086111e-01, 6.96776438e-02],
            [4.15806173e-01, 6.78560402e-02],
            [4.00248616e-01, 6.59597160e-02]],
   
           [[4.58541073e-01, 3.39911496e-02],
            [4.83559332e-01, 3.55681628e-02],
            [5.08542239e-01, 3.71247331e-02],
            ...,
            [5.20195255e-01, 4.28278254e-02],
            [4.95270814e-01, 4.15303117e-02],
            [4.70244738e-01, 4.01914

In [7]:
# Persist the full configuration (raw arrays plus the train/test split)
# as a gzip-compressed pickle next to the source data.
config_path = data_dir + '/SemblexData_config.pickle'
with gzip.open(config_path, 'wb') as out_file:
    pickle.dump(data_config, out_file)

# Load data

In [8]:
data_dir = '/mnt/disk1/yunseob/Pytorch/SSM/1_Semblex/05_no_overlap/SemblexData/wt_4040'

# Read the gzip-compressed pickle back into `data`.
config_path = data_dir + '/SemblexData_config.pickle'
with gzip.open(config_path, 'rb') as in_file:
    data = pickle.load(in_file)

In [9]:
# Show which fields are stored for one record (the first 'BAD' entry).
data['BAD'][0].keys()

dict_keys(['3_train_data', '4_test_data', '2_data', '1_file', '0_index'])

In [10]:
# Print, for every file of every class, its index and name followed by the
# shapes of the full, training and testing arrays.
header_fmt = "{} - {:02d} - {}"
shape_fmt = "Shape of data = total: {}, training: {}, testing: {}"
shape_keys = ('2_data', '3_train_data', '4_test_data')

for label, entries in data.items():
    for entry in entries:
        shapes = [entry[key].shape for key in shape_keys]
        print(header_fmt.format(label, entry['0_index'], entry['1_file']))
        print(shape_fmt.format(*shapes))
        print('')

GOOD - 01 - SEMBLEX_200211__GOOD_DATA__REGULAR__WT_1s_1650hz__394_40_40.npy
Shape of data = total: (394, 40, 40, 2), training: (315, 40, 40, 2), testing: (79, 40, 40, 2)

GOOD - 02 - SEMBLEX_200211__GOOD_DATA__REGULAR__WT_1s_1650hz__597_40_40.npy
Shape of data = total: (597, 40, 40, 2), training: (477, 40, 40, 2), testing: (120, 40, 40, 2)

GOOD - 03 - SEMBLEX_200211__GOOD_DATA__REGULAR__WT_1s_1650hz__674_40_40.npy
Shape of data = total: (674, 40, 40, 2), training: (539, 40, 40, 2), testing: (135, 40, 40, 2)

GOOD - 04 - SEMBLEX_200211__GOOD_DATA__REGULAR__WT_1s_1650hz__777_40_40.npy
Shape of data = total: (777, 40, 40, 2), training: (621, 40, 40, 2), testing: (156, 40, 40, 2)

BAD - 01 - SEMBLEX_200211__BAD_CONDITIONS__HEAVY_OIL__WT_1s_1650hz__232_40_40.npy
Shape of data = total: (232, 40, 40, 2), training: (185, 40, 40, 2), testing: (47, 40, 40, 2)

BAD - 02 - SEMBLEX_200211__BAD_TOOLS__CHIPPED_PUNCH__WT_1s_1650hz__124_40_40.npy
Shape of data = total: (124, 40, 40, 2), training: (99,