In [None]:
# built-in
import os
from time import gmtime, strftime

# libraries
import h5py
import numpy as np
import pandas as pd


In [None]:
# Initializations
MACHINE = 'RoboticArm'
INPUT_FOLDER = f'data/{MACHINE}'
OUTPUT_FOLDER = f'data/{MACHINE}/windowed'
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# constants
# Duration of initial data time affected by the gyroscope warm-up period
GYROSCOPE_WARM_UP_TIME = pd.to_timedelta('35ms')
# Duration of each window the recordings are split into
WINDOW_SIZE_TS = pd.to_timedelta('100ms')

# Utility dictionary for preprocessing operations, keyed by sensor name.
# Each entry holds the sampling rate in Hz ('fs') and the number of channels;
# 'window_length' (samples per window) is derived below.
sensor_dict = {
    'imp23absu_mic': {
        'fs': 16000,
        'number_of_channel': 1
    },
    'ism330dhcx_acc': {
        'fs': 7063,  # Estimated sampling rate calculated by averaging time deltas across all files
        'number_of_channel': 3
    },
    'ism330dhcx_gyro': {
        'fs': 7063,  # Estimated sampling rate calculated by averaging time deltas across all files
        'number_of_channel': 3
    }
}

# Number of whole samples that fit in one window: floor(fs * window duration).
# The product is computed with exact Timedelta arithmetic instead of
# int(fs * seconds), because the float product can land just below an integer
# (e.g. 100 * 0.29 == 28.999999999999996) and truncate to one sample too few.
ONE_SECOND = pd.Timedelta(seconds=1)
for sensor_config in sensor_dict.values():
    sensor_config['window_length'] = int(
        (WINDOW_SIZE_TS * sensor_config['fs']) // ONE_SECOND)
sensor_dict


In [None]:
def _read_attributes(split, condition):
    """Load `attributes_<condition>_<split>.csv` from the given split folder.

    The first CSV column is used as the index of the returned DataFrame.
    """
    csv_path = f'{INPUT_FOLDER}/{split}/attributes_{condition}_{split}.csv'
    return pd.read_csv(csv_path, index_col=0)


# One attributes table per (condition, split) combination
normal_source_train = _read_attributes('train', 'normal_source')
normal_target_train = _read_attributes('train', 'normal_target')
normal_source_test = _read_attributes('test', 'normal_source')
anomaly_source_test = _read_attributes('test', 'anomaly_source')
normal_target_test = _read_attributes('test', 'normal_target')
anomaly_target_test = _read_attributes('test', 'anomaly_target')

# Stack the tables of each split and renumber the rows from 0
Train_Metadata = pd.concat(
    [normal_source_train, normal_target_train],
    axis=0,
    ignore_index=True)
Test_Metadata = pd.concat(
    [normal_source_test,
     anomaly_source_test,
     normal_target_test,
     anomaly_target_test],
    axis=0,
    ignore_index=True)

for split_folder, split_metadata in (('train', Train_Metadata),
                                     ('test', Test_Metadata)):
    # Segment id: the microphone file name without its sensor prefix
    split_metadata['segment_id'] = split_metadata['imp23absu_mic'].map(
        lambda file_name: file_name.replace('imp23absu_mic_', ''))

    # Turn every sensor file name into a path inside its split folder
    path_prefix = INPUT_FOLDER + '/' + split_folder + '/'
    for sensor in sensor_dict:
        split_metadata[sensor] = path_prefix + split_metadata[sensor]


In [None]:
# Metadata columns that are copied, one value per window, into string datasets
LABEL_COLUMNS = (
    'segment_id',        # index of the corresponding segment
    'split_label',       # split labels
    'anomaly_label',     # anomaly labels
    'domain_shift_op',   # operational domain shift labels
    'domain_shift_env',  # environmental domain shift labels
)


def _load_sensor_frame(path):
    """Read one sensor parquet file and index it by ascending timestamp.

    The 'Time' column is interpreted as seconds, converted to datetimes and
    used as the sorted index of the returned DataFrame.
    """
    sensor_df = pd.read_parquet(path)
    sensor_df['Time'] = pd.to_datetime(sensor_df['Time'], unit='s')
    return sensor_df.set_index('Time').sort_index()


def _cut_windows(sensor_df, start_timestamp, number_of_window,
                 window_length, number_of_channel):
    """Cut a recording into consecutive windows of WINDOW_SIZE_TS.

    Returns an array of shape
    (number_of_window, number_of_channel, window_length). Windows holding
    fewer than `window_length` samples are zero-padded at the end; longer
    ones are truncated.

    Raises ValueError if the recording does not have `number_of_channel`
    columns.
    """
    if sensor_df.shape[1] != number_of_channel:
        raise ValueError(
            'expected {} channel(s), found {}'.format(
                number_of_channel, sensor_df.shape[1]))

    # Pre-filled with zeros, so short windows are padded implicitly
    windows = np.zeros((number_of_window, number_of_channel, window_length))
    for window_idx in range(number_of_window):
        start = start_timestamp + window_idx * WINDOW_SIZE_TS
        end = start + WINDOW_SIZE_TS
        # NOTE: label slicing on the time index includes both endpoints;
        # samples beyond window_length are dropped by the truncation.
        samples = sensor_df.loc[start:end].values[:window_length]
        windows[window_idx, :, :len(samples)] = samples.T
    return windows


def _append_rows(dataset, rows):
    """Grow `dataset` along axis 0 once and write `rows` at its end."""
    first_new_row = dataset.shape[0]
    dataset.resize(first_new_row + len(rows), axis=0)
    dataset[first_new_row:] = rows


# Loop through each dataset split type ('train' and 'test') with
# corresponding metadata
for split_type, metadata in zip(
        ['train', 'test'], [Train_Metadata, Test_Metadata]):
    # Define the save path for the HDF5 file
    save_path = '{}/{}_dataset_window_{:.3f}s.h5'.format(
        OUTPUT_FOLDER,
        split_type,
        WINDOW_SIZE_TS.total_seconds()
    )

    print(save_path)

    # Open the HDF5 file in write mode
    with h5py.File(save_path, 'w') as h5file:
        # ================================================================ INIT
        # Resizable HDF5 datasets, keyed by sensor name or label column
        datasets = {}

        # One dataset per sensor: rows are windows of shape
        # (number_of_channel, window_length)
        for sensor in sensor_dict:
            window_length = sensor_dict[sensor]['window_length']
            number_of_channel = sensor_dict[sensor]['number_of_channel']
            datasets[sensor] = h5file.create_dataset(
                sensor,
                shape=(0, number_of_channel, window_length),
                maxshape=(None, number_of_channel, window_length),
                chunks=True
            )

        # One UTF-8 string dataset per label column, one row per window
        for label in LABEL_COLUMNS:
            datasets[label] = h5file.create_dataset(
                label,
                shape=(0, 1),
                maxshape=(None, 1),
                chunks=True,
                dtype=h5py.string_dtype(encoding='utf-8')
            )

        # ================================================== DATA SEGMENTATION
        # Every row of the metadata represents the i-th segment of one
        # specific recording: the same segment is recorded for all sensors,
        # named in the same way, and its path is linked in the appropriate
        # column of the dataframe.
        number_of_segment = len(metadata)

        # Iterate over all segments in the metadata
        for file_index in range(number_of_segment):
            try:
                # Load the recording of every sensor for this segment. The
                # frames and their time bounds are kept on sensor_dict, as
                # before, in case other cells read them.
                for sensor in sensor_dict:
                    sensor_df = _load_sensor_frame(
                        metadata.at[file_index, sensor])
                    sensor_dict[sensor]['data_raw'] = sensor_df
                    sensor_dict[sensor]['max_ts'] = sensor_df.index[-1]
                    sensor_dict[sensor]['min_ts'] = sensor_df.index[0]

                # Time range shared by all sensors, starting no earlier than
                # the end of the gyroscope warm-up period
                start_timestamp = max(
                    sensor_dict['ism330dhcx_gyro']['min_ts'] +
                    GYROSCOPE_WARM_UP_TIME,
                    max(sensor_dict[sensor]['min_ts']
                        for sensor in sensor_dict))
                end_timestamp = min(sensor_dict[sensor]['max_ts']
                                    for sensor in sensor_dict)

                # Labels of the segment, repeated for each of its windows
                labels = {label: metadata.at[file_index, label]
                          for label in LABEL_COLUMNS}

                # Number of whole windows that fit in the shared time range
                number_of_window = (
                    end_timestamp - start_timestamp) // WINDOW_SIZE_TS

                if number_of_window > 0:
                    # Build every window of every sensor in memory first, so
                    # that a failure on any sensor leaves the file untouched
                    # and all datasets keep the same number of rows.
                    windowed = {
                        sensor: _cut_windows(
                            sensor_dict[sensor]['data_raw'],
                            start_timestamp,
                            number_of_window,
                            sensor_dict[sensor]['window_length'],
                            sensor_dict[sensor]['number_of_channel'])
                        for sensor in sensor_dict
                    }

                    # Append the whole segment with one resize per dataset
                    for sensor, windows in windowed.items():
                        _append_rows(datasets[sensor], windows)
                    for label, value in labels.items():
                        _append_rows(
                            datasets[label],
                            np.full((number_of_window, 1), value,
                                    dtype=object))
            except Exception as e:
                # Best effort: report the segment and continue with the next
                print('could not read file index {}'.format(file_index), e)

            # Progress is reported outside the try block and divides by the
            # segment count, so a single-row metadata table cannot fail here
            print(
                f'Completed: {(file_index + 1) / number_of_segment * 100:.2f}%',
                end='\r')
