The datatset is obtained from https://mb.uni-paderborn.de/en/kat/main-research/datacenter/bearing-datacenter/data-sets-and-download

In [2]:
from scipy import io
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

#### Data Loader functions 

- 0 for force 16005
- 1 for phase_current_1 256070
- 2 for phase_current_2 256070
- 3 for speed 16005
- 4 for temp_2_bearing_module 5
- 5 for torque 16005
- 6 for vibration signal 256070

In [2]:
def matfile_to_array(folder_path, bearing_list, label, damage_type, damage_severity):
    for bearing in bearing_list:
        files_path = Path(folder_path + '/' + bearing)
        operating_condition = ['N09_M07_F10', 'N15_M01_F10', 'N15_M07_F04', 'N15_M07_F10']
        filesname_list = ['N09_M07_F10', 'N15_M01_F10', 'N15_M07_F04', 'N15_M07_F10']
        for f in filesname_list:
            f = f + '_'+bearing
            filename = str(f).split('\\')[-1][:16]
            for oc in operating_condition:
                if oc == filename[:11]:
                    data_readings = []
                    for reading in range(1, 21):
                        filename_reading = "".join([filename, '_', str(reading)])
                        if filename_reading != "N15_M01_F10_KA08_2":
                            print("".join([folder_path, bearing, '/', filename_reading, '.mat']))
                            file = io.loadmat("".join([folder_path, bearing, '/', filename_reading, '.mat']))#'Data/PDUData/Healthy/K001/N09_M07_F10_K001_1.mat')
                            data = file[filename_reading]['Y'][0][0][0][6][2][0]
                            data_readings.append(data)
                            #all_data = np.hstack(data_readings)
                np.save("".join(["Data/KAT_preprocessing", '/', bearing, '_', oc, '_', label, 
                                 '_', damage_severity, '_', damage_type, '.npy']),  np.array(data_readings).flatten())#all_data)

In [3]:
def array_to_dic(folder_path):
    '''
    Read all the matlab files of the CWRU Bearing Dataset and return a 
    dictionary. The key of each item is the filename and the value is the data 
    of one matlab file, which also has key value pairs.
    
    Parameter:
        folder_path: 
            Path (Path object) of the folder which contains the matlab files.
    Return:
        output_dic: 
            Dictionary which contains data of all files in the folder_path.
    '''
    output_dic = {}
    for _, filepath in enumerate(folder_path.glob('*.npy')):
        # strip the folder path and get the filename only.
        key_name = str(filepath).split('\\')[-1]
        output_dic[key_name] = np.load(filepath, allow_pickle = True)
    return output_dic

In [4]:
def label(filename):
    '''
    Function to create label for each signal based on the filename. Apply this
    to the "filename" column of the DataFrame.
    Usage:
        df['label'] = df['filename'].apply(label)
    '''
    if 'H_S0' in filename:
        return 0
    elif 'IR_S1' in filename:
        return 1
    elif 'OR_S1' in filename:
        return 2
    elif 'IR_S2' in filename:
        return 3
    elif 'OR_S2' in filename:
        return 4

In [5]:
def divide_signal(df, segment_length):
    '''
    This function divide the signal into segments, each with a specific number 
    of points as defined by segment_length. Each segment will be added as an 
    example (a row) in the returned DataFrame. Thus it increases the number of 
    training examples. The remaining points which are less than segment_length 
    are discarded.
    
    Parameter:
        df: 
            DataFrame returned by matfile_to_df()
        segment_length: 
            Number of points per segment.
    Return:
        DataFrame with segmented signals and their corresponding filename and 
        label
    '''
    dic = {}
    idx = 0
    for i in range(df.shape[0]):
        n_sample_points = len(df.iloc[i,1])
        n_segments = n_sample_points // segment_length
        for segment in range(n_segments):
            dic[idx] = {
                'signal': df.iloc[i,1][segment_length * segment:segment_length * (segment+1)], 
                'label': df.iloc[i,2],
                'filename' : df.iloc[i,0]
            }
            idx += 1
    df_tmp = pd.DataFrame.from_dict(dic,orient='index')
    df_output = pd.concat(
        [df_tmp[['label', 'filename']], 
         pd.DataFrame(np.hstack(df_tmp["signal"].values).T)
        ], 
        axis=1 )
    return df_tmp


#### Get data

In [None]:
folder_path = 'Data/KAT/Healthy/'
bearing_list = ['K001', 'K002', 'K003', 'K004', 'K005', 'K006']
matfile_to_array(folder_path, bearing_list, 'H', 'None', 'S0')

folder_path = 'Data/KAT/Real damage/'
bearing_list = ['KA04', 'KA15', 'KA22', 'KA30']
matfile_to_array(folder_path, bearing_list, 'OR', 'Real', 'S1')


folder_path = 'Data/KAT/Real damage/'
bearing_list = ['KA16']
matfile_to_array(folder_path, bearing_list, 'OR', 'Real', 'S2')


folder_path = 'Data/KAT/Real damage/'
bearing_list = ['KI04', 'KI14', 'KI17', 'KI21']
matfile_to_array(folder_path, bearing_list, 'IR', 'Real', 'S1')


folder_path = 'Data/KAT/Real damage/'
bearing_list = ['KI18']
matfile_to_array(folder_path, bearing_list, 'IR', 'Real', 'S2')


folder_path = 'Data/KAT/Artifical damage/'
bearing_list = ['KA01', 'KA05', 'KA07']#, 'KI01', 'KI03', 'KI05', 'KI07', 'KI08']
matfile_to_array(folder_path, bearing_list, 'OR', 'Artifical', 'S1')

folder_path = 'Data/KAT/Artifical damage/'
bearing_list = ['KA03',  'KA06', 'KA08', 'KA09']#, 'KI01', 'KI03', 'KI05', 'KI07', 'KI08']
matfile_to_array(folder_path, bearing_list, 'OR', 'Artifical', 'S2')

folder_path = 'Data/KAT/Artifical damage/'
bearing_list = ['KI01', 'KI03', 'KI05']
matfile_to_array(folder_path, bearing_list, 'IR', 'Artifical', 'S1')

folder_path = 'Data/KAT/Artifical damage/'
bearing_list = ['KI07', 'KI08']
matfile_to_array(folder_path, bearing_list, 'IR', 'Artifical', 'S2')

In [8]:
data_path = Path('Data/KAT_preprocessing/')
dic = array_to_dic(data_path)
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dic.items()])).T
df = df.reset_index().rename(mapper={'index':'filename'},axis=1)
df['label'] = df['filename'].apply(label)
# df.fillna(0, inplace = True)

In [None]:
appended_data = []
for i in range(20):
    new_df = divide_signal(df[['filename', i, 'label']], 1200)
    print(i, new_df.shape)
    appended_data.append(new_df)
df_all = pd.concat(appended_data, ignore_index = True)

In [22]:
signal_data = np.array(df_all[['signal']])
out = np.vstack(signal_data[:,0])
## Split the data into train and validation set
X_train, X_test, y_train, y_test = train_test_split(out, 
                                                      df_all['label'],
                                                      test_size=0.30, random_state=42, shuffle=True)

In [23]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(335033, 1200) (143586, 1200) (335033,) (143586,)


In [24]:
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [25]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(335033, 1200, 1) (143586, 1200, 1) (335033,) (143586,)


In [26]:
np.save('Data/KAT_processed/train_test/X_train.npy', X_train)
np.save('Data/KAT_processed/train_test/X_test.npy', X_test)
np.save('Data/KAT_processed/train_test/y_train.npy', y_train)
np.save('Data/KAT_processed/train_test/y_test.npy', y_test)

In [None]:
np.unique(y_test, return_counts=True)

In [18]:
df_all.to_csv('Data/KAT_processed/df_all_signals.csv')

#### Fraction of samples for data efficient experiments

In [27]:
X_train = np.load('Data/KAT_processed/train_test/X_train.npy')
y_train = np.load('Data/KAT_processed/train_test/y_train.npy')

In [30]:
def select_by_number_sample(X_train, y_train):
    number_samples = [5, 10, 20, 50, 100]
    for samples in number_samples:
        indexes = []
        for label in range(len(np.unique(y_train))):
            ind = np.where(y_train == label)[0]
            selected_ind = np.random.choice(ind, samples)
            indexes.append(selected_ind)
            print(samples,label,  X_train[selected_ind].shape)
        s_indexes = np.hstack(indexes)
        features = X_train[s_indexes]
        print(features.shape)
        labels = y_train[s_indexes] #to_categorical()
        np.save('Data/KAT_processed/number_samples/'+str(samples)+'_X_train.npy', features)
        np.save('Data/KAT_processed/number_samples/'+str(samples)+'_y_train.npy', labels)

In [31]:
select_by_number_sample(X_train, y_train)

5 0 (5, 1200, 1)
5 1 (5, 1200, 1)
5 2 (5, 1200, 1)
5 3 (5, 1200, 1)
5 4 (5, 1200, 1)
(25, 1200, 1)
10 0 (10, 1200, 1)
10 1 (10, 1200, 1)
10 2 (10, 1200, 1)
10 3 (10, 1200, 1)
10 4 (10, 1200, 1)
(50, 1200, 1)
20 0 (20, 1200, 1)
20 1 (20, 1200, 1)
20 2 (20, 1200, 1)
20 3 (20, 1200, 1)
20 4 (20, 1200, 1)
(100, 1200, 1)
50 0 (50, 1200, 1)
50 1 (50, 1200, 1)
50 2 (50, 1200, 1)
50 3 (50, 1200, 1)
50 4 (50, 1200, 1)
(250, 1200, 1)
100 0 (100, 1200, 1)
100 1 (100, 1200, 1)
100 2 (100, 1200, 1)
100 3 (100, 1200, 1)
100 4 (100, 1200, 1)
(500, 1200, 1)


#### Invariance to noval fault types and severity (clustering)
    - Train on all dataset except for fault type OR [2,4] with all damages severity 
    - Train on all dataset except for fault type IR [1,3] with all damages severity 

In [42]:
def remove_fault_types(x_train, y_train, fault_remove_list, fault_category):
    indexes = []
    for fault in fault_remove_list:
        indexes.append(np.where(y_train==fault)[0])
    s_indexes = np.hstack(indexes)
    print(s_indexes.shape)
    X_train_sample = np.delete(x_train, s_indexes, axis=0)
    y_train_sample = np.delete(y_train, s_indexes, axis=0)
    print(X_train_sample.shape, y_train_sample.shape)
    np.save('Data/KAT_processed/fault_types/'+fault_category+'/X_train.npy', X_train_sample)
    np.save('Data/KAT_processed/fault_types/'+fault_category+'/y_train.npy', y_train_sample)

In [41]:
x_train = np.load('Data/KAT_processed/train_test/X_train.npy')
y_train = np.load('Data/KAT_processed/train_test/y_train.npy')

In [43]:
remove_fault_types(x_train, y_train, [1,3], 'IR')

(119734,)
(215299, 1200, 1) (215299,)


In [44]:
remove_fault_types(x_train, y_train, [2,4], 'OR')

(143639,)
(191394, 1200, 1) (191394,)
