The dataset is obtained from https://engineering.case.edu/bearingdatacenter/download-data-file

In [None]:
from scipy import io
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#### Data loader functions

In [None]:
'''Preprocessing snippet from https://github.com/XiongMeijing/CWRU-1/blob/master/helper.py'''

def matfile_to_dic(folder_path):
    '''
    Read all the matlab files of the CWRU Bearing Dataset and return a 
    dictionary. The key of each item is the filename and the value is the data 
    of one matlab file, which also has key value pairs.
    
    Parameter:
        folder_path: 
            Path (Path object or string) of the folder which contains the 
            matlab files. Only files matching '*.mat' directly inside the 
            folder are read.
    Return:
        output_dic: 
            Dictionary which contains data of all files in the folder_path, 
            keyed by the bare filename (e.g. 'B007_0.mat') and ordered by 
            filename.
    '''
    output_dic = {}
    # glob() yields files in an OS-dependent order; sorting keeps the row order
    # of everything built from this dictionary stable between runs/machines.
    for filepath in sorted(Path(folder_path).glob('*.mat')):
        # Path.name strips the folder on every platform; splitting the string
        # on '\\' only worked on Windows and left the full path as the key
        # elsewhere.
        output_dic[filepath.name] = io.loadmat(filepath)
    return output_dic

def remove_dic_items(dic):
    '''
    Strip the MATLAB bookkeeping entries from every file record, in place.

    Parameter:
        dic:
            Dictionary of per-file dictionaries, as produced by
            matfile_to_dic. Each inner dictionary must contain the three
            metadata keys; a missing one raises KeyError.
    Return:
        None. The inner dictionaries are modified directly.
    '''
    metadata_keys = ('__header__', '__version__', '__globals__')
    for record in dic.values():
        for meta_key in metadata_keys:
            del record[meta_key]

def rename_keys(dic):
    '''
    Give the signal/RPM entries of every file record the same key names, in
    place, so the records line up as DataFrame columns.

    Any key containing 'DE_time', 'BA_time', 'FE_time' or 'RPM' (checked in
    that order, first match wins) is replaced by exactly that name. Keys that
    match none of them are left untouched. If several keys of one record map
    to the same name, the one visited last is kept.
    '''
    canonical_names = ('DE_time', 'BA_time', 'FE_time', 'RPM')
    for record in dic.values():
        # Iterate over a snapshot of the keys because the record is modified
        # while we walk through it.
        for old_key in tuple(record):
            for canonical in canonical_names:
                if canonical in old_key:
                    record[canonical] = record.pop(old_key)
                    break
                
def label(filename):
    '''
    Map a filename to its integer class id by looking for the fault code it
    contains. Apply this to the "filename" column of the DataFrame.

    Codes are tested in the order listed below and the first one found in the
    filename decides the class. A filename containing none of them yields
    None.
    Usage:
        df['label'] = df['filename'].apply(label)
    '''
    class_ids = (
        ('B007', 1),
        ('IR007', 2),
        ('OR007', 3),
        ('B014', 4),
        ('IR014', 5),
        ('OR014', 6),
        ('B021', 7),
        ('IR021', 8),
        ('OR021', 9),
        ('normal', 0),
    )
    for marker, class_id in class_ids:
        if marker in filename:
            return class_id
    return None


def fault_severity(filename):
    '''
    Derive the fault-severity value of a signal from its filename. Apply this
    to the "filename" column of the DataFrame.

    Returns 7, 14 or 21 when the filename contains '007', '014' or '021'
    (tested in that order), 0 when it contains 'normal', and None otherwise.
    Usage:
        df['fault_severity'] = df['filename'].apply(fault_severity)
    '''
    severities = (('007', 7), ('014', 14), ('021', 21), ('normal', 0))
    return next(
        (value for marker, value in severities if marker in filename),
        None,
    )
    
def fault_type(filename):
    '''
    Derive the fault-type code of a signal from its filename. Apply this to
    the "filename" column of the DataFrame.

    Returns 'B', 'IR' or 'OR' for the first of those substrings (tested in
    that order, case-sensitive) found in the filename. Returns the integer 0
    when none is found but the filename contains 'normal', and None otherwise.
    Usage:
        df['fault_type'] = df['filename'].apply(fault_type)
    '''
    for code in ('B', 'IR', 'OR'):
        if code in filename:
            return code
    if 'normal' in filename:
        return 0
    return None
    
    
def divide_signal(df, segment_length):
    '''
    Cut every signal into consecutive, non-overlapping segments of 
    segment_length points and return one row per segment. This multiplies the 
    number of training examples; trailing points that do not fill a whole 
    segment are dropped.
    
    Parameter:
        df: 
            DataFrame with one signal per row. Columns are read by position: 
            column 0 is the filename, column 1 the signal and column 2 the 
            label.
        segment_length: 
            Number of points per segment.
    Return:
        DataFrame whose first two columns are 'label' and 'filename', followed 
        by one column per sample point of the segment.
    '''
    segments = {}
    for row in range(df.shape[0]):
        signal = df.iloc[row, 1]
        whole_segments = len(signal) // segment_length
        if whole_segments == 0:
            # Signal shorter than one segment: contributes no rows.
            continue
        row_label = df.iloc[row, 2]
        row_filename = df.iloc[row, 0]
        for start in range(0, whole_segments * segment_length, segment_length):
            # Keys are a running counter so they become a 0..n-1 index below.
            segments[len(segments)] = {
                'signal': signal[start:start + segment_length],
                'label': row_label,
                'filename': row_filename,
            }
    df_tmp = pd.DataFrame.from_dict(segments, orient='index')
    # Stack the segments side by side, then transpose so that each segment
    # becomes one row of sample points.
    points = pd.DataFrame(np.hstack(df_tmp["signal"].values).T)
    meta = df_tmp[['label', 'filename']]
    return pd.concat([meta, points], axis=1)


def normalize_signal(df):
    '''
    Standardise the 'DE_time' and 'FE_time' signals of the DataFrame in place.

    Every signal (one array per row) is shifted by its own mean and scaled by
    its own standard deviation (np.std, i.e. population std). The columns are
    overwritten; nothing is returned.
    '''
    for column in ('DE_time', 'FE_time'):
        signals = df[column]
        # Per-row statistics: one mean/std value for each individual signal.
        centre = signals.apply(np.mean)
        spread = signals.apply(np.std)
        df[column] = (signals - centre) / spread

#### Get data

Data classes:
    
    - 0: Normal
    - 1: B007
    - 2: IR007
    - 3: OR007
    - 4: B014
    - 5: IR014
    - 6: OR014
    - 7: B021
    - 8: IR021
    - 9: OR021

In [None]:
# Folder with the raw .mat recordings, relative to the working directory.
data_path = Path('Data/48kDE_CWRU/')
# Load every .mat file in that folder into one dictionary, one entry per file.
dic = matfile_to_dic(data_path)
# Drop the '__header__', '__version__' and '__globals__' entries of each file (in place).
remove_dic_items(dic)
# Unify the per-file variable names to 'DE_time', 'BA_time', 'FE_time' and 'RPM' (in place).
rename_keys(dic)

In [None]:
# from_dict puts one file per column; transpose so each file becomes one row
# and each stored variable one column.
df = pd.DataFrame.from_dict(dic).T
# Move the file keys out of the index into a regular 'filename' column.
df = df.reset_index().rename(mapper={'index':'filename'},axis=1)
# Derive the class id and the descriptive columns from the filename.
df['label'] = df['filename'].apply(label)
# NOTE(review): `operating_condition` is not defined in the visible part of this
# notebook; unless it is defined elsewhere, this line raises NameError — TODO confirm.
df['operating_condition'] = df['filename'].apply(operating_condition)
df['fault_severity'] = df['filename'].apply(fault_severity)
df['fault_type'] = df['filename'].apply(fault_type)
# Remove these columns where present; errors='ignore' skips any that are absent.
df.drop(['X217','RPM', 'ans'], axis=1, errors='ignore', inplace = True)

In [None]:
# Standardise the 'DE_time' and 'FE_time' signals of every row; df is modified in place.
normalize_signal(df)

In [None]:
# divide_signal reads columns by position (0: filename, 1: signal, 2: label), so
# each frame keeps only one signal column.
# NOTE(review): this relies on the remaining signal column sitting at position 1
# and 'label' at position 2 after the drop — TODO confirm the column order.
df_de = df.drop(['BA_time','FE_time', 'RPM', 'ans'], axis=1, errors='ignore')
# Cut the DE_time signals into segments of 512 points, one row per segment.
df_de_pr = divide_signal(df_de, 512)
# Recompute the class id of every segment from its filename.
df_de_pr['label'] = df_de_pr['filename'].apply(label)
# Same procedure for the FE_time signals.
df_fe = df.drop(['BA_time','DE_time', 'RPM', 'ans'], axis=1, errors='ignore')
df_fe_pr = divide_signal(df_fe, 512)
df_fe_pr['label'] = df_fe_pr['filename'].apply(label)
# Skip the first two columns ('label', 'filename') and keep the sample points only.
df_de_arr = np.array(df_de_pr[df_de_pr.columns[2:]])
df_fe_arr = np.array(df_fe_pr[df_fe_pr.columns[2:]])
# Stack both arrays along a new last axis: [..., 0] is FE, [..., 1] is DE.
# NOTE(review): requires both arrays to have the same shape, i.e. the DE and FE
# recordings of every file must yield the same number of segments — TODO confirm.
df_both = np.dstack((df_fe_arr, df_de_arr))

In [None]:
## Split the data into train and test sets: 30% of the segments are held out,
## shuffled with a fixed seed (42) so the split is reproducible.
## The labels are taken from the DE segments.
## NOTE(review): assumes the FE and DE segments are in the same row order — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(df_both, 
                                                      df_de_pr['label'].values,
                                                      test_size=0.30, random_state=42, shuffle=True)

In [None]:
# Sanity check: show the shapes of the four arrays produced by the split.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Write the split to disk as .npy files.
# NOTE(review): np.save does not create missing folders; the target directory
# presumably has to exist already — TODO confirm.
np.save('Data/48kDE_CWRU_processed/normalized_train_test/X_train.npy', X_train)
np.save('Data/48kDE_CWRU_processed/normalized_train_test/X_test.npy', X_test)
np.save('Data/48kDE_CWRU_processed/normalized_train_test/y_train.npy', y_train)
np.save('Data/48kDE_CWRU_processed/normalized_train_test/y_test.npy', y_test)

#### Fraction of samples for data-efficient experiments

In [None]:
# X_train = np.load('Data/48kDE_CWRU_processed/train_test/X_train.npy')
# y_train = np.load('Data/48kDE_CWRU_processed/train_test/y_train.npy')

In [None]:
def select_by_number_sample(X_train, y_train,
                            number_samples=(5, 10, 20, 50, 100),
                            output_dir='Data/48kDE_CWRU_processed/normalized_number_samples/',
                            seed=None):
    '''
    Build small, class-balanced training subsets and save them as .npy files.

    For every value n in number_samples, n examples are drawn at random from 
    each class present in y_train. The selected features and labels are 
    written to '<n>_X_train.npy' and '<n>_y_train.npy' inside output_dir.

    Parameter:
        X_train: 
            Array of training examples, indexed along the first axis.
        y_train: 
            1-D array with the class label of every training example.
        number_samples: 
            Iterable with the number of examples to draw per class.
        output_dir: 
            Folder the files are written to. It is created if missing.
        seed: 
            Optional seed for a reproducible selection. With the default 
            (None) the global numpy random state is used.
    Return:
        None. The subsets are only written to disk.
    '''
    # np.random and RandomState expose the same choice() signature.
    rng = np.random if seed is None else np.random.RandomState(seed)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Iterate over the labels that actually occur instead of assuming they
    # are exactly 0..k-1.
    classes = np.unique(y_train)
    for samples in number_samples:
        indexes = []
        for cls in classes:
            ind = np.where(y_train == cls)[0]
            # Draw without replacement so a subset never contains the same
            # example twice; fall back to replacement only when the class has
            # fewer examples than requested, to keep the subset size fixed.
            selected_ind = rng.choice(ind, samples, replace=len(ind) < samples)
            indexes.append(selected_ind)
            print(samples, cls, X_train[selected_ind].shape)
        s_indexes = np.hstack(indexes) if indexes else np.empty(0, dtype=int)
        print(s_indexes, len(s_indexes))
        features = X_train[s_indexes]
        labels = y_train[s_indexes]
        np.save(out_dir / (str(samples) + '_X_train.npy'), features)
        np.save(out_dir / (str(samples) + '_y_train.npy'), labels)

In [None]:
# Create and save the per-class subsets from the training split.
select_by_number_sample(X_train, y_train)

#### Invariance to novel fault types and severity (clustering)
- Train on the whole dataset except for fault type B [1,4,7], with all fault severities.
- Train on the whole dataset except for fault type IR [2,5,8], with all fault severities.
- Train on the whole dataset except for fault type OR [3,6,9], with all fault severities.


In [None]:
# Reload the training split from disk.
# NOTE(review): the split created earlier in this notebook is saved under
# 'normalized_train_test/', while this reads from 'train_test/' — confirm which
# version (normalized or not) is intended here.
x_train = np.load('Data/48kDE_CWRU_processed/train_test/X_train.npy')
y_train = np.load('Data/48kDE_CWRU_processed/train_test/y_train.npy')

In [None]:
def remove_fault_types(x_train, y_train, fault_remove_list, fault_category,
                       output_dir='Data/48kDE_CWRU_processed/fault_types/'):
    '''
    Save a copy of the training set without the examples of the given classes.

    Parameter:
        x_train: 
            Array of training examples, indexed along the first axis.
        y_train: 
            1-D array with the class label of every training example.
        fault_remove_list: 
            Class labels to leave out. An empty list keeps every example.
        fault_category: 
            Name of the sub-folder of output_dir the files are written to.
        output_dir: 
            Parent folder of the per-category sub-folders. Missing folders 
            are created.
    Return:
        None. 'X_train.npy' and 'y_train.npy' are written to 
        output_dir/fault_category.
    '''
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    # A boolean mask also covers an empty fault_remove_list, which the former
    # np.hstack over per-class index arrays could not handle.
    drop_mask = np.isin(y_train, fault_remove_list)
    print((int(np.count_nonzero(drop_mask)),))
    X_train_sample = x_train[~drop_mask]
    y_train_sample = y_train[~drop_mask]
    print(X_train_sample.shape, y_train_sample.shape)
    target_dir = Path(output_dir) / fault_category
    target_dir.mkdir(parents=True, exist_ok=True)
    np.save(target_dir / 'X_train.npy', X_train_sample)
    np.save(target_dir / 'y_train.npy', y_train_sample)

In [None]:
# Leave out classes 1, 4 and 7 (B007, B014, B021) and save the rest under 'B'.
remove_fault_types(x_train, y_train, [1,4,7], 'B')

In [None]:
# Leave out classes 2, 5 and 8 (IR007, IR014, IR021) and save the rest under 'IR'.
remove_fault_types(x_train, y_train, [2,5,8], 'IR')

In [None]:
# Leave out classes 3, 6 and 9 (OR007, OR014, OR021) and save the rest under 'OR'.
remove_fault_types(x_train, y_train, [3,6,9], 'OR')

----------------------