## Load and preprocess data
This code reads the data files, performs some transformations to prepare the data and does the train, test and validation split and saves the numpy arrays to use for the construction, training and testing of the models.

In [3]:
# libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import scipy.io as sio
from scipy.fft import fft, fftfreq
from keras.utils import to_categorical
import os

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
# load a single file as a numpy array changing to frequency domain
def load_file(filepath, filename):
	data = sio.loadmat(filepath)
	
	# vibration data
	vib_data = data[filename]['Y'][0][0][0][6][2].transpose()
	vib_data = vib_data.reshape(vib_data.shape[0],)

	# frequency domain and splitting into samples of size 1024 (512 in freq)
	n_loop = round(vib_data.shape[0]/1024)
	vib_freq = np.zeros([n_loop,512])
	for i in range(n_loop):
		vib_data_1024 = vib_data[1024*i:1024*(1+i)]
		vib_freq[i] = np.abs(fft(vib_data_1024))[0:1024//2]

	return vib_freq

In [68]:
# load a dataset group
def load_dataset_group(prefix='',group='', setting=''):
	filepath = prefix + group + '/' + setting

	# load all 20 files as a single array
	# X will have 20*N rows and 512 columns
	# where N is the number of rows of each file when splitting into samples
	X = np.zeros([1,512])

	# Instead of reading all 20 measurements, only 2 to reduce dataset length
	# and 4 for the healthy to have balanced dataset
	if group=='K001' or group=='K002' or group=='K003' or group=='K004' or group=='K005' or group=='K006': # healthy
		measurements = 4
	else:
		measurements = 2
		
	for i in range(1,measurements+1):
		try:
			path = filepath+'_'+group+'_'+str(i)+'.mat'
			filename = setting+'_'+group+'_'+str(i)
			if i == 1:
				X = load_file(path,filename)
			else:
				X = np.concatenate([X,load_file(path,filename)],axis=0)
		except Exception as e: # if the loading the file fails, load another one
			path = filepath+'_'+group+'_'+str(i+10)+'.mat'
			filename = setting+'_'+group+'_'+str(i+10)
			X = np.concatenate([X,load_file(path,filename)],axis=0)
	
	# load class output
	if group=='K001' or group=='K002' or group=='K003' or group=='K004' or group=='K005' or group=='K006': # healthy
		y = np.ones(X.shape[0])
	elif group=='KA04' or group=='KA15' or group=='KA16' or group=='KA22' or group=='KA30':# real OR damage
		y = np.ones(X.shape[0])*2
	elif group=='KI04' or group=='KI14' or group=='KI16' or group=='KI17' or group=='KI18' or group=='KI21':# real IR damage
		y = np.ones(X.shape[0])*3
	elif group=='KA01' or group=='KA03' or group=='KA05' or group=='KA06' or group=='KA07' or group=='KA08' or group=='KA09':# artificial OR damage
		y = np.ones(X.shape[0])*2
	elif group=='KI01' or group=='KI03' or group=='KI05' or group=='KI07' or group=='KI08':# artificial IR damage
		y = np.ones(X.shape[0])*3
	else:
		y = np.zeros(X.shape[0])
	return X, y

In [69]:
def get_directory_names(path):
    # get a list of all items (files and directories) in the folder
    items = os.listdir(path)

    # initialize an empty list for folder names
    folder_names = []

    # loop over each item in the folder
    for item in items:
        # get the full path to the item
        item_path = os.path.join(path, item)
        
        # check if the item is a directory
        if os.path.isdir(item_path):
            # append the folder name to the list of folder names
            folder_names.append(item)

    return folder_names

In [70]:
# load the dataset, returns train and test X and y elements
def load_dataset(prefix=''):
	"""
	Output:
	- trainX, testX, valX: numpy arrays of shape (n_samples, 512, 2)
		512 frequency data points for each sample and the rotational speed for each sample
	- trainy, testy, valy: numpy arrays of shape (n_samples, 3)
		one hot endcoding of y for the 3 classes (healthy, inside ring damage, outside ring damage)
	"""

	# load all data
	folder_names = get_directory_names('data/raw')
	X_list = []
	y_list = []

	settings_speed={'N15_M07_F10':1500,
					'N09_M07_F10':900,
					'N15_M01_F10':1500,
					'N15_M07_F04':1500}
	
	for setting in settings_speed.keys():
		for group in folder_names:
			X_group, y_group = load_dataset_group(prefix+'data/raw/', group, setting)
			X_group = X_group.reshape(X_group.shape[0],X_group.shape[1],1)

			# create an array of shape (n_samples, 512, 1) containing the same speed value
			speed = np.full((X_group.shape[0], X_group.shape[1], 1), settings_speed.get(setting))

			# concatenate X and speed along the last axis
			X_group_speed = np.concatenate([X_group, speed], axis=2)

			X_list.append(X_group_speed)
			y_list.append(y_group)
			

	X = np.concatenate(X_list, axis=0)
	y = np.concatenate(y_list, axis=0)

	# train/test/validation split
	trainX, testValX, trainy, testValy = train_test_split(X, y.ravel(), test_size = 0.3, random_state = 42)
	testX, valX, testy, valy = train_test_split(testValX, testValy, test_size = 0.5, random_state = 42)
	
	# zero-offset class values
	trainy = trainy - 1
	testy = testy - 1
	valy = valy - 1
	# one hot encode y
	trainy = to_categorical(trainy)
	testy = to_categorical(testy)
	valy = to_categorical(valy)
	
	print("train: ",trainX.shape, trainy.shape, "\ntest: ", testX.shape, testy.shape, "\nval: ", valX.shape, valy.shape)
	return trainX, trainy, testX, testy, valX, valy

In [71]:
trainX, trainy, testX, testy, valX, valy = load_dataset()

train:  (49209, 512, 2) (49209, 3) 
test:  (10545, 512, 2) (10545, 3) 
val:  (10545, 512, 2) (10545, 3)


In [72]:
# check that data is balanced
pd.DataFrame(np.argmax(trainy,axis=1)).value_counts()

1    16899
0    16885
2    15425
dtype: int64

In [73]:
# save data
with open('data/processed/trainX.npy', 'wb') as f:
    np.save(f, trainX)
with open('data/processed/trainy.npy', 'wb') as f:
    np.save(f, trainy)
with open('data/processed/testX.npy', 'wb') as f:
    np.save(f, testX)
with open('data/processed/testy.npy', 'wb') as f:
    np.save(f, testy)
with open('data/processed/valX.npy', 'wb') as f:
    np.save(f, valX)
with open('data/processed/valy.npy', 'wb') as f:
    np.save(f, valy)

#### Visualize some data in time and frequency graphs

In [None]:
def load_X_time(path,data,setting):
    X = sio.loadmat(path+data+'/'+setting+'_'+data+'_1')[setting+'_'+data+'_1']['Y'][0][0][0][6][2].transpose()
    X = X.reshape(X.shape[0],)
    return X

X_k001 = load_X_time('data/raw/', 'K001', 'N15_M07_F10') # healthy
X_k002 = load_X_time('data/raw/', 'K002', 'N15_M07_F10') # healthy

X_ka01 = load_X_time('data/raw/', 'KA01', 'N15_M07_F10') # artificial OR damage
X_ka04 = load_X_time('data/raw/', 'KA04', 'N15_M07_F10') # real OR damage

X_ki01 = load_X_time('data/raw/', 'KI01', 'N15_M07_F10') # artificial IR damage
X_ki04 = load_X_time('data/raw/', 'KI04', 'N15_M07_F10') # real IR damage

# Crear subplots:
fig = make_subplots(rows=6, cols=2)

# Tiempo
fig.add_trace(
    go.Scatter(x= np.linspace(0,4,X_k001.shape[0]), y=X_k001, marker_color='green', showlegend=False),
    row=1, col=1
)
fig.update_yaxes(title_text="Healthy", row=1, col=1)

fig.add_trace(
    go.Scatter(x= np.linspace(0,4,X_k002.shape[0]), y=X_k002, marker_color='green', showlegend=False),
    row=2, col=1
)
fig.update_yaxes(title_text="Healthy", row=2, col=1)

fig.add_trace(
    go.Scatter(x= np.linspace(0,4,X_ka01.shape[0]), y=X_ka01, marker_color='firebrick', showlegend=False),
    row=3, col=1
)
fig.update_yaxes(title_text="OR artificial", row=3, col=1)

fig.add_trace(
    go.Scatter(x= np.linspace(0,4,X_ka04.shape[0]), y=X_ka04, marker_color='firebrick', showlegend=False),
    row=4, col=1
)
fig.update_yaxes(title_text="OR real", row=4, col=1)

fig.add_trace(
    go.Scatter(x= np.linspace(0,4,X_ki01.shape[0]), y=X_ki01, marker_color='steelblue', showlegend=False),
    row=5, col=1
)
fig.update_yaxes(title_text="IR artificial", row=5, col=1)

fig.add_trace(
    go.Scatter(x= np.linspace(0,4,X_ki04.shape[0]), y=X_ki04, marker_color='steelblue', showlegend=False),
    row=6, col=1
)
fig.update_yaxes(title_text="IR real", row=6, col=1)

fig.update_xaxes(title_text="Seconds (s)", row=6, col=1)

# Frecuencia
freqk001 = X_k001.shape[0]/4
xfk001 = fftfreq(X_k001.shape[0],1/freqk001)
fig.add_trace(
    go.Scatter(x= xfk001[:X_k001.shape[0]//2], y=np.abs(fft(X_k001))[0:X_k001.shape[0]//2], marker_color='green', showlegend=False),
    row=1, col=2
)

freqk002 = X_k002.shape[0]/4
xfk002 = fftfreq(X_k002.shape[0],1/freqk002)
fig.add_trace(
    go.Scatter(x= xfk002[:X_k002.shape[0]//2], y=np.abs(fft(X_k002))[0:X_k002.shape[0]//2], marker_color='green', showlegend=False),
    row=2, col=2
)

freqka01 = X_ka01.shape[0]/4
xfka01 = fftfreq(X_ka01.shape[0],1/freqka01)
fig.add_trace(
    go.Scatter(x= xfka01[:X_ka01.shape[0]//2], y=np.abs(fft(X_ka01))[0:X_ka01.shape[0]//2], marker_color='firebrick', showlegend=False),
    row=3, col=2
)

freqka04 = X_ka04.shape[0]/4
xfka04 = fftfreq(X_ka04.shape[0],1/freqka04)
fig.add_trace(
    go.Scatter(x= xfka04[:X_ka04.shape[0]//2], y=np.abs(fft(X_ka04))[0:X_ka04.shape[0]//2], marker_color='firebrick', showlegend=False),
    row=4, col=2
)

freqki01 = X_ki01.shape[0]/4
xfki01 = fftfreq(X_ki01.shape[0],1/freqki01)
fig.add_trace(
    go.Scatter(x= xfki01[:X_ki01.shape[0]//2], y=np.abs(fft(X_ki01))[0:X_ki01.shape[0]//2], marker_color='steelblue', showlegend=False),
    row=5, col=2
)

freqki04 = X_ki04.shape[0]/4
xfki04 = fftfreq(X_ki04.shape[0],1/freqki04)
fig.add_trace(
    go.Scatter(x= xfki04[:X_ki04.shape[0]//2], y=np.abs(fft(X_ki04))[0:X_ki04.shape[0]//2], marker_color='steelblue', showlegend=False),
    row=6, col=2
)

fig.update_xaxes(title_text="Frequency (Hz)", row=6, col=2)

fig.update_layout(height=1000, width=800)
fig.show()