# Loading the important libraries

In [14]:
from scipy.io import loadmat
import glob
import os
import pandas as pd
import numpy as np

# Preprocessing

## Loading the files with data

Exploration of our files of data

In [3]:
mat_files = glob.glob('data/row_data/*.mat')

In [35]:
for file in mat_files:
    data = loadmat(mat_files[0])
    keys = data.keys()
    for key in keys:
        if not key.startswith('__'):
            if data[key] is None:
                print(f"{key}: None")
        


In [5]:
for key in data:
    if not key.startswith('__'):  # Skip metadata
        print(f"{key}: {type(data[key])}, shape: {getattr(data[key], 'shape', 'N/A')}")

data: <class 'numpy.ndarray'>, shape: (512000, 1)
fs: <class 'numpy.ndarray'>, shape: (1, 1)
rpm: <class 'numpy.ndarray'>, shape: (256000, 1)
ru: <class 'numpy.ndarray'>, shape: (256000, 1)


In [9]:
display(data)

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Fri Aug 26 22:26:46 2022',
 '__version__': '1.0',
 '__globals__': [],
 'data': array([[-0.34357557],
        [-0.07140313],
        [ 0.18997874],
        ...,
        [-1.82823065],
        [-0.59752596],
        [ 0.74405342]], shape=(512000, 1)),
 'fs': array([[24.93]]),
 'rpm': array([[-1.23141266e-04],
        [-2.18301926e-04],
        [-1.76079497e-04],
        ...,
        [ 1.45409845e+03],
        [ 1.45409941e+03],
        [ 1.45410047e+03]], shape=(256000, 1)),
 'ru': array([[-0.00314207],
        [-0.00244709],
        [-0.00260559],
        ...,
        [-0.65266762],
        [ 0.25275306],
        [-0.22923205]], shape=(256000, 1))}

Now we explored what is the structure of the .mat files and we can extract the important infos. 

In [51]:
def load_file(filepath):
    response = dict()
    mat = loadmat(filepath)
    signal = mat.get('data').squeeze()
    fs = float(mat.get('fs').squeeze())
    if mat.get('rpm') is not None:
        rpm = mat.get('rpm').squeeze()
    else:
        rpm = []
    if mat.get('ru') is not None:
        rpm = mat.get('ru').squeeze()
    else:
        rpm = []

    label = os.path.basename(filepath).split('.')[0]  
    response['signal'] = signal
    response['fs'] = fs 
    response['rpm'] = rpm
    response['ru'] = ru
    response['label'] = label
    return response

In [38]:
def segment_signal(signal, window_size=1024, step=512):
    if signal is not None:
       return [signal[i:i+window_size] for i in range(0, len(signal) - window_size + 1, step)]
    else:
        return []

Creating a pandas dataframe with all the infos

In [54]:

rows = []

for filepath in mat_files:
    matfile = load_file(filepath)
    segments = segment_signal(matfile['signal'])

    for seg in segments:
        row = {
            "label": matfile['label'],
            "fs": matfile['fs'],
            "rpm": matfile['rpm'],
            "ru": matfile['ru'],
            "segment": seg  
        }
        rows.append(row)

df = pd.DataFrame(rows)

df.head()

Unnamed: 0,label,fs,rpm,ru,segment
0,B500,24.93,"[-0.0031420730399999985, -0.002447086859999999...","[-0.0031420730399999985, -0.002447086859999999...","[-0.34357556658, -0.07140312793, 0.18997873582..."
1,B500,24.93,"[-0.0031420730399999985, -0.002447086859999999...","[-0.0031420730399999985, -0.002447086859999999...","[-0.23173765893, -0.055473313120000006, 0.0055..."
2,B500,24.93,"[-0.0031420730399999985, -0.002447086859999999...","[-0.0031420730399999985, -0.002447086859999999...","[-0.67900394035, -0.89501661856, 0.09341223502..."
3,B500,24.93,"[-0.0031420730399999985, -0.002447086859999999...","[-0.0031420730399999985, -0.002447086859999999...","[-0.046475071, 0.12785062915, 0.53091213170000..."
4,B500,24.93,"[-0.0031420730399999985, -0.002447086859999999...","[-0.0031420730399999985, -0.002447086859999999...","[0.14569470414000002, 0.35809223494000003, 0.0..."
