In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from time import gmtime, strftime

In [2]:
import glob

cur_path = "D:/full/"

normal_file_names = glob.glob(cur_path + '/normal/*.csv',recursive=True)
imnormal_file_names = glob.glob(cur_path + '/*/*/*.csv',recursive=True)
imnormal_file_names_2 = glob.glob(cur_path + '/*/*/*/*.csv',recursive=True)


all_files = normal_file_names + imnormal_file_names + imnormal_file_names_2

columns_names = ['Tachometer'
                 ,'Underhang bearing accelerometer axial'
                 ,'Underhang bearing accelerometer radiale'
                 ,'Underhang bearing accelerometer tangential'
                 ,'Overhang bearing accelerometer axial'
                 ,'Overhang bearing accelerometer radiale'
                 ,'Overhang bearing accelerometer tangential'
                 ,'Microphone']

len(all_files)

1951

In [3]:
def readFiles(path_names):
    data = {}
    report_step = 30
    report_iter = 0
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'Start Processing Files...')
    for i in path_names:
        if ((report_iter % report_step) == 0):
            print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'Processing file number',report_iter,i)
        low_data = pd.read_csv(i,header=None)
        data[i] = low_data
        report_iter += 1
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'Finished processing files!!')
    return data

data_raw = readFiles(all_files)

2021-07-28 18:52:17 Start Processing Files...
2021-07-28 18:52:17 Processing file number 0 D:/full//normal\12.288.csv
2021-07-28 18:52:29 Processing file number 30 D:/full//normal\42.3936.csv
2021-07-28 18:52:39 Processing file number 60 D:/full\horizontal-misalignment\0.5mm\23.552.csv
2021-07-28 18:52:50 Processing file number 90 D:/full\horizontal-misalignment\0.5mm\53.248.csv
2021-07-28 18:53:01 Processing file number 120 D:/full\horizontal-misalignment\1.0mm\33.792.csv
2021-07-28 18:53:12 Processing file number 150 D:/full\horizontal-misalignment\1.5mm\14.9504.csv
2021-07-28 18:53:22 Processing file number 180 D:/full\horizontal-misalignment\1.5mm\45.4656.csv
2021-07-28 18:53:32 Processing file number 210 D:/full\horizontal-misalignment\2.0mm\25.6.csv
2021-07-28 18:53:43 Processing file number 240 D:/full\horizontal-misalignment\2.0mm\55.9104.csv
2021-07-28 18:53:53 Processing file number 270 D:/full\imbalance\10g\38.5024.csv
2021-07-28 18:54:04 Processing file number 300 D:/full\i

![image.png](attachment:d64eae14-5da9-4bfd-9854-0da63c512e9c.png)

In [6]:
from scipy import fft,stats
from sklearn.neighbors import KernelDensity

def calculateEntropy(data,filePath,_debug =False,bandwidth = 0.1, mode = 'decimal'):
    data_min = int(data.min()-1)
    data_max = int(data.max()+2)
    if(_debug):
        print('------'*10)
        print('File:',filePath)
        print('Min:',data_min,'Max:',data_max)
        print('Creating Kernel...')
    kde_model = KernelDensity(bandwidth = bandwidth, kernel='gaussian')
    kde_model.fit(data.values.reshape(-1, 1))
    if(_debug):
        print('Kernel fitted!')
    if (mode == 'integer'):
        samples_data = np.array(range(data_min-2,data_max+2))
        factor = 1
    else:
        step = (data_max - data_min) * 10 + 1 
        samples_data = np.linspace(data_min,data_max,step)
        factor = 10
    if(_debug):
        print('Getting Probabilities...')
    log_probabilities = kde_model.score_samples(samples_data.reshape((len(samples_data), 1)))
    probabilities = np.exp(log_probabilities)/factor
    if(_debug):
        print('Probabilities calculated!!',probabilities)
    entropy = 0
    for p in probabilities[probabilities >= 1e-10 ]:
        entropy += p*np.log(1/p)
    
    if(_debug):
        print(samples_data)
        print('Entropy is',entropy,'and total probability is',probabilities.sum())
        fig, ax = plt.subplots(constrained_layout=True)
        ax.plot(samples_data[:], probabilities, color='r')
        ax2 = ax.twinx()
        ax2.hist(data.values, bins=50, density=True, label = 'histogram')
        plt.show()
    return entropy

def calculateMetricsAlzghoul(data,filePath, SAMPLE_RATE = 50000, DURATION = 5, _debug = False):

    mean = data.mean()
    entropy = calculateEntropy(data,filePath,_debug)
    kurtosis = stats.kurtosis(data)
    if(_debug):
        print('Mean',mean)
        print('Kustosis',kurtosis)
        print('------'*10)
        print()
        print()
    #plt.figure(figsize=(15,5))
    #plt.plot(xf, np.abs(yf))
    #plt.xlim(0,1000)
    #plt.xticks(range(0,1000,50))
    #plt.show()
    return [mean, entropy ,kurtosis]

def calculateMainFrequency(data,filePath, SAMPLE_RATE = 50000, DURATION = 5, _debug = False):
    # Normal Sequences
    # There are 49 sequences without any fault, 
    # each with a fixed rotation speed within the range from 737 rpm to 3686 
    # rpm with steps of approximately 60 rpm.

    # Number of samples
    N = SAMPLE_RATE * DURATION
    
    if(_debug):
        print('------'*10)
        print('File:',filePath)
    yf = fft.rfft(data.values)
    xf = fft.rfftfreq(N, 1/SAMPLE_RATE)
    freq = xf[np.argpartition(np.abs(yf), -4)[-4:]].min()
    if(_debug):
        print('Frequency is',freq, 'in hertz and', freq*60,'in RPM',)
        print('------'*10)
        print()
        print()
    #plt.figure(figsize=(15,5))
    #plt.plot(xf, np.abs(yf))
    #plt.xlim(0,1000)
    #plt.xticks(range(0,1000,50))
    #plt.show()
    return freq

![image.png](attachment:a55a4976-2b16-4cb6-89fc-956a3856da9a.png)

![image.png](attachment:be528184-6090-482e-9635-e03f409a466b.png)

In [7]:
def calculateMetricsDongyang(data, filePath, _debug = False):
    N = len(data)
    rms = np.sqrt(np.sum(data**2)/N)
    peak_peak = data.min() - data.max()
    sum_abs = (1/N)*np.sum(np.abs(data))
    shape_indicator = rms / sum_abs
    max_abs = np.max(np.abs(data))
    impulse_indicator = max_abs / sum_abs
    crest_indicator = max_abs/rms
    clearance_indicator = max_abs/(np.sum(np.sqrt(np.abs(data)))/N)**2
    skewness = stats.skew(data)
    if(_debug):
        print('------'*10)
        print('File:',filePath)
        print('Root Mean Square is',rms)
        print('Peak-peak is',peak_peak)
        print('Shape Indicator is',shape_indicator)
        print('Impulse Indicator is',impulse_indicator)
        print('Crest Indicator is',crest_indicator)
        print('Clearance Indicator is',clearance_indicator)
        print('skewness is',skewness)
        print('------'*10)
        print()
        print()
    return [rms,peak_peak,shape_indicator,impulse_indicator,crest_indicator,clearance_indicator,skewness]

In [8]:
# 50 kHz sampling rate during 5 s
SAMPLE_RATE = 50000  #columnrtz
DURATION = 5  # Seconds

df_column_names = [columns_names[0] + ' main frequency', 'filePath']
metrics = ['mean', 'entropy' ,'kurtosis',
          'rms','peak to peak','shape indicator','impulse indicator','crest indicator','clearance indicator','skewness']

for column in columns_names:
    for m in metrics:
        df_column_names.append(column + ' ' + m)

report_step = 10
report_iter = 0

result = []
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'Start Processing Files...')
for filePath in all_files:
#samples = [0,5]
#for sample in samples:
    #filePath = normal_file_names[sample]
    if ((report_iter % report_step) == 0):
        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'Processing file number',report_iter,filePath)
    data = data_raw[filePath]
    freq = calculateMainFrequency(data[0],filePath)
    row = [freq,filePath]
    for column in data.columns:
        start = calculateMetricsAlzghoul(data[column],filePath)
        end = calculateMetricsDongyang(data[column],filePath)
        row.extend(start)
        row.extend(end)
    result.append(row)
    report_iter += 1

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'Finished processing files!!')
df = pd.DataFrame(columns=df_column_names, data = result)
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()),'DataSet Create!!')
df.to_csv(cur_path + 'mycsvfile.csv')
df.head()

2021-07-28 19:05:17 Start Processing Files...
2021-07-28 19:05:17 Processing file number 0 D:/full//normal\12.288.csv
2021-07-28 19:08:28 Processing file number 10 D:/full//normal\22.3232.csv
2021-07-28 19:11:35 Processing file number 20 D:/full//normal\32.9728.csv
2021-07-28 19:14:44 Processing file number 30 D:/full//normal\42.3936.csv
2021-07-28 19:17:51 Processing file number 40 D:/full//normal\53.8624.csv
2021-07-28 19:20:53 Processing file number 50 D:/full\horizontal-misalignment\0.5mm\13.5168.csv
2021-07-28 19:23:57 Processing file number 60 D:/full\horizontal-misalignment\0.5mm\23.552.csv
2021-07-28 19:27:05 Processing file number 70 D:/full\horizontal-misalignment\0.5mm\33.1776.csv
2021-07-28 19:30:17 Processing file number 80 D:/full\horizontal-misalignment\0.5mm\43.008.csv
2021-07-28 19:33:26 Processing file number 90 D:/full\horizontal-misalignment\0.5mm\53.248.csv
2021-07-28 19:36:41 Processing file number 100 D:/full\horizontal-misalignment\1.0mm\13.7216.csv
2021-07-28 1

2021-07-29 00:08:48 Processing file number 920 D:/full\overhang\ball_fault\0g\53.0432.csv
2021-07-29 00:18:00 Processing file number 930 D:/full\overhang\ball_fault\20g\13.5168.csv
2021-07-29 00:23:11 Processing file number 940 D:/full\overhang\ball_fault\20g\23.7568.csv
2021-07-29 00:29:21 Processing file number 950 D:/full\overhang\ball_fault\20g\33.3824.csv
2021-07-29 00:35:37 Processing file number 960 D:/full\overhang\ball_fault\35g\19.456.csv
2021-07-29 00:41:59 Processing file number 970 D:/full\overhang\ball_fault\35g\29.4912.csv
2021-07-29 00:47:51 Processing file number 980 D:/full\overhang\ball_fault\6g\19.2512.csv
2021-07-29 00:53:13 Processing file number 990 D:/full\overhang\ball_fault\6g\29.2864.csv
2021-07-29 00:59:42 Processing file number 1000 D:/full\overhang\ball_fault\6g\39.7312.csv
2021-07-29 01:08:18 Processing file number 1010 D:/full\overhang\ball_fault\6g\49.5616.csv
2021-07-29 01:19:12 Processing file number 1020 D:/full\overhang\cage_fault\0g\16.1792.csv
202

2021-07-29 06:20:28 Processing file number 1820 D:/full\underhang\outer_race\20g\16.9984.csv
2021-07-29 06:23:25 Processing file number 1830 D:/full\underhang\outer_race\20g\28.0576.csv
2021-07-29 06:26:35 Processing file number 1840 D:/full\underhang\outer_race\20g\37.888.csv
2021-07-29 06:30:16 Processing file number 1850 D:/full\underhang\outer_race\20g\47.3088.csv
2021-07-29 06:34:18 Processing file number 1860 D:/full\underhang\outer_race\20g\57.5488.csv
2021-07-29 06:38:00 Processing file number 1870 D:/full\underhang\outer_race\35g\19.2512.csv
2021-07-29 06:41:04 Processing file number 1880 D:/full\underhang\outer_race\35g\29.4912.csv
2021-07-29 06:44:38 Processing file number 1890 D:/full\underhang\outer_race\35g\39.3216.csv
2021-07-29 06:48:45 Processing file number 1900 D:/full\underhang\outer_race\35g\48.9472.csv
2021-07-29 06:52:06 Processing file number 1910 D:/full\underhang\outer_race\6g\21.0944.csv
2021-07-29 06:55:05 Processing file number 1920 D:/full\underhang\outer_

Unnamed: 0,Tachometer main frequency,filePath,Tachometer mean,Tachometer entropy,Tachometer kurtosis,Tachometer rms,Tachometer peak to peak,Tachometer shape indicator,Tachometer impulse indicator,Tachometer crest indicator,...,Microphone mean,Microphone entropy,Microphone kurtosis,Microphone rms,Microphone peak to peak,Microphone shape indicator,Microphone impulse indicator,Microphone crest indicator,Microphone clearance indicator,Microphone skewness
0,12.0,D:/full//normal\12.288.csv,0.00293,2.155789,4.304473,1.585572,-6.2337,1.617277,5.209933,3.221424,...,0.008016,1.818507,0.065309,0.112425,-0.77416,1.221686,5.872358,4.806766,6.75248,0.68555
1,12.8,D:/full//normal\13.1072.csv,0.005695,2.15349,4.16897,1.596103,-6.2125,1.60971,5.110197,3.174607,...,0.007704,1.834837,0.241943,0.116036,-0.86253,1.228879,6.391702,5.201244,7.374075,0.740827
2,14.0,D:/full//normal\14.336.csv,-0.003637,2.150613,4.138177,1.598855,-6.267,1.595314,5.08182,3.185467,...,0.017536,1.910258,0.063221,0.132987,-0.96741,1.236221,6.284034,5.083263,7.279235,0.69469
3,14.8,D:/full//normal\15.1552.csv,0.00255,2.149442,3.967702,1.612556,-6.3006,1.589255,5.00019,3.146247,...,0.019339,1.919532,0.018246,0.135144,-0.96375,1.2352,6.260704,5.068577,7.249635,0.674586
4,15.8,D:/full//normal\16.1792.csv,0.0047,2.162512,3.813742,1.625237,-6.373,1.57944,4.95833,3.139296,...,0.014026,1.880101,-0.100899,0.125709,-0.83014,1.218678,5.577019,4.576287,6.405791,0.616928
