Importing all Necessary Modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import io, misc

In [3]:
duration_samples = 0.2 #seconds
size_max = 120000
frequence = 10000 #Hertz

In [4]:
file = "dataset.mat"

dataset = scipy.io.loadmat(file)

df_normal = dataset["normal"].reshape(-1)[:size_max]
df_inner = dataset["inner"].reshape(-1)[:size_max]
df_roller = dataset["roller"].reshape(-1)[:size_max]
df_outer = dataset["outer"].reshape(-1)[:size_max]

data = [df_normal,df_inner,df_roller,df_outer]

Hearing the Normal Engine sound

In [5]:
import IPython
IPython.display.Audio(df_normal, rate=frequence)

0- normal defect
1- inner defect
2- roller defect
3- outer defect

There is 4 audios of 12 sec of each type and we are creating a dataset from this 4 audios 
taking 0.2sec each. so total no of data point= 48/0.2= 240
now 70% is train and 30% is test data

In [6]:
def load_data(train=0.7):
    type_track = 0
    n_samples_each = int(size_max/frequence/duration_samples)
#     120000/10000/0.2= 60 for each type
    audios_train = []
    audios_test = []
    number_train=int(n_samples_each*0.7)

    for track in data:
        for i in range(0,n_samples_each):
            t1 = int(i*frequence*duration_samples)
            t2 = int((i+1)*frequence*duration_samples)
            new = list(track)[t1:t2]
            if i<number_train:
                audios_train.append((type_track,new))
            else:
                audios_test.append((type_track,new))
        type_track = type_track+1
    np.random.seed(1)
    np.random.shuffle(audios_train)
    np.random.seed(1)
    np.random.shuffle(audios_test)
    return [i[1] for i in audios_train], [i[1] for i in audios_test], [i[0] for i in audios_train], [i[0] for i in audios_test]

In [7]:
audios_train, audios_test, label_train, label_test = load_data()

In [None]:
# Hearing a sample sound we are using for training our model

In [8]:
import IPython
IPython.display.Audio(audios_train[2], rate=frequence)

In [34]:
# checking the typr of our training data ---> list or numpy arrary to avoid parsing complicacy

In [35]:
type(audios_train[0])

list

In [10]:
# using librosa module we are doing fetaure ectraction from the audio samples
# sr=sample rate
# mfcc=> Mel-Frequency Cepstral Coefficients (MFCC)
# sr=sampling rate
#  the number of samples per second (or per other unit) taken from a continuous signal
# to make a discrete or digital signal.


In [11]:
import librosa
# changing our list datatypr to numpy array
y= np.array(audios_train[0])
sr=frequence
# audio_data, sample_rate= librosa.load(y=y,sr=sr)
mfcc= librosa.feature.mfcc(y=y,sr=sr)
mfcc

  return f(*args, **kwargs)


array([[-122.66910248,  -92.38567051,  -90.14709196,  -95.04445657],
       [  65.11893795,   65.22694178,   65.74420185,   62.49582789],
       [ -23.78778531,  -22.8203103 ,  -24.81519375,  -24.33575148],
       [  33.9697855 ,   36.7075763 ,   38.60036311,   39.16014693],
       [  19.13609532,   18.71309184,   18.6506835 ,   20.118161  ],
       [ -14.59032857,  -14.31039357,  -13.29117492,  -11.43985052],
       [  -3.7035524 ,   -6.63739772,  -10.82981239,  -11.13944179],
       [   7.87573437,    2.31676845,    1.22366204,    2.58736084],
       [ -20.97423589,  -18.75185994,  -18.8267292 ,  -15.36336298],
       [  11.01888393,    8.41455642,    2.4582269 ,    6.09945736],
       [   7.08547832,    6.72868892,    4.85371586,    4.597242  ],
       [  -8.27849928,  -13.05751705,  -12.613952  ,  -12.84161621],
       [ -12.54682523,  -15.42040138,  -13.41485823,  -13.03844075],
       [  -6.00353258,   -5.68629068,   -6.50225963,   -4.82829332],
       [   5.21023325,    4.028929

In [12]:
mfcc.shape

(20, 4)

In [None]:
# writing a function to extract features using mfcc and scaling the features about mean

In [13]:
def feature_extract(l):
    y=np.array(l)
    sr=frequence
    mfcc= librosa.feature.mfcc(y=y,sr=sr)
    mfcc_scaled=np.mean(mfcc.T, axis=0)
    
    return mfcc_scaled

In [36]:
# using this function for each training data we are extracting the features and correcponding label 
# ans addiing them to extracted featres list

In [14]:
extracted_features=[]
i=0
for audio in audios_train:
    data=feature_extract(audio)
    label= label_train[i]
    extracted_features.append([data,label])
    i=i+1

In [37]:
# changing our list dataset to a pandas dataframe object for better visualization

In [15]:
df= pd.DataFrame(extracted_features, columns=['feature','class'])
df.head()

Unnamed: 0,feature,class
0,"[-100.06158038013302, 64.64647736755063, -23.9...",0
1,"[35.74615329316609, -53.1169484254411, 4.16197...",3
2,"[-101.55683725535413, 62.47573651477131, -24.4...",0
3,"[40.669772226180235, -51.51024941922913, 1.916...",3
4,"[-63.955624976208675, 13.54734617515508, -25.1...",2


In [16]:
# split the dataset into dependent and independent features
x=np.array(df['feature'].tolist())
y=np.array(df['class'].tolist())

In [17]:
x.shape

(168, 20)

In [52]:
y.shape

(168,)

In [38]:
# using one hot encoding to the labels so that we can train our ML Model

In [39]:
from keras.utils.np_utils import to_categorical  
import numpy as np
y_hot_train = to_categorical(y,num_classes =4)
y_hot_train[4]

array([0., 0., 1., 0.], dtype=float32)

In [40]:
# We can see that our y shape has changed after one hot encoding

In [41]:
y_hot_train.shape

(168, 4)

# Model Creation Using Neural Network

In [42]:
# importing necessary modules

In [21]:
import tensorflow as tf
print(tf.__version__)

2.9.1


In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics


In [23]:
### No of classes
num_labels=y_hot_train.shape[1]

In [24]:
# Dense()

In [25]:
model=Sequential()
###first layer
model.add(Dense(100,input_shape=(20,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###second layer
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###third layer
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dropout(0.5))

###final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [26]:
# model.summery()

In [27]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [28]:
## Trianing my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 100
num_batch_size = 32

# checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5', 
#                                verbose=1, save_best_only=True)
start = datetime.now()

model.fit(x,y_hot_train)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Training completed in time:  0:00:01.234282


In [44]:
# Model Accuracy is coming out to be 100% which is because pof overfitting due to less no of data

In [45]:
test_accuracy=model.evaluate(x,y_hot_train,verbose=0)
print(test_accuracy[1])

1.0


# Checking our model output for One individual test data

In [30]:
# taking a single data to just verify the prdiction of the model

In [46]:
arr=np.array(audios_test[0])

In [47]:
from scipy.io.wavfile import write
samplerate=22050
write("example.wav", samplerate,arr)

In [48]:
arr.shape

(2000,)

In [49]:
test_dummy= feature_extract(audios_test[0])

  return f(*args, **kwargs)


In [50]:
test_dummy.shape
test_dummy= test_dummy.reshape(1,-1)
test_dummy.shape

(1, 20)

In [55]:
ans= model.predict(test_dummy)
ans
# so predicted class is 1
# index of max of this array
print(ans)
# import torch
# print(torch.argmax(input))
pred_class = np.argmax(ans, axis=-1)
print(pred_class)

[[4.9432053e-04 9.9903119e-01 1.8565016e-04 2.8886180e-04]]
[1]


In [56]:
label_test[0]
# hence predicting correct label

1

# Saving this trained model as a hdf5 file and a pickle file so that we can use this model later

In [None]:
# import pickle

In [None]:
# pickle.dump(model,open('./model_audio.sav','wb'))

In [None]:
# tf.keras.models.save_model(model,'model_audio_final.hdf5')