In [1]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models,utils
import matplotlib.pyplot as plt
import numpy as np
import librosa 
import librosa.display
import IPython.display as ipd
import resampy
from resampy import resample

In [2]:
import pandas as pd
import os 


# Root folder of the audio clips and the CSV table that lists them
# (file name, class id, class name and fold number per clip).
audio_dataset_path = 'audio_dataset/'
metadata = pd.read_csv('metadata3.csv')

Unnamed: 0,file_name_1,classid,class,fold
0,bear_1.wav,1,bear_fault,1
1,bear_10.wav,1,bear_fault,1
2,bear_11.wav,1,bear_fault,1
3,bear_12.wav,1,bear_fault,1
4,bear_13.wav,1,bear_fault,1


In [3]:
def features_extractor(file):
    """Return a fixed-length MFCC feature vector for one audio file.

    The clip is loaded with librosa, 40 MFCCs are computed per frame, and the
    coefficients are averaged over time, so every clip maps to a vector of
    length 40 regardless of its duration.

    Parameters
    ----------
    file : str or path-like
        Path of the audio file to load.

    Returns
    -------
    numpy.ndarray
        Time-averaged MFCCs, one value per coefficient (length 40).
    """
    # Load from the argument. Reading a notebook-level `file_name` variable
    # here would make the function depend on hidden kernel state.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # Transpose to (frames, coefficients) and average over the frame axis.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    return mfccs_scaled_features

In [4]:
from tqdm import tqdm 


# Build one [mfcc_vector, class_label] record per row of the metadata table.
# The absolute dataset root does not change inside the loop, so resolve it once.
dataset_root = os.path.abspath(audio_dataset_path)

extracted_features = []
# iterrows() is a generator with no length; passing `total` lets tqdm show a
# real progress bar with percentage and ETA instead of a bare counter.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips live under <root>/fold<N>/<file name>; let os.path.join insert the
    # separators rather than concatenating a hard-coded '/'.
    file_name = os.path.join(dataset_root, 'fold' + str(row["fold"]), str(row["file_name_1"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])

210it [01:18,  2.68it/s]


In [5]:
# Tabulate the [feature_vector, label] pairs collected by the extraction loop.
extracted_features_df = pd.DataFrame(
    extracted_features,
    columns=['feature', 'class'],
)

In [6]:
# Split the table into model inputs and targets:
# X stacks the per-clip feature vectors, y holds the class names.
X = np.array(extracted_features_df['feature'].to_list())
y = np.array(extracted_features_df['class'].to_list())


In [7]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
# Map class names to integer ids, then expand the ids to one-hot rows.
# The fitted encoder is kept so predictions can be mapped back to class names.
labelencoder = LabelEncoder()
class_ids = labelencoder.fit_transform(y)
y = to_categorical(class_ids)

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the clips; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [12]:
# Number of output classes = width of the one-hot encoded target matrix.
num_labels = y.shape[1]

In [13]:
# Fully connected classifier over the 40-value feature vectors:
# three hidden Dense blocks (each followed by ReLU and 50% dropout),
# then a softmax layer with one unit per class.
model = Sequential([
    # first layer
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    # second layer
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    # third layer
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    # final layer
    Dense(num_labels),
    Activation('softmax'),
])

In [14]:
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
#training the model

from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 100
num_batch_size = 32

checkpoint_path = 'saved_models/audio_classification.hdf5'
# Create the checkpoint folder up front so the first save cannot fail on a
# fresh checkout where 'saved_models/' does not exist yet.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Keep only the weights of the epoch with the best validation loss.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the held-out test split is also used as validation data, so the
# checkpoint is selected on the test set - consider a separate validation split.
history = model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs,
                    validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
# Report the elapsed time; previously it was computed but never shown.
print("Training completed in time: ", duration)

Epoch 1/100
1/6 [====>.........................] - ETA: 3s - loss: 22.8174 - accuracy: 0.3438
Epoch 1: val_loss improved from inf to 4.07251, saving model to saved_models\audio_classification.hdf5
Epoch 2/100
1/6 [====>.........................] - ETA: 0s - loss: 14.8841 - accuracy: 0.3438
Epoch 2: val_loss did not improve from 4.07251
Epoch 3/100
1/6 [====>.........................] - ETA: 0s - loss: 18.0335 - accuracy: 0.3125
Epoch 3: val_loss improved from 4.07251 to 1.53123, saving model to saved_models\audio_classification.hdf5
Epoch 4/100
1/6 [====>.........................] - ETA: 0s - loss: 11.7315 - accuracy: 0.3750
Epoch 4: val_loss improved from 1.53123 to 0.82973, saving model to saved_models\audio_classification.hdf5
Epoch 5/100
1/6 [====>.........................] - ETA: 0s - loss: 9.3476 - accuracy: 0.4062
Epoch 5: val_loss improved from 0.82973 to 0.72130, saving model to saved_models\audio_classification.hdf5
Epoch 6/100
1/6 [====>.........................] - ETA: 0s -

Epoch 27/100
1/6 [====>.........................] - ETA: 0s - loss: 1.3900 - accuracy: 0.5625
Epoch 27: val_loss did not improve from 0.37162
Epoch 28/100
1/6 [====>.........................] - ETA: 0s - loss: 2.0727 - accuracy: 0.6250
Epoch 28: val_loss did not improve from 0.37162
Epoch 29/100
1/6 [====>.........................] - ETA: 0s - loss: 0.9128 - accuracy: 0.7188
Epoch 29: val_loss improved from 0.37162 to 0.35698, saving model to saved_models\audio_classification.hdf5
Epoch 30/100
1/6 [====>.........................] - ETA: 0s - loss: 1.2576 - accuracy: 0.6875
Epoch 30: val_loss improved from 0.35698 to 0.35163, saving model to saved_models\audio_classification.hdf5
Epoch 31/100
1/6 [====>.........................] - ETA: 0s - loss: 1.8742 - accuracy: 0.5625
Epoch 31: val_loss improved from 0.35163 to 0.34809, saving model to saved_models\audio_classification.hdf5
Epoch 32/100
1/6 [====>.........................] - ETA: 0s - loss: 0.5658 - accuracy: 0.8438
Epoch 32: val_lo

Epoch 53/100
1/6 [====>.........................] - ETA: 0s - loss: 0.5845 - accuracy: 0.8438
Epoch 53: val_loss improved from 0.23808 to 0.23475, saving model to saved_models\audio_classification.hdf5
Epoch 54/100
1/6 [====>.........................] - ETA: 0s - loss: 0.5864 - accuracy: 0.8750
Epoch 54: val_loss improved from 0.23475 to 0.23027, saving model to saved_models\audio_classification.hdf5
Epoch 55/100
1/6 [====>.........................] - ETA: 0s - loss: 0.4874 - accuracy: 0.7812
Epoch 55: val_loss improved from 0.23027 to 0.23007, saving model to saved_models\audio_classification.hdf5
Epoch 56/100
1/6 [====>.........................] - ETA: 0s - loss: 0.3997 - accuracy: 0.8438
Epoch 56: val_loss did not improve from 0.23007
Epoch 57/100
1/6 [====>.........................] - ETA: 0s - loss: 0.2602 - accuracy: 0.8750
Epoch 57: val_loss improved from 0.23007 to 0.22875, saving model to saved_models\audio_classification.hdf5
Epoch 58/100
1/6 [====>.........................] 

In [18]:
def classify(filename="gear_2.wav"):
    """Predict the class label of a single audio file with the trained model.

    The file is turned into a time-averaged 40-MFCC vector (the same recipe
    used to build the training features), passed through ``model`` and the
    arg-max class index is mapped back to its name with ``labelencoder``.

    Parameters
    ----------
    filename : str, optional
        Audio file to classify. Defaults to ``"gear_2.wav"`` so existing
        ``classify()`` calls keep working.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted class name.
    """
    audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    print(mfccs_scaled_features)
    # The model expects a batch dimension: reshape the vector to (1, n_features).
    mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)
    print(mfccs_scaled_features)
    print(mfccs_scaled_features.shape)
    predicted_label = np.argmax(model.predict(mfccs_scaled_features), axis=-1)
    print(predicted_label)
    prediction_class = labelencoder.inverse_transform(predicted_label)
    # Return the result; a bare expression inside a function is discarded.
    return prediction_class

[-51.242413    49.093845   -18.26149      3.055037   -22.694397
   4.807211   -19.459581    14.904933   -29.962736    -3.8382246
  -9.103179    17.830744   -22.41513     14.5221     -16.380196
   4.0303774  -18.906586     9.933028    -8.045572     7.4074197
  -3.130435     7.2118545  -15.448091     4.136127    -6.293926
   9.502879    -5.3641586    6.619323     1.5061382   -1.7083994
  -3.285543     7.676069    -9.124614    -0.658288    -0.22298487
   1.3961397   -6.960764     4.032412     2.008322    -2.8119197 ]
[[-51.242413    49.093845   -18.26149      3.055037   -22.694397
    4.807211   -19.459581    14.904933   -29.962736    -3.8382246
   -9.103179    17.830744   -22.41513     14.5221     -16.380196
    4.0303774  -18.906586     9.933028    -8.045572     7.4074197
   -3.130435     7.2118545  -15.448091     4.136127    -6.293926
    9.502879    -5.3641586    6.619323     1.5061382   -1.7083994
   -3.285543     7.676069    -9.124614    -0.658288    -0.22298487
    1.3961397   -6.9

array(['gear_fault'], dtype='<U10')