In [1]:
import os
import numpy as np
import librosa
import scipy
import math
import pickle

In [2]:
#Convert to pcm16
def convert_urban_pcm24_to_pcm16():
    """Re-encode the UrbanSound8K folds from PCM_24 to PCM_16.

    For each of the ten ``fold1`` .. ``fold10`` source directories this
    calls ``create_directory`` on the matching destination directory and
    hands every ``.wav`` entry to ``convert_wav`` with ``subtype='PCM_16'``.

    A running count is printed after each file; the full list of
    destination paths and its length are printed once at the end.
    Nothing is returned.
    """
    fold_numbers = range(1, 11)
    sources = ['D:/Thesis/UrbanSound8K/audio/fold{:d}'.format(k) for k in fold_numbers]
    targets = ['D:/Thesis/UrbanSound8K-16bit/audio/fold{:d}'.format(k) for k in fold_numbers]

    converted_wav_paths = []
    for fold_in, fold_out in zip(sources, targets):
        create_directory(fold_out)
        for entry in os.listdir(fold_in):
            # Only wav files are converted; anything else in the fold is skipped.
            if not entry.endswith('.wav'):
                continue
            wav_in = os.path.join(fold_in, entry)
            wav_out = os.path.join(fold_out, entry)
            convert_wav(wav_in, wav_out, subtype='PCM_16')
            converted_wav_paths.append(wav_out)
            print('converted count:', len(converted_wav_paths))
    print(converted_wav_paths, len(converted_wav_paths))


def arange_urban_sound_file_by_class():
    """Copy UrbanSound8K clips into one folder per sound class.

    Every ``*.wav`` file found in the ten fold directories is copied (via
    ``copy_file``) to ``<dst_dir>/<class_name>/<file name>``. The class is
    taken from the second dash-separated field of the file name, which is
    used as an index into ``CLASSES``. ``create_directory`` is called on the
    class folder before each copy. Nothing is returned.
    """
    # Imported here because the notebook's import cell does not bring in glob.
    import glob

    # NOTE(review): the sources are the original folds, not the
    # 'UrbanSound8K-16bit/audio' folds written by the PCM_16 conversion,
    # although the destination lives under the 16-bit tree. Confirm intent.
    src_paths = ["D:/Thesis/UrbanSound8K/audio/fold{:d}".format(i+1) for i in range(10)]
    dst_dir = 'D:/Thesis/UrbanSound8K-16bit/audio-classified'
    CLASSES = [
        'air conditioner',
        'car horn',
        'children playing',
        'dog bark',
        'drilling',
        'engine idling',
        'gun shot',
        'jackhammer',
        'siren',
        'street music']
    # Folder names use underscores instead of spaces.
    CLASSES_STRIPED = [c.replace(' ', '_') for c in CLASSES]
    for src in src_paths:
        for fn in glob.glob(os.path.join(src, "*.wav")):
            # basename handles whichever path separator the platform uses,
            # instead of assuming a backslash between folder and file name.
            file_name = os.path.basename(fn)
            lbl = int(file_name.split('-')[1])
            dst = '{dir}/{label}'.format(dir=dst_dir, label=CLASSES_STRIPED[lbl])
            create_directory(dst)
            copy_file(fn, '{dst}/{name}'.format(dst=dst, name=file_name))


In [None]:
# Dataset preparation: re-encode the folds to PCM_16, then sort the clips
# into one folder per class. Both helpers are defined in the cell above.
convert_urban_pcm24_to_pcm16()
arange_urban_sound_file_by_class()

In [3]:
# Manual preparation of the dataset folder referenced below:
#   - sort the folders by class and convert the audio to 16-bit PCM
#   - add the chimp sounds in a folder labelled "chimp"
#   - add the background sounds
# Root folder that is scanned recursively for audio files.
DATASET_16BIT_PATH = "D:/Thesis/UrbanSound8K-16bit/audio-classified"
# Alternative root (disabled):
#DATASET_16BIT_PATH = "D:/Thesis/Keras/Attempt5/subset"

In [4]:
def getListOfFiles(dirpath):
    """Recursively collect the path of every file found under ``dirpath``.

    Entries are visited in the order ``os.listdir`` reports them. When an
    entry is a directory, the files below it are gathered recursively and
    placed at that position in the result.

    Returns a flat list of paths built with ``os.path.join``.
    """
    collected = []
    for name in os.listdir(dirpath):
        candidate = os.path.join(dirpath, name)
        if not os.path.isdir(candidate):
            collected.append(candidate)
            continue
        # Sub-directory: splice its files in at this point.
        collected.extend(getListOfFiles(candidate))
    return collected

def urban_labels(Y, fpaths):
    """Append the class id of every file in ``fpaths`` to ``Y``.

    The class id is the second dash-separated field of the file name
    (``<id>-<class>-...``), parsed as an int.

    Parameters
    ----------
    Y : array-like
        Existing labels; the new ones are appended after them.
    fpaths : iterable of str
        File paths whose names follow the dash-separated pattern above.

    Returns
    -------
    The flattened concatenation of ``Y`` and the parsed ids, as produced by
    ``np.append``. ``Y`` itself is returned unchanged when ``fpaths`` is empty.

    Raises
    ------
    IndexError, ValueError
        If a file name has no second dash-separated field or that field is
        not an integer.
    """
    new_labels = [int(os.path.split(p)[-1].split('-')[1]) for p in fpaths]
    if not new_labels:
        return Y
    # A single append: growing the array one element at a time copies it on
    # every iteration.
    return np.append(Y, new_labels)

def load_sound_files(file_paths, sr=11025, n_mfcc=40):
    """Return the MFCC vector of one audio file, averaged over frames.

    Parameters
    ----------
    file_paths : str
        Path of a single audio file (despite the plural name, the value is
        passed straight to ``librosa.load``).
    sr : int, optional
        Sample rate requested from ``librosa.load``. Defaults to 11025, the
        value previously hard-coded here.
    n_mfcc : int, optional
        Number of MFCC coefficients requested from ``librosa.feature.mfcc``.
        Defaults to 40, the value previously hard-coded here; the model in
        this notebook is built with ``input_shape=(40,)``.

    Returns
    -------
    numpy.ndarray
        Mean of the transposed MFCC matrix along its first axis.
    """
    signal, sample_rate = librosa.load(file_paths, sr=sr, res_type='kaiser_fast')
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfcc.T, axis=0)

In [5]:
# Every file below the dataset root, as a NumPy array of paths.
filepaths = np.asarray(getListOfFiles(DATASET_16BIT_PATH))
print("No of entries in our Dataset: ", filepaths.shape)

# Start from an empty array and append one class id per file, parsed from
# the file name by urban_labels.
Y = np.ndarray(0)
labels = urban_labels(Y, filepaths)
print("Labels : ",labels.shape)

# Persist both arrays with pickle. Note: the files are binary pickles even
# though they carry a .txt extension.
with open("filepaths.txt", "wb") as fp:
    pickle.dump(filepaths, fp)
    
with open("labels.txt", "wb") as fp:
    pickle.dump(labels, fp)

No of entries in our Dataset:  (11708,)
Labels :  (11708,)


In [5]:
# Extract one averaged MFCC vector per file, in the same order as `filepaths`.
raw = []
for i, wav_path in enumerate(filepaths):
    raw.append(load_sound_files(wav_path))
    # Progress message for every 100th file.
    if i % 100 == 0:
        print("Raw Sound loaded for :", wav_path)

#np.save("Attempt5/raw",np.asarray(raw))
#del raw

Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\100852-0-0-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\13230-0-0-22.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\146714-0-0-41.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\162103-0-0-14.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\177726-0-0-15.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\178686-0-0-43.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\189982-0-0-20.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\204240-0-0-23.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_conditioner\57320-0-0-5.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\air_cond

Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\engine_idling\62567-5-0-1.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\engine_idling\94710-5-0-1.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\gun_shot\145206-6-2-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\gun_shot\159710-6-0-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\gun_shot\197320-6-9-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\jackhammer\103074-7-4-6.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\jackhammer\105029-7-3-2.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\jackhammer\14772-7-2-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\jackhammer\165039-7-12-0.wav
Raw Sound loaded for : D:/Thesis/UrbanSound8K-16bit/audio-classified\jackhammer\177537-7-0-1.wav
Raw Sound loaded for : D:/Thesis

FileNotFoundError: [Errno 2] No such file or directory: 'Attempt5/raw.npy'

In [6]:
# On-disk cache of the extracted feature matrix. When the file exists it is
# loaded (as before); otherwise the features computed in the previous cell
# are written to it, so a fresh Restart & Run All no longer depends on a file
# left behind by an earlier session. Delete the file to force re-extraction.
RAW_CACHE_PATH = "Attempt5-NN/raw.npy"
if os.path.exists(RAW_CACHE_PATH):
    raw = np.load(RAW_CACHE_PATH)
else:
    os.makedirs(os.path.dirname(RAW_CACHE_PATH), exist_ok=True)
    raw = np.asarray(raw)
    np.save(RAW_CACHE_PATH, raw)

In [7]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Fixed seed so every run produces the same 80/20 split. The model is reloaded
# from "my_model.h5" and trained further in later cells; with an unseeded
# split, samples held out in one session could be trained on in the next,
# which would inflate the test metrics.
SPLIT_SEED = 42
XTrain, XTest, YTrain, YTest = train_test_split(
    raw, labels, test_size=0.2, random_state=SPLIT_SEED
)

# One-hot encode the class ids. 12 classes: the ten UrbanSound8K classes plus
# chimp (10) and background (11).
YTrain = to_categorical(YTrain, num_classes=12)
YTest = to_categorical(YTest, num_classes=12)

# The split arrays are used from here on; re-running this cell requires
# re-creating `raw` first.
del raw

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D, GlobalAveragePooling2D
from keras import optimizers
from keras.utils import np_utils
from sklearn import metrics 


In [9]:
# copied 
# Fully connected classifier: a 40-value feature vector in, 12 class
# probabilities out. Three identical hidden stages
# (Dense 256 -> ReLU -> Dropout 0.5) precede the softmax head.
try_model = Sequential([
    Dense(256, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),

    Dense(256),
    Activation('relu'),
    Dropout(0.5),

    Dense(256),
    Activation('relu'),
    Dropout(0.5),

    Dense(12),
    Activation('softmax'),
])

try_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
# Print the layer-by-layer summary of the compiled network.
try_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               10496     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
__________

In [14]:
# Resume from the checkpoint written by the save cell when one exists.
# Without it (fresh checkout / first run) keep the freshly compiled model
# instead of failing on a missing file.
if os.path.exists("my_model.h5"):
    try_model = load_model("my_model.h5")

In [15]:
# One epoch on the training split, validating on the held-out split.
try_model.fit(
    x=np.asarray(XTrain),
    y=np.asarray(YTrain),
    batch_size=32,
    epochs=1,
    validation_data=(np.asarray(XTest), np.asarray(YTest)),
)

Train on 9366 samples, validate on 2342 samples
Epoch 1/1


<keras.callbacks.History at 0x1b0cd27a828>

In [16]:
# Loss and accuracy on the training split.
train_evaluate = try_model.evaluate(
    x=np.asarray(XTrain), y=np.asarray(YTrain), batch_size=32
)
print(train_evaluate)

# Loss and accuracy on the held-out test split.
test_evaluate = try_model.evaluate(
    x=np.asarray(XTest), y=np.asarray(YTest), batch_size=32
)
print(test_evaluate)

[0.39984255973762767, 0.8752936152293849]
[0.40280713445095806, 0.872331340734415]


In [17]:
# Write the current model to "my_model.h5", the file the load_model cell reads.
try_model.save("my_model.h5")

In [18]:
from sklearn.metrics import classification_report
import numpy as np

# Test-set metrics.
Y_test = np.argmax(YTest, axis=1)  # one-hot -> class index
# argmax over the softmax output gives the predicted class index; unlike
# Sequential.predict_classes it is also available in newer Keras releases.
y_pred = np.argmax(try_model.predict(XTest), axis=-1)
print(classification_report(Y_test, y_pred))

             precision    recall  f1-score   support

          0       0.76      0.89      0.82       213
          1       0.98      0.89      0.93        89
          2       0.57      0.76      0.65       194
          3       0.95      0.63      0.76       231
          4       0.95      0.79      0.87       204
          5       0.99      0.92      0.95       194
          6       0.83      0.57      0.67        60
          7       0.78      0.99      0.88       194
          8       0.96      0.93      0.94       191
          9       0.82      0.83      0.83       198
         10       1.00      0.99      1.00       290
         11       0.99      1.00      0.99       284

avg / total       0.89      0.87      0.87      2342



In [19]:
# Train-set metrics.
Y_train = np.argmax(YTrain, axis=1)  # one-hot -> class index
# argmax over the softmax output gives the predicted class index; unlike
# Sequential.predict_classes it is also available in newer Keras releases.
y_pred = np.argmax(try_model.predict(XTrain), axis=-1)
print(classification_report(Y_train, y_pred))

             precision    recall  f1-score   support

          0       0.74      0.87      0.80       787
          1       1.00      0.86      0.92       340
          2       0.59      0.80      0.68       806
          3       0.94      0.68      0.79       769
          4       0.96      0.81      0.88       796
          5       0.99      0.90      0.94       806
          6       0.75      0.66      0.71       314
          7       0.80      0.97      0.88       806
          8       0.98      0.91      0.94       738
          9       0.82      0.80      0.81       802
         10       1.00      0.99      1.00      1202
         11       1.00      0.98      0.99      1200

avg / total       0.89      0.88      0.88      9366



In [35]:
# Testing on other samples: extract the features of a handful of individual
# clips and show an audio player for each one.
import IPython.display as ipd

inference_wavs = [
    'Attempt5-NN/Chimp_inference.wav',
    # 'Attempt5/Chimp_inference2.wav',
    # 'Attempt5/Chimp_inference3.wav',
    'Attempt5-NN/Chimpanzee_Sound_Effect_1.wav',
    'Attempt5-NN/Chimpanzee_Sound_Effect_2.wav',
    # 'Attempt5/Chimpanzee_Sound_Effect_3.wav',
    'Attempt5-NN/6902-2-0-7.wav',
]

inf = []
for wav in inference_wavs:
    inf.append(load_sound_files(wav))
    ipd.display(ipd.Audio(wav))


In [36]:
# Predicted class index per inference clip. argmax over the softmax output
# replaces Sequential.predict_classes, which newer Keras releases no longer
# provide.
inf_pred = np.argmax(try_model.predict(np.array(inf)), axis=-1)
print(inf_pred)

[10 10  3  2]


In [40]:
# Shape of the probability vector for the first inference clip. Computed
# directly from the model so the result does not depend on which cell last
# assigned `inf_pred` (class indices vs. probabilities).
try_model.predict(np.array(inf))[0].shape

(12,)

### Conclusion 
  The 100% precision and recall metrics for the chimp calls raise questions about overfitting. <br>
  The model has overfit and does not generalize well to chimp calls in other input formats.<br>
  In the example above, the model predicted some chimp calls (class 10) as dog_bark (class 3). <br>
  Solution: the dataset needs to be standardized a bit more. <br>
  Adding a few files with different sample rates gave the same overfitting effect. <br>
  Working: appended 334 low-bitrate files and 174 four-second YouTube clips to the chimp files.


In [37]:
# Per-class probabilities for each inference clip (one row per clip); the bare
# name on the last line displays the array. Note that this reuses the name
# `inf_pred`, which held class indices in the earlier cell.
inf_pred = try_model.predict(np.array(inf))
inf_pred

array([[7.35081345e-32, 2.52177976e-30, 1.08846966e-10, 2.17165227e-18,
        1.78919034e-17, 8.43050411e-27, 2.83774567e-24, 8.80573601e-38,
        1.66223567e-15, 1.85351667e-22, 1.00000000e+00, 3.31733337e-38],
       [1.41708453e-16, 5.65970432e-15, 5.03341260e-04, 1.20088515e-07,
        5.91723515e-09, 4.99051247e-14, 2.40630237e-12, 8.41459172e-21,
        8.47796777e-10, 5.01856438e-11, 9.99496460e-01, 1.66141806e-22],
       [7.97311441e-06, 1.12324042e-05, 8.28092620e-02, 3.61038953e-01,
        1.95831731e-02, 5.66532181e-05, 1.50264487e-01, 2.01950024e-06,
        3.13760899e-02, 6.80102559e-04, 3.54169995e-01, 1.06624519e-07],
       [1.96147963e-01, 1.85620561e-02, 2.02612206e-01, 4.65384610e-02,
        1.18795961e-01, 4.96117398e-02, 2.97800452e-02, 1.96525112e-01,
        1.65734421e-02, 1.20367639e-01, 1.71679538e-03, 2.76853121e-03]],
      dtype=float32)

0 = air_conditioner
1 = car_horn
2 = children_playing
3 = dog_bark
4 = drilling
5 = engine_idling
6 = gun_shot
7 = jackhammer
8 = siren
9 = street_music
10 = chimp
11 = background

In [None]:
#Raspberry pi