# Loading audio dataset

In [346]:
import numpy as np
import librosa
import os
from keras.utils import to_categorical

max_pad_len, mfcc_features = 50, 13
def wav2mfcc(file_path, max_pad_len=50):
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    wave = wave[::3]
    mfcc = librosa.feature.mfcc(wave, sr=8000, n_mfcc=mfcc_features, hop_length=160, n_fft=800)
    mfcc = mfcc.T
    pad_width = max_pad_len - mfcc.shape[0]
    mfcc = np.pad(mfcc, pad_width=((0, pad_width), (0, 0)), mode='constant')
    #print(mfcc.shape)
    return mfcc

In [348]:
mfccs, labels = [], []
recordings_path = "sound-mnist/recordings/"
for f in os.listdir(recordings_path):
    mfccs.append(wav2mfcc(recordings_path + f))
    label = f.split('_')[0]
    labels.append(label)
mfccs = librosa.util.normalize(np.asarray(mfccs))
mfccs.shape

(1500, 50, 13)

# Loading Image Dataset

In [349]:
from keras.datasets import mnist
img_width, img_height = 28, 28
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preparing Training and Testing Data

In [350]:
np.random.seed(42)
X_train_image, X_train_sound, Digits, Y_train = [], [], [], []
X_test_image, X_test_sound, Y_test = [], [], []
check_ind = set()
train_set = 20000
test_set = 2000

#Training Data
t, f = 0, 0
while t < 11589 or f < 8411:
    sound_ind = np.random.randint(mfccs.shape[0])
    image_ind = np.random.randint(x_train.shape[0])
    if (sound_ind, image_ind) in check_ind:
        continue
    if int(labels[sound_ind]) == int(y_train[image_ind]) and t < 11589: #Chose a random number of True Cases for training
        Y_train.append(int(labels[sound_ind]) == int(y_train[image_ind]))
        X_train_image.append(x_train[image_ind])
        X_train_sound.append(mfccs[sound_ind])
        Digit.append(int(labels[sound_ind]))
        check_ind.add((sound_ind, image_ind))
        t+=1
    if int(labels[sound_ind]) != int(y_train[image_ind]) and f < 8411:
        Y_train.append(int(labels[sound_ind]) == int(y_train[image_ind]))
        X_train_image.append(x_train[image_ind])
        X_train_sound.append(mfccs[sound_ind])
        Digit.append(int(labels[sound_ind]))
        check_ind.add((sound_ind, image_ind))
        f+=1                               

#Testing Data
t, f = 0, 0
while t < 562 or f < 1438:
    sound_ind = np.random.randint(mfccs.shape[0])
    image_ind = np.random.randint(x_train.shape[0])
    if (sound_ind, image_ind) in check_ind:
        continue
    if int(labels[sound_ind]) == int(y_train[image_ind]) and t < 562:   #Chose a random number of True cases for testing
        Y_test.append(int(labels[sound_ind]) == int(y_train[image_ind]))
        X_test_image.append(x_train[image_ind])
        X_test_sound.append(mfccs[sound_ind])
        check_ind.add((sound_ind, image_ind))
        t+=1
    if int(labels[sound_ind]) != int(y_train[image_ind]) and f < 1438:
        Y_test.append(int(labels[sound_ind]) == int(y_train[image_ind]))
        X_test_image.append(x_train[image_ind])
        X_test_sound.append(mfccs[sound_ind])
        check_ind.add((sound_ind, image_ind))
        f+=1
    

In [351]:
Y_train.count(True), Y_test.count(True)

(11589, 562)

In [352]:
X_train_image = np.asarray(X_train_image).reshape(train_set, img_width, img_height, 1) / 255
X_train_sound = np.asarray(X_train_sound).reshape(train_set, max_pad_len, mfcc_features, 1)
Y_train = to_categorical(np.asarray(Y_train))

X_test_image = np.asarray(X_test_image).reshape(test_set, img_width, img_height, 1) / 255
X_test_sound = np.asarray(X_test_sound).reshape(test_set, max_pad_len, mfcc_features, 1)
Y_test = to_categorical(np.asarray(Y_test))

# Model

In [353]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, concatenate, Add
from keras import backend as K

In [359]:
batch_size = 128
input_shape = (28, 28, 1)

image_model = Sequential()
image_model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
image_model.add(Conv2D(64, (3, 3), activation='relu'))
image_model.add(MaxPooling2D(pool_size=(2, 2)))
image_model.add(Dropout(0.25))
image_model.add(Flatten())
image_model.add(Dense(128, activation='relu'))
image_model.add(Dropout(0.5))
#image_model.compile(loss=keras.losses.categorical_crossentropy,
#              optimizer=keras.optimizers.Adadelta(),
#              metrics=['accuracy'])

In [360]:
sound_model = Sequential()
input_shape2 = (50, 13, 1)
sound_model.add(Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=input_shape2))
sound_model.add(BatchNormalization())

sound_model.add(Conv2D(48, kernel_size=(2, 2), activation='relu'))
sound_model.add(BatchNormalization())

sound_model.add(Conv2D(120, kernel_size=(2, 2), activation='relu'))
sound_model.add(BatchNormalization())

sound_model.add(MaxPooling2D(pool_size=(2, 2)))
sound_model.add(Dropout(0.25))

sound_model.add(Flatten())

sound_model.add(Dense(256, activation='relu'))
sound_model.add(BatchNormalization())
sound_model.add(Dropout(0.25))
sound_model.add(Dense(128, activation='relu'))
sound_model.add(BatchNormalization())
sound_model.add(Dropout(0.4))
#sound_model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['accuracy'])

In [361]:
model = Add()([image_model.output, sound_model.output])
model = Dense(128, activation='relu')(model)
model = Dropout(.35)(model)

model = Dense(2, activation='softmax')(model)

final_model = Model([image_model.input, sound_model.input], model)
from keras.optimizers import SGD
opt = SGD(lr=0.001)
final_model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.adadelta(), metrics=['accuracy'])

In [362]:
final_model.fit([X_train_image, X_train_sound], Y_train, epochs=20, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x28cdb92eb00>

In [363]:
final_model.evaluate([X_test_image, X_test_sound], Y_test)



[0.07816050169244408, 0.972]

In [364]:
final_model.save("C:/Users/ankubhat/Desktop/codes/Notebooks/mymodel.h5")