# Parallel architectures for multi-modal binary classification on Google Audioset
Dorian Cazau, 24-10-2018, Atelier ML, PNBI

## Set up the environment

In [60]:
import os
import warnings

import keras
import numpy as np

warnings.filterwarnings("ignore")
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from keras.models import Sequential
from keras.layers import Flatten, Dense, GRU, Dropout, Merge
from keras.layers.convolutional import Convolution3D
from keras.layers.core import Activation

%matplotlib inline

## Preprocess the data

### Explore the data

Load all the pre computed data

In [61]:
fileForLabels = 'associatedLabels.npy'
labels = np.load(fileForLabels)

fileForMatFeats = 'dataMat.npy'
dataMelspectros = np.load(fileForMatFeats)
dataToStore = dataMelspectros.T.swapaxes(1, 2).reshape(-1, max_len * n_mfccs)

fileForTensorsFeats = 'tensors.npy'
videoFrames = np.load(fileForTensorsFeats)

#### Dataset: Google audioset (2 classes):
 - Cricket
 - Cough

In [62]:
unique, counts = np.unique(labels, return_counts=True)
dict(zip(unique, counts))

{'Cough': 680, 'Cricket': 677}

##### Multi modal dataset with

Audio

In [63]:
import glob
import librosa
from IPython.display import Audio

pathDataDirAudio = '/media/paul/ec1a0ffa-763a-4a39-b960-3b51e45ecf26/data/newunbalanced_segments/audio'
audiofiles = glob.glob(os.path.join(pathDataDirAudio, "*.flac"))
sampleX, samplingRate = librosa.load(audiofiles[1], 22050)

Audio(sampleX, rate=samplingRate)

Video

In [64]:
import io
import base64
from IPython.display import HTML

pathDataDirVideo = '/media/paul/ec1a0ffa-763a-4a39-b960-3b51e45ecf26/data/newunbalanced_segments/video'
videofiles = glob.glob(os.path.join(pathDataDirVideo, "*.mp4"))
video = io.open(videofiles[1], 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''<video alt="test" controls>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))

### Encode labels

In [65]:
le = preprocessing.LabelEncoder()
le.fit(labels)
encodedLabels = le.transform(labels)

### All features were pre computed. Video frames were extracted (only 4 / 1video) with FFMPEG. All frame images were resized to have the same dimensions 

Variables used for MFCCs

In [66]:
nOccur = len(labels)
n_mfccs = 13
max_len = 10

Scale MFCC images and frames

In [67]:
scaler = StandardScaler()
concatMelspectros = scaler.fit_transform(dataToStore)
del dataMelspectros

videoFrames = videoFrames.astype(np.float32) / 255

## Train / Test a network

### Set up cross-validation

Split data in train/test datasets for cross-validation.
Other methods in Python exists (train_test_split, ShufffleSplit, ...) but this one 
preserves the percentage of samples for each class.

In [68]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(concatMelspectros, labels)
numClasses = len(np.unique(labels))

### Metrics

In [69]:
def print_model_metrics(model, x_test, y_test):
    loss, accuracy = model.evaluate(x=x_test, y=y_test)
    print("\n model test loss is " + str(loss) + " accuracy is " + str(accuracy))

    y_softmax = model.predict(x_test)  # this is an n x class matrix of probabilities
    y_hat = y_softmax.argmax(axis=-1)  # this will be the class number.
    test_y = y_test.argmax(axis=-1)  # our test data is also categorical
    print(classification_report(test_y, y_hat))
    print(confusion_matrix(test_y, y_hat))
    print("\n Accuracy score is:" + str(accuracy_score(test_y, y_hat)))


### Train / Test a network

#### Notions

#### Training / Testing

In [70]:
    j = 0
    for train_index, test_index in sss.split(matMFCC, encodedLabels):
        print('\nFold ', j)
        # Split in train / test sets for the ith fold
        X_train_branch1, X_test_branch1 = matMFCC[train_index], matMFCC[test_index]
        X_train_branch2, X_test_branch2 = videoFrames[train_index, :, :, :], videoFrames[test_index, :, :, :]
        y_train, y_test = encodedLabels[train_index], encodedLabels[test_index]

        # Reshape all feature vectors in tensors to feed the network
        x_train = np.reshape(X_train_branch1, (len(X_train_branch1), n_mfccs, max_len))
        x_test = np.reshape(X_test_branch1, (len(X_test_branch1), n_mfccs, max_len))

        # Convert labels to categorical ones ( binary class matrix )
        y_train = to_categorical(y_train)
        y_test = to_categorical(y_test)

        ############################## Build network ##############################
        # First architecture on MFCCs
        branch1 = Sequential()
        branch1.add(GRU(256, input_shape=x_train.shape[1:]))
        branch1.add(Dropout(0.5))

        # Second architecture on video frames
        branch2 = Sequential()
        branch2.add(Convolution3D(filters=5, kernel_size=2,
                                  input_shape=X_train_branch2.shape[1:], activation='relu'))
        branch2.add(Flatten()) # -> flatten to have an array of 1 dimension

        # Merge the output of the two architectures
        model = Sequential()
        model.add(Merge([branch1, branch2], mode='concat'))

        # Decision
        model.add(Dense(numClasses, activation='sigmoid'))
        # Optimizer
        opt = keras.optimizers.SGD(lr=0.5)
        # Generate the network
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=["accuracy"])
        ############################################################################

        # Train model
        model.fit([x_train, X_train_branch2], y_train,
                  batch_size=128,
                  epochs=5,
                  shuffle=True,
                  verbose=2,
                  validation_data=([x_test, X_test_branch2], y_test))
        # Test model of this fold and print loss and accuracy on test set
        print_model_metrics(model, [x_test, X_test_branch2], y_test)
        j += 1


Fold  0
Train on 1085 samples, validate on 272 samples
Epoch 1/5
 - 1s - loss: 1.3090 - acc: 0.5530 - val_loss: 0.6790 - val_acc: 0.5956
Epoch 2/5
 - 0s - loss: 0.6776 - acc: 0.6101 - val_loss: 0.6658 - val_acc: 0.6250
Epoch 3/5
 - 0s - loss: 0.6649 - acc: 0.6083 - val_loss: 0.6788 - val_acc: 0.5919
Epoch 4/5
 - 0s - loss: 0.6626 - acc: 0.6074 - val_loss: 0.6419 - val_acc: 0.6324
Epoch 5/5
 - 0s - loss: 0.6503 - acc: 0.6286 - val_loss: 0.6307 - val_acc: 0.6434

 model test loss is 0.6306544998112846 accuracy is 0.6433823529411765
             precision    recall  f1-score   support

          0       0.63      0.68      0.66       136
          1       0.66      0.60      0.63       136

avg / total       0.64      0.64      0.64       272

[[93 43]
 [54 82]]

 Accuracy score is:0.6433823529411765

Fold  1
Train on 1085 samples, validate on 272 samples
Epoch 1/5
 - 1s - loss: 0.8301 - acc: 0.5152 - val_loss: 0.6811 - val_acc: 0.6029
Epoch 2/5
 - 0s - loss: 0.6778 - acc: 0.6111 - val_l

### Explore the last created network

<img src="model.png" title="Title text" />