In [1]:
import os, json
import numpy as np
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalAveragePooling1D, Dropout
from keras.regularizers import l2
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


Define the model input features and parameters.

In [2]:
N_MELS = 80
N_MFCC = 40
FEATURES_TYPE = 'log-mel-spectrogram'
FEATURES_SPEED = 1
INPUT_FRAMES = 297 # number of features rows (or frames) for test audios of 3s or 48000 samples
HOP_FRAMES_TRAIN_SPLIT = 100 # or INPUT_FRAMES
FEATURES_NORMALIZED = False
DROPOUT = 0
REGULARIZATION = 0

Define the directories to load extracted features of train and test audio files as well the model name.

In [3]:
model_id = 'CNN_'+FEATURES_TYPE
dir_train = '../data/features_extraction/train/'+FEATURES_TYPE
dir_test = '../data/features_extraction/test/'+FEATURES_TYPE
if 'mfcc' in FEATURES_TYPE:
    dir_train += '_'+str(N_MFCC)
    dir_test += '_'+str(N_MFCC)
    model_id += '_'+str(N_MFCC)
elif 'mel' in FEATURES_TYPE:
    dir_train += '_'+str(N_MELS)
    dir_test += '_'+str(N_MELS)
    model_id += '_'+str(N_MELS)
dir_train += ('' if FEATURES_SPEED==1 else '_speed'+str(FEATURES_SPEED))+'/'
dir_test += ('' if FEATURES_SPEED==1 else '_speed'+str(FEATURES_SPEED))+'/'
model_id += ('' if FEATURES_SPEED==1 else '_speed'+str(FEATURES_SPEED))
model_id += ('' if HOP_FRAMES_TRAIN_SPLIT!=INPUT_FRAMES else '_NoHopTrainSplit3s')
model_id += ('' if not FEATURES_NORMALIZED else '_features_normalized')
model_id += ('' if DROPOUT==0 else '_dropout'+str(DROPOUT))
model_id += ('' if REGULARIZATION==0 else '_regularization'+str(REGULARIZATION))

Load train features and targets (into list of arrays for Keras fit generator).

*Note:* I have to split the features of training audio files with different durations into splits of the same shape than the features of test audio files of exactly 3 seconds or 48000 samples at 16kHz of frequency. Since the features extraction configuration was *center=False*, *hop_length=160* and *n_fft=512*, for test audio files the features has INPUT_FRAMES=297 rows equivalent to 1+(48000-512)//160=297. Now I will use moving windows (number of frames between consecutive splits) of HOP_FRAMES_TRAIN_SPLIT=100 frames equivalent to 100*160=16000 samples or 1 second in order to: consider the time context of each audio, data augmentation and not discard many remaining frames after the last possible complete split.

In [4]:
train_labels = np.loadtxt('../data/train_labels.txt', dtype=str)

In [5]:
x_train_list, y_train_list = [], []
for filename in os.listdir(dir_train):
    features = np.load(dir_train+filename)
    for i in range(0, features.shape[0]-INPUT_FRAMES, HOP_FRAMES_TRAIN_SPLIT):
        features_split = features[i:(i+INPUT_FRAMES),:]
        if FEATURES_NORMALIZED:
            means = np.mean(features_split, axis=0)
            stds = np.std(features_split, axis=0)
            features_split = (features_split-means)/stds
        x_train_list.append(features_split)
        index = np.where(train_labels[:,0]==filename.replace('.npy', '.wav'))[0][0]
        y_train_list.append(train_labels[index,1]) 

In [6]:
len(x_train_list), len(y_train_list), type(x_train_list[0]), x_train_list[0].shape

(95583, 95583, numpy.ndarray, (297, 80))

Initialize model architecture.

In [7]:
INPUT_FRAMES_FEATURES = x_train_list[0].shape[1]
N_OUTPUTS = 6
model = Sequential()
model.add(Conv1D(500, 6, strides=1, activation='relu', input_shape=(INPUT_FRAMES, INPUT_FRAMES_FEATURES)))
model.add(Conv1D(500, 7, strides=2, activation='relu'))
model.add(Conv1D(500, 1, strides=1, activation='relu'))
model.add(Conv1D(3000, 1, strides=1, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(1500, activation='relu', kernel_regularizer=l2(REGULARIZATION)))
if DROPOUT>0:
    model.add(Dropout(rate=DROPOUT))
model.add(Dense(600, activation='relu', kernel_regularizer=l2(REGULARIZATION)))
model.add(Dense(N_OUTPUTS, activation='softmax', kernel_regularizer=l2(REGULARIZATION)))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 292, 500)          240500    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 143, 500)          1750500   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 143, 500)          250500    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 143, 3000)         1503000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 3000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1500)              4501500   
_________________________________________________________________
dense_2 (Dense)              (None, 600)               900600    
__________

Load test features and targets (into arrays for Keras model predict).

In [8]:
test_labels = np.loadtxt('../Data/test_labels.txt', dtype=str)

In [9]:
x_test, y_test = [], []
for filename in os.listdir(dir_test):
    features = np.load(dir_test+filename)
    if FEATURES_NORMALIZED:
        means = np.mean(features, axis=0)
        stds = np.std(features, axis=0)
        features = (features-means)/stds
    x_test.append(features)
    index = np.where(test_labels[:,0]==filename.replace('.npy', '.wav'))[0][0]
    y_test.append(test_labels[index,1])

In [10]:
x_test, y_test = np.array(x_test), np.array(y_test)

In [11]:
x_test.shape, y_test.shape

((3130, 297, 80), (3130,))

Convert test targets to *keras.utils.to_categorical* format using this languages mapping.

In [12]:
languages = {'estonian':0,'farsi':1,'german':2,'kabyle':3,'mandarin':4,'spanish':5}

In [13]:
y_test_dummies = to_categorical([languages[y] for y in y_test], N_OUTPUTS)

In [14]:
y_test_dummies.shape

(3130, 6)

Define the *generator* function to load the list of training samples features by batches.

In [15]:
seed = 1
def generator(x_train_list, y_train_list, batch_size):
    global seed
    np.random.seed(seed)
    indexes = np.random.choice(len(x_train_list), len(x_train_list), replace=False)
    while True:
        for i in range(0, len(x_train_list), batch_size):
            batch_indexes = indexes[i:(i+batch_size)]
            batch_features = np.array([x_train_list[j] for j in batch_indexes])
            batch_labels = to_categorical([languages[y_train_list[j]] for j in batch_indexes], N_OUTPUTS)
            yield batch_features, batch_labels
    seed += 1

Train the model.

In [None]:
callbacks = [EarlyStopping(monitor='loss', min_delta=0.01, patience=3),
             EarlyStopping(monitor='acc', min_delta=0.01, patience=3),
             EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5),
             EarlyStopping(monitor='val_acc', min_delta=0.01, patience=5),
             ModelCheckpoint('outputs/'+model_id+'.h5', monitor='val_loss', save_best_only=True)
            ]
opt = Adam(amsgrad=True, lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
EPOCHS = 15
BATCH_SIZE = 40
SAMPLES_PER_EPOCH = len(range(0, len(x_train_list), BATCH_SIZE))
history = model.fit_generator(generator = generator(x_train_list, y_train_list, BATCH_SIZE),
                              samples_per_epoch = SAMPLES_PER_EPOCH,                             
                              nb_epoch = EPOCHS,
                              callbacks = callbacks,
                              validation_data = (x_test, y_test_dummies),
                              verbose=1)

Save the training history.

In [None]:
json.dump(history.history, open('outputs/'+model_id+'.json', 'w'))