In [1]:
import os
import re
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
id2name = {i: name for i, name in enumerate(POSSIBLE_LABELS)}
name2id = {name: i for i, name in id2name.items()}
print(name2id)

{'on': 6, 'no': 1, 'up': 2, 'stop': 8, 'off': 7, 'left': 4, 'yes': 0, 'go': 9, 'right': 5, 'down': 3, 'unknown': 11, 'silence': 10}


In [3]:
def load_data(data_dir):
    
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        validation_files = fin.readlines()
    
    # Just a simple regexp for paths with three groups:
    # prefix, label, user_id
    pattern = re.compile("(.+\/)?(\w+)\/([^_]+)_.+wav")
    valset = set()
    for entry in validation_files:
        r = re.match(pattern, entry)
        if r:
            valset.add(r.group(3))

    possible = set(POSSIBLE_LABELS)
    train, val = [], []
    for entry in all_files:
        r = re.match(pattern, entry)
        if r:
            label, uid = r.group(2), r.group(3)
            if label == '_background_noise_':
                label = 'silence'
            if label not in possible:
                label = 'unknown'

            label_id = name2id[label]

            sample = (label, label_id, uid, entry)
            if uid in valset:
                val.append(sample)
            else:
                train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))
    
    columns_list = ['label', 'label_id', 'user_id', 'wav_file']
    
    train_df = pd.DataFrame(train, columns = columns_list)
    valid_df = pd.DataFrame(val, columns = columns_list)
    
    return train_df, valid_df

In [4]:
train_df, valid_df = load_data('./data/')

There are 57929 train and 6798 val samples


In [5]:
train_df.head()

Unnamed: 0,label,label_id,user_id,wav_file
0,yes,0,8eb4a1bf,./data/train/audio/yes/8eb4a1bf_nohash_3.wav
1,yes,0,9b402bc2,./data/train/audio/yes/9b402bc2_nohash_0.wav
2,yes,0,20174140,./data/train/audio/yes/20174140_nohash_0.wav
3,yes,0,3b8406c0,./data/train/audio/yes/3b8406c0_nohash_1.wav
4,yes,0,9c59dd28,./data/train/audio/yes/9c59dd28_nohash_2.wav


In [6]:
train_df.label.value_counts()

unknown    36818
stop        2134
yes         2116
up          2115
go          2112
right       2111
on          2110
left        2106
no          2105
off         2101
down        2095
silence        6
Name: label, dtype: int64

In [7]:
# Separating silence data
silence_files = train_df[train_df.label == 'silence']
train_df      = train_df[train_df.label != 'silence']

In [8]:
from scipy.io import wavfile
def read_wav_file(fname):
    _, wav = wavfile.read(fname)
    wav = wav.astype(np.float32) / np.iinfo(np.int16).max
    return wav

In [9]:
# Combining the silence data
silence_data = np.concatenate([read_wav_file(x) for x in silence_files.wav_file.values])
len(silence_data)/16000 # in seconds



399.3981875

In [10]:
from scipy.signal import stft

def calc_phase_amp(wav):
    specgram = stft(wav, 16000, nperseg = 400, noverlap = 240, nfft = 512, padded = False, boundary = None)
    phase = np.angle(specgram[2]) / np.pi
    amp = np.log1p(np.abs(specgram[2]))
    return np.concatenate([phase, amp], axis = 1)

def process_wav_file(fname):
    wav = read_wav_file(fname)
    
    L = 16000  # 1 sec
    
    if len(wav) > L:
        i = np.random.randint(0, len(wav) - L) # if length is more than 1s, randomly sample the file
        wav = wav[i:(i+L)]
    elif len(wav) < L: # if length is less than 1s, add silence on both ends to make it 1s
        rem_len = L - len(wav)
        i = np.random.randint(0, len(silence_data) - rem_len)
        silence_part = silence_data[i:(i+L)]
        j = np.random.randint(0, rem_len)
        silence_part_left  = silence_part[0:j]
        silence_part_right = silence_part[j:rem_len]
        wav = np.concatenate([silence_part_left, wav, silence_part_right])
    
    return calc_phase_amp(wav)

In [11]:
import random
import tensorflow as tf
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Conv2D, MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPool2D, concatenate, Dense, Dropout
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.utils import to_categorical

In [12]:
def train_generator(train_batch_size):
    while True:
        this_train = train_df.copy()
        
        # let's say we have about 400 silence files (since we have ~400s of silence data)
        ids = list(range(this_train.shape[0]))
        ids = ids + [None]*400
        
        shuffled_ids = random.sample(ids, len(ids))
        for start in range(0, len(shuffled_ids), train_batch_size):
            x_batch = []
            y_batch = []
            
            end = min(start + train_batch_size, len(shuffled_ids))
            
            i_train_batch = shuffled_ids[start:end]
            
            for i in i_train_batch:
                if i is None:
                    k = np.random.randint(0, len(silence_data) - 16000)
                    this_silence = silence_data[k:(k+16000)]
                    x_batch.append(calc_phase_amp(this_silence))
                    y_batch.append(name2id['silence'])
                else:
                    x_batch.append(process_wav_file(this_train.wav_file.values[i]))
                    y_batch.append(this_train.label_id.values[i])
            
            x_batch = np.array(x_batch)
            y_batch = to_categorical(y_batch, num_classes = len(POSSIBLE_LABELS))
            yield x_batch, y_batch

In [13]:
def valid_generator(val_batch_size):
    while True:
        ids = list(range(valid_df.shape[0]))
        for start in range(0, len(ids), val_batch_size):
            x_batch = []
            y_batch = []
            end = min(start + val_batch_size, len(ids))
            i_val_batch = ids[start:end]
            for i in i_val_batch:
                x_batch.append(process_wav_file(valid_df.wav_file.values[i]))
                y_batch.append(valid_df.label_id.values[i])
            x_batch = np.array(x_batch)
            y_batch = to_categorical(y_batch, num_classes = len(POSSIBLE_LABELS))
            yield x_batch, y_batch

In [14]:
from tensorflow.python.keras.layers import LSTM, Flatten

In [15]:
x_in = Input(shape = (257,196))
x = BatchNormalization()(x_in)
x = LSTM(256, return_sequences=True)(x)
x = LSTM(256, return_sequences=True)(x)
x = LSTM(256, return_sequences=True)(x)
x = LSTM(256)(x)
x = Dense(128, activation = 'relu')(x)
x = Dropout(0.5)(x)
x = Dense(len(POSSIBLE_LABELS), activation = 'softmax')(x)
model = Model(inputs = x_in, outputs = x)

In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 257, 196)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 257, 196)          784       
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 256)         463872    
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
__________

In [17]:
# from keras_tqdm import TQDMNotebookCallback
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [18]:
callbacks = [EarlyStopping(monitor='val_loss',
                           patience=4,
                           verbose=1,
                           min_delta=0.01,
                           mode='min'),
             ReduceLROnPlateau(monitor='val_loss',
                               factor=0.1,
                               patience=2,
                               verbose=1,
                               epsilon=0.01,
                               mode='min'),
             ModelCheckpoint(monitor='val_loss',
                             filepath='weights/03-weights.hdf5',
                             save_best_only=True,
                             save_weights_only=True,
                             mode='min')]

In [19]:
(train_df.shape[0] + 400) / 128

455.6484375

In [20]:
(valid_df.shape[0])/128

53.109375

In [21]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [22]:
history = model.fit_generator(generator=train_generator(128),
                              steps_per_epoch=456,
                              epochs=25,
                              verbose=2,
                              callbacks=callbacks,
                              validation_data=valid_generator(128),
                              validation_steps=54)

Epoch 1/25
993s - loss: 1.5750 - categorical_accuracy: 0.6300 - val_loss: 1.5534 - val_categorical_accuracy: 0.6192
Epoch 2/25
933s - loss: 1.5295 - categorical_accuracy: 0.6334 - val_loss: 1.5357 - val_categorical_accuracy: 0.6200
Epoch 3/25
923s - loss: 1.5277 - categorical_accuracy: 0.6333 - val_loss: 1.5398 - val_categorical_accuracy: 0.6205
Epoch 4/25
824s - loss: 1.5236 - categorical_accuracy: 0.6332 - val_loss: 1.5316 - val_categorical_accuracy: 0.6209
Epoch 5/25
823s - loss: 1.5117 - categorical_accuracy: 0.6328 - val_loss: 1.5035 - val_categorical_accuracy: 0.6209
Epoch 6/25
822s - loss: 1.4790 - categorical_accuracy: 0.6326 - val_loss: 1.4578 - val_categorical_accuracy: 0.6209
Epoch 7/25
823s - loss: 1.4205 - categorical_accuracy: 0.6343 - val_loss: 1.4234 - val_categorical_accuracy: 0.6208
Epoch 8/25
822s - loss: 1.3752 - categorical_accuracy: 0.6348 - val_loss: 1.3284 - val_categorical_accuracy: 0.6224
Epoch 9/25
821s - loss: 1.3081 - categorical_accuracy: 0.6374 - val_loss