In [34]:
from keras.datasets import mnist
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop

In [43]:
# Training settings shared by the cells below.
epochs = 20        # number of passes requested from fit_generator
num_classes = 10   # size of the softmax output layer
batch_size = 32    # samples per batch yielded by my_generator

In [44]:
def my_generator():
    """Endlessly yield ``(data_batch, label_batch)`` tuples of MNIST training data.

    The training images are flattened to one vector per image, cast to
    float32 and divided by 255; the labels are one-hot encoded.  The
    notebook-level ``batch_size`` and ``num_classes`` settings are read from
    the enclosing namespace.

    The number of batches per pass is derived from the data length, so the
    generator stays correct when ``batch_size`` changes.  When the sample
    count is not a multiple of ``batch_size`` the final batch of each pass is
    smaller.

    Yields:
        tuple: ``(data_batch, label_batch)`` slices of the preprocessed
        training images and their one-hot labels.
    """
    # TODO: load the wav data from the audio file directory instead.
    # All audio files of the base directory used for augmentation could
    # probably be kept in memory.
    (X_train, y_train), _ = mnist.load_data()

    # Flatten every image to a single vector and rescale the pixel values.
    X_train = X_train.reshape(len(X_train), -1).astype('float32')
    X_train /= 255
    y_train = np_utils.to_categorical(y_train, num_classes)

    n_samples = len(X_train)

    # Yield (data, label) tuples one batch at a time, forever.
    while True:
        for start in range(0, n_samples, batch_size):
            data_batch = X_train[start:start + batch_size]
            label_batch = y_train[start:start + batch_size]

            # TODO: audio augmentation (e.g. adding noise) + feature extraction goes here

            yield data_batch, label_batch

In [45]:
# Create the MNIST batch generator; batches are produced on demand.
gen = my_generator()

In [46]:
# Display the generator object itself (no batch is consumed by this).
gen

<generator object my_generator at 0x13b924620>

In [47]:
# Pull one batch; use the built-in next() rather than calling the dunder directly.
images, labels = next(gen)

In [48]:
# Shape of one data batch: (batch size, flattened image length).
images.shape

(32, 784)

In [49]:
# Shape of one label batch: (batch size, number of one-hot classes).
labels.shape

(32, 10)

In [51]:
# Fully connected classifier: two 512-unit ReLU layers, each followed by
# dropout, and a softmax output with one unit per class.
model = Sequential([
    Dense(512, activation='relu', input_shape=(784,)),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

In [53]:
# Configure training: RMSprop on categorical cross-entropy, reporting accuracy.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [54]:
# steps_per_epoch: number of batches drawn from the generator before one epoch
# is declared finished and the next one starts.  Typically this is the number
# of unique samples (60000 training images here) divided by the batch size,
# rounded up so a smaller final batch is still counted.
model.fit_generator(my_generator(),
                    steps_per_epoch=(60000 + batch_size - 1) // batch_size,
                    epochs=epochs, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x10611aef0>

## 音声用のGenerator

In [55]:
import os
import glob
import numpy as np
import librosa

In [88]:
def pad(y, length):
    """Return a new array of exactly ``length`` samples taken from ``y``.

    Shorter inputs are followed by zeros; longer inputs are cut off after
    ``length`` samples.  The result is created with ``np.zeros`` and therefore
    uses its default dtype.
    """
    padded = np.zeros(length)
    n_copy = min(len(y), length)
    padded[:n_copy] = y[:n_copy]
    return padded

def extract_melspectrogram(wave_batch, sr, length, n_fft, hop_length, n_mels):
    """Compute a dB-scaled mel spectrogram for every waveform in a batch.

    Each waveform is passed to ``librosa.feature.melspectrogram`` and the
    result is converted with ``librosa.power_to_db`` using that spectrogram's
    own maximum as the reference.  ``length`` is accepted but not used.

    Returns:
        numpy.ndarray: the per-waveform spectrograms stacked along the first axis.
    """
    def to_db_melgram(wave):
        power = librosa.feature.melspectrogram(y=wave, sr=sr, n_fft=n_fft,
                                               hop_length=hop_length, n_mels=n_mels)
        return librosa.power_to_db(power, ref=np.max)

    return np.array([to_db_melgram(wave) for wave in wave_batch])

In [89]:
def sound_generator(sound_dir, batch_size=10, sr=16000, length=3,
                    n_fft=512, hop_length=256, n_mels=128, snr=None):
    """Endlessly yield ``(data_batch, label_batch)`` tuples built from WAV files.

    Every ``*.wav`` file directly inside ``sound_dir`` is loaded once, fitted
    to ``sr * length`` samples with ``pad`` and kept in memory.  The clips are
    then served in consecutive batches, forever.

    The label of a clip is parsed from its file name: the first two characters
    of the last ``_``-separated token of the name (up to the first ``.``) are
    read as an integer, and 1 is subtracted.

    Args:
        sound_dir: Directory containing the ``*.wav`` files.
        batch_size: Clips per batch.  The last batch of a pass is smaller when
            the number of clips is not a multiple of ``batch_size``.
        sr: Sampling rate handed to ``librosa.load``.
        length: Clip length in seconds; clips are fitted to ``sr * length`` samples.
        n_fft: Forwarded to ``extract_melspectrogram``.
        hop_length: Forwarded to ``extract_melspectrogram``.
        n_mels: Forwarded to ``extract_melspectrogram``.
        snr: When not ``None``, each batch is passed through
            ``add_noise(wave_batch, snr)`` before feature extraction.  The
            default ``None`` disables augmentation.

    Yields:
        tuple: ``(data_batch, label_batch)`` with the mel-spectrogram features
        of the batch and the matching int32 labels.

    Raises:
        ValueError: If ``sound_dir`` contains no ``*.wav`` file.
    """
    wave_data = []
    labels = []

    # Load all data and labels from the directory.  Paths are sorted so the
    # batch order does not depend on the file system's listing order.
    for fpath in sorted(glob.glob(os.path.join(sound_dir, '*.wav'))):
        fname = os.path.basename(fpath)

        # Load the sound.
        # If memory becomes tight the I/O could be moved into the while loop,
        # but about 80k files take roughly 4 GB, so loading everything up
        # front is likely more efficient.
        y, sr = librosa.load(fpath, sr=sr)
        y = pad(y, int(sr * length))
        wave_data.append(y)

        # Extract the label from the file name.
        label = int(fname.split('.')[0].split('_')[-1][:2]) - 1
        labels.append(label)

    # Without this guard the endless loop below would never yield anything.
    if not wave_data:
        raise ValueError('no .wav files found in {!r}'.format(sound_dir))

    wave_data = np.array(wave_data, dtype=np.float32)
    labels = np.array(labels, dtype=np.int32)

    print(wave_data.dtype, wave_data.shape)
    print(labels.dtype, labels.shape)

    n_samples = len(wave_data)

    while True:
        # Walk over the whole data set one batch at a time.
        for i, start in enumerate(range(0, n_samples, batch_size)):
            wave_batch = wave_data[start:start + batch_size]
            label_batch = labels[start:start + batch_size]

            print(i, wave_batch.shape, label_batch.shape)

            # Optional augmentation (noise injection).
            # NOTE(review): add_noise is not defined in the visible part of
            # this notebook; it must exist in the kernel when snr is given.
            if snr is not None:
                wave_batch = add_noise(wave_batch, snr)

            # Extract the features for the whole batch at once.
            data_batch = extract_melspectrogram(wave_batch, sr, length, n_fft, hop_length, n_mels)

            yield data_batch, label_batch

In [90]:
# Create the audio batch generator for the sample directory
# (this rebinds the name `gen` used for the MNIST generator earlier).
gen = sound_generator('../../Projects/speech-emotion-recognition/data/sounds/sample/')

In [91]:
# Pull the first (features, labels) batch; use the built-in next() rather than the dunder.
next(gen)

float32 (100, 48000)
int32 (100,)
0 (10, 48000) (10,)


(array([[[-32.30982954, -34.984229  , -27.52332604, ..., -80.        ,
          -80.        , -80.        ],
         [-28.24036922, -33.31718039, -27.09672633, ..., -80.        ,
          -80.        , -80.        ],
         [-26.06931372, -32.02468645, -26.63682338, ..., -80.        ,
          -80.        , -80.        ],
         ..., 
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ],
         [-80.        , -80.        , -80.        , ..., -80.        ,
          -80.        , -80.        ]],
 
        [[-57.16541029, -75.87655468, -75.61125944, ..., -80.        ,
          -80.        , -80.        ],
         [-57.04721801, -76.0775156 , -62.90697491, ..., -80.        ,
          -80.        , -80.        ],
         [-56.86804705, -76.23355357, -59.8938694 , ..., -80.        ,
          -80.        , -80.   