In [1]:
import tensorflow
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

#TensorFlowがGPUを認識しているか確認
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

2022-05-01 08:38:28.214460: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 08:38:28.692540: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /device:GPU:0 with 22319 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:02:00.0, compute capability: 8.6


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 7274336705582571117,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 23403757568
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 9021967592445404728
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:02:00.0, compute capability: 8.6"]

# モデル構造の定義
<img src="images/CRNN_SED_DCASE2017_task3.jpg">


In [None]:
from __future__ import print_function

import numpy as np
import time
import sys
import matplotlib.pyplot as plot

from tensorflow.keras.layers import Bidirectional, TimeDistributed, Conv2D, MaxPooling2D, Input, GRU, Dense, Activation, Dropout, Reshape, Permute, LSTM
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import confusion_matrix
import metrics
import utils
from IPython import embed
import keras.backend as K

from tensorflow.keras.utils import plot_model

import tensorflow.experimental.numpy as tnp

plot.switch_backend('agg')
sys.setrecursionlimit(10000)

def load_data(_feat_folder, _mono, _fold=None):
    
    # Load name
    feat_file_fold = os.path.join(_feat_folder, 'mbe123_mon_snr_0-28_step2_fold{}.npz'.format(_fold))
        
    dmp = np.load(feat_file_fold)
    _X_train, _Y_train, _X_test, _Y_test = dmp['arr_0'],  dmp['arr_1'],  dmp['arr_2'],  dmp['arr_3']
    return _X_train, _Y_train, _X_test, _Y_test

mel_filt = 40

def get_model(data_in, data_out, _cnn_nb_filt, _cnn_pool_size, _rnn_nb, _fc_nb):
    
    # input_shape (ch, time, mel)
    spec_start = Input(shape=(data_in.shape[-3], data_in.shape[-2], data_in.shape[-1])) #default

    spec_x = spec_start
    for _i, _cnt in enumerate(_cnn_pool_size):
        spec_x = Conv2D(filters=_cnn_nb_filt, kernel_size=(3, 3), padding='same', data_format="channels_last")(spec_x)
        spec_x = BatchNormalization(axis=3)(spec_x)
        spec_x = Activation('relu')(spec_x)
        spec_x = MaxPooling2D(pool_size=(_cnn_pool_size[_i], 1))(spec_x) #cnn_pool_size [5, 2, 2]
        spec_x = Dropout(dropout_rate)(spec_x)
    spec_x = Permute((1, 3, 2))(spec_x)
    spec_x = Reshape((data_in.shape[-3], -1))(spec_x) #[-2]:time

    for _r in _rnn_nb:
        spec_x = Bidirectional(
            GRU(_r, activation='tanh', dropout=dropout_rate, return_sequences=True),
            merge_mode='mul')(spec_x)
 
    for _f in _fc_nb:
        spec_x = TimeDistributed(Dense(_f))(spec_x)
        spec_x = Dropout(dropout_rate)(spec_x)

    spec_x = TimeDistributed(Dense(data_out.shape[-1]))(spec_x)
    out = Activation('sigmoid', name='strong_out')(spec_x)

    _model = Model(inputs=spec_start, outputs=out)
    _model.compile(optimizer="Adam", loss='binary_crossentropy')
    _model.summary()
    
    return _model


def plot_functions(_nb_epoch, _tr_loss, _val_loss, _f1, _er, extension=''):
    plot.figure()

    plot.subplot(211)
    plot.plot(range(_nb_epoch), _tr_loss, label='train loss')
    plot.plot(range(_nb_epoch), _val_loss, label='val loss')
    plot.legend()
    plot.grid(True)

    plot.subplot(212)
    plot.plot(range(_nb_epoch), _f1, label='f')
    plot.plot(range(_nb_epoch), _er, label='er')
    plot.legend()
    plot.grid(True)

    plot.savefig(__models_dir + __fig_name + extension)
    plot.close()
    print('figure name : {}'.format(__fig_name))


def preprocess_data(_X, _Y, _X_test, _Y_test, _seq_len, _nb_ch):
    # split into sequences
    _X = utils.split_in_seqs(_X, _seq_len)
    _Y = utils.split_in_seqs(_Y, _seq_len)

    _X_test = utils.split_in_seqs(_X_test, _seq_len)
    _Y_test = utils.split_in_seqs(_Y_test, _seq_len)

    _X = utils.split_multi_channels(_X, _nb_ch)
    _X_test = utils.split_multi_channels(_X_test, _nb_ch)
    return _X, _Y, _X_test, _Y_test

In [5]:
#######################################################################################
# MAIN SCRIPT STARTS HERE
#######################################################################################

is_mono = True  # True: mono-channel input, False: binaural input

feat_folder = 'feat/'


nb_ch = 1 if is_mono else 2
batch_size = 480 #560    # 576でOOM # Decrease this if you want to run on smaller GPU's
seq_len = 256       # Frame sequence length. Input to the CRNN.
nb_epoch = 150      # Training epochs
patience = int(0.25 * nb_epoch)  # Patience for early stopping

STEP = 10 #generatorで何倍に増量するか


# Number of frames in 1 second, required to calculate F and ER for 1 sec segments.
# Make sure the nfft and sr are the same as in feature.py
sr = 48000
nfft = 1024
frames_1_sec = int(sr/(nfft/2.0))

# Folder for saving model and training curves
__models_dir = 'models/'
utils.create_folder(__models_dir)

# CRNN model definition
cnn_nb_filt = 128            # CNN filter size
cnn_pool_size = [5, 2, 2]   # Maxpooling across frequency. Length of cnn_pool_size =  number of CNN layers
rnn_nb = [32, 32]           # Number of RNN nodes.  Length of rnn_nb =  number of RNN layers
fc_nb = [32]                # Number of FC nodes.  Length of fc_nb =  number of FC layers
dropout_rate = 0.5          # Dropout after each layer

__fig_name = f'mbf{mel_filt}_cnn-f{cnn_nb_filt}-{cnn_pool_size}_bigru{rnn_nb}_fc{fc_nb}_spec{STEP}_e{nb_epoch}p{patience}_batch{batch_size}_{time.strftime("%Y_%m_%d_%H_%M_%S")}'

print('\n\nUNIQUE ID: {}'.format(__fig_name))
print('TRAINING PARAMETERS: nb_ch: {}, seq_len: {}, batch_size: {}, nb_epoch: {}, frames_1_sec: {}'.format(nb_ch, seq_len, batch_size, nb_epoch, frames_1_sec))
print('MODEL PARAMETERS:\n cnn_nb_filt: {}, cnn_pool_size: {}, rnn_nb: {}, fc_nb: {}, dropout_rate: {}'.format(cnn_nb_filt, cnn_pool_size, rnn_nb, fc_nb, dropout_rate))



UNIQUE ID: mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
TRAINING PARAMETERS: nb_ch: 1, seq_len: 256, batch_size: 480, nb_epoch: 150, frames_1_sec: 93
MODEL PARAMETERS:
 cnn_nb_filt: 128, cnn_pool_size: [5, 2, 2], rnn_nb: [32, 32], fc_nb: [32], dropout_rate: 0.5


In [None]:
X, Y, X_test, Y_test = load_data(feat_folder, is_mono, 1)
print("load.data_X_test:",X_test.shape)
X, Y, X_test, Y_test = preprocess_data(X, Y, X_test, Y_test, seq_len, nb_ch)

print(Y.shape)
print(X.shape)
print(Y.shape)
print(X_test.shape)
print(Y_test.shape)

load.data_X_test: (506265, 40)
(8490, 256, 7)
(8490, 1, 256, 40)
(8490, 256, 7)
(1977, 1, 256, 40)
(1977, 256, 7)


## SpecAugmentのジェネレータ
* ランダムに周波数方向と時間方向にマスクして水増しすることで汎化性能を高める事を目的とする
* 引用論文: https://arxiv.org/abs/1904.08779

In [None]:
import random

random.seed(42)

# class data generator
class SpecaugmentGenerator():
    def __init__(self, x_train, y_train, batch_size=16, alpha=0.2, shuffle=True):
        self.x_train = x_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.alpha = alpha
        self.shuffle = shuffle
        self.sample_num = len(x_train)

    def __call__(self):
        while True:
            indexes = self.__get_exploration_order()
            itr_num = int(len(indexes) // (self.batch_size * 2))

            for i in range(itr_num):
                batch_ids = indexes[i * self.batch_size * 2:(i + 1) * self.batch_size * 2]
                x, y = self.__data_generation(batch_ids)

                yield x, y

    def __get_exploration_order(self):
        indexes = np.arange(self.sample_num)

        if self.shuffle:
            np.random.shuffle(indexes)

        return indexes

    def __data_generation(self, batch_ids):
        
        x1 = self.x_train[batch_ids[:self.batch_size]]
        y = self.y_train[batch_ids[:self.batch_size]]

        for j, _ in enumerate(x1): # shape(batch, time, freq, ch) = x1.shape: (64, 256, 40, 1)
            # 時間軸のマスク
            k = random.randint(0, 50) # max time mask1 width
            l = random.randint(10, 200) # time mask1 start
            x1[j, l:l+k, :, :] = 0
            
            # 周波数軸のマスク
            m = random.randint(1, 8) # max freq mask width
            n = random.randint(5, 31)  # freq mask start
            x1[j, :, n:n+m, :] = 0

        x = x1

        return x, y

In [None]:
avg_er = list()
avg_f1 = list()


for fold in [1, 2, 3, 4, 5]:
    print('\n\n----------------------------------------------')
    print('FOLD: {}'.format(fold))
    print('----------------------------------------------\n')
    
    K.clear_session() #モデルを初期化してメモリーリセット
    
    # Load feature and labels, pre-process it
    X, Y, X_test, Y_test = load_data(feat_folder, is_mono, fold)
    print("load.data_X_test:",X_test.shape)
    X, Y, X_test, Y_test = preprocess_data(X, Y, X_test, Y_test, seq_len, nb_ch)
    X=X.transpose(0,2,3,1) #(time, mel, ch)
    X_test=X_test.transpose(0,2,3,1) #(time, mel, ch)
    print(X.shape)
    print(X_test.shape)
    print(Y.shape)
    print(Y_test.shape)

    # Load model
    model = get_model(X, Y, cnn_nb_filt, cnn_pool_size, rnn_nb, fc_nb)

    # Training
    best_epoch, pat_cnt, best_er, f1_for_best_er, best_conf_mat = 0, 0, 99999, None, None
    tr_loss, val_loss, f1_overall_1sec_list, er_overall_1sec_list = [0] * nb_epoch, [0] * nb_epoch, [0] * nb_epoch, [0] * nb_epoch
    posterior_thresh = 0.5
    
    print(Y.shape)
    training_generator = SpecaugmentGenerator(X, Y, batch_size=batch_size)()
    for i in range(nb_epoch):
        print('Epoch : {} '.format(i), end='')

        hist = model.fit(
                        x=training_generator,
                        steps_per_epoch=X.shape[0] // batch_size * STEP,
                        validation_data=(X_test, Y_test),
                        epochs=1,
                        # validation_steps=X_valid.shape[0] // BATCH_SIZE,
                        validation_steps=None,
                        verbose=1,
                        shuffle=True)

        val_loss[i] = hist.history.get('val_loss')[-1]
        tr_loss[i] = hist.history.get('loss')[-1]

        pred = model.predict(X_test)
        pred_thresh = pred > posterior_thresh
        score_list = metrics.compute_scores(pred_thresh, Y_test, frames_in_1_sec=frames_1_sec)

        f1_overall_1sec_list[i] = score_list['f1_overall_1sec']
        er_overall_1sec_list[i] = score_list['er_overall_1sec']
        pat_cnt = pat_cnt + 1

        # Calculate confusion matrix
        test_pred_cnt = np.sum(pred_thresh, 2)
        Y_test_cnt = np.sum(Y_test, 2)
        conf_mat = confusion_matrix(Y_test_cnt.reshape(-1), test_pred_cnt.reshape(-1))
        conf_mat = conf_mat / (utils.eps + np.sum(conf_mat, 1)[:, None].astype('float'))

        if er_overall_1sec_list[i] < best_er:
            best_conf_mat = conf_mat
            best_er = er_overall_1sec_list[i]
            f1_for_best_er = f1_overall_1sec_list[i]
            model.save(os.path.join(__models_dir, '{}_fold{}_model.h5'.format(__fig_name, fold)))
            best_epoch = i
            pat_cnt = 0

        print('Train_loss: {}, Val_loss : {}, F1_overall : {}, loss_overall: {} Best_loss: {}, best_epoch: {}'.format(
                tr_loss[i], val_loss[i], f1_overall_1sec_list[i], er_overall_1sec_list[i], best_er, best_epoch))
        plot_functions(nb_epoch, tr_loss, val_loss, f1_overall_1sec_list, er_overall_1sec_list, '_fold_{}'.format(fold))

        # Early stopping
        if pat_cnt > patience:
            break
            
    avg_er.append(best_er)
    avg_f1.append(f1_for_best_er)
    print('saved model for the best_epoch: {} with best_f1: {} f1_for_best_er: {}'.format(
        best_epoch, best_er, f1_for_best_er))
    print('best_conf_mat: {}'.format(best_conf_mat))
    print('best_conf_mat_diag: {}'.format(np.diag(best_conf_mat)))

print('\n\nMETRICS FOR ALL FOUR FOLDS: avg_er: {}, avg_f1: {}'.format(avg_er, avg_f1))
print('MODEL AVERAGE OVER FOUR FOLDS: avg_er: {}, avg_f1: {}'.format(np.mean(avg_er), np.mean(avg_f1)))



----------------------------------------------
FOLD: 1
----------------------------------------------

load.data_X_test: (506265, 40)
(8490, 256, 40, 1)
(1977, 256, 40, 1)
(8490, 256, 7)
(1977, 256, 7)


2022-05-01 08:39:02.515082: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22319 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:02:00.0, compute capability: 8.6


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 256, 40, 1)]      0         
_________________________________________________________________
conv2d (Conv2D)              (None, 256, 40, 128)      1280      
_________________________________________________________________
batch_normalization (BatchNo (None, 256, 40, 128)      512       
_________________________________________________________________
activation (Activation)      (None, 256, 40, 128)      0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 51, 40, 128)       0         
_________________________________________________________________
dropout (Dropout)            (None, 51, 40, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 51, 40, 128)       147584

2022-05-01 08:39:03.659311: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-05-01 08:39:08.230922: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8100
2022-05-01 08:39:10.132727: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Train_loss: 0.44923004508018494, Val_loss : 0.2657454013824463, F1_overall : 0.9243848641240334, loss_overall: 0.1370337188467177 Best_loss: 0.1370337188467177, best_epoch: 0
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.2955048382282257, Val_loss : 0.2088337540626526, F1_overall : 0.9729038543506057, loss_overall: 0.05289949503176413 Best_loss: 0.05289949503176413, best_epoch: 1
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.2520159184932709, Val_loss : 0.15381161868572235, F1_overall : 0.9787312064539786, loss_overall: 0.041130477276429385 Best_loss: 0.041130477276429385, best_epoch: 2
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.21704606711864471, Val_loss : 0.13084080815315247, F1_overall : 0.9814712896716198, loss_overall: 0.03567356246945757 Best_loss: 0.0356735624694

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Train_loss: 0.08351726830005646, Val_loss : 0.06378549337387085, F1_overall : 0.9897335614764114, loss_overall: 0.020280175924417658 Best_loss: 0.017836781234728784, best_epoch: 92
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.0835522785782814, Val_loss : 0.061857324093580246, F1_overall : 0.9898269148437341, loss_overall: 0.020076559700276917 Best_loss: 0.017836781234728784, best_epoch: 92
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.08306669443845749, Val_loss : 0.05883604288101196, F1_overall : 0.9905696885756766, loss_overall: 0.018691969376119888 Best_loss: 0.017836781234728784, best_epoch: 92
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.0829988643527031, Val_loss : 0.06023921072483063, F1_overall : 0.9907707352851292, loss_overall: 0.018366183417494707 Best_loss: 0.

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Train_loss: 0.09630373865365982, Val_loss : 0.06654718518257141, F1_overall : 0.9862917976858415, loss_overall: 0.026707566462167688 Best_loss: 0.024008179959100203, best_epoch: 9
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.09613606333732605, Val_loss : 0.07014923542737961, F1_overall : 0.9844977178337924, loss_overall: 0.030061349693251534 Best_loss: 0.024008179959100203, best_epoch: 9
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.09511783719062805, Val_loss : 0.07173226028680801, F1_overall : 0.9844280336535493, loss_overall: 0.030347648261758692 Best_loss: 0.024008179959100203, best_epoch: 9
figure name : mbf40_cnn-f128-[5, 2, 2]_bigru[32, 32]_fc[32]_spec10_e150p37_batch480_2022_05_01_08_38_46
Train_loss: 0.09493046998977661, Val_loss : 0.07279066741466522, F1_overall : 0.9844793915099188, loss_overall: 0.030265848670756646 Best_loss: 0.02

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
from keras.utils import plot_model
plot_model(model, to_file='model_.png')