In [1]:
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import pandas as pd
import os

from glob import glob

In [2]:
import tensorflow.keras as K


In [3]:
K.__version__

'2.4.0'

In [4]:
# the model requires kapre
!pip install kapre





In [5]:
import kapre

In [6]:
# model definition
def _construct_milsed_block(num_blocks, dropout_rate = False):

    sample_rate = 22050
    input_shape = (sample_rate * 10, 1) # mono 10 seconds at 22050hz
    'n_mels'
    n_fft = 2048 # frame size
    hop_length = 256
    n_mels=256
    mel_f_min=0.0
    mel_f_max=None
    return_decibel=True
    model = K.Sequential()
    composed_melgram_layer = \
        kapre.composed.get_melspectrogram_layer(input_shape=input_shape,
                                                sample_rate=sample_rate,
                                                n_fft=n_fft,
                                                n_mels=n_mels,
                                                mel_f_min=mel_f_min,
                                                mel_f_max=mel_f_max,
                                                return_decibel=return_decibel)

    # decompose the layers the model can be saved
    for layer in composed_melgram_layer.layers:
        model.add(layer)

    model.add(K.layers.BatchNormalization())

    # add blocks
    n_filters = 16
    for block in range(num_blocks):
        model.add(K.layers.Convolution2D(n_filters, (3, 3),
                                       padding='same',
                                       activation='relu',
                                       kernel_initializer='he_normal'))
        model.add(K.layers.BatchNormalization())
        model.add(K.layers.Convolution2D(n_filters, (3, 3),
                                       padding='same',
                                       activation='relu',
                                       kernel_initializer='he_normal'))
        model.add(K.layers.BatchNormalization())
        model.add(K.layers.MaxPooling2D((2,2), padding='valid'))

        # double the number of filters for the next block
        n_filters *= 2

    model.add(K.layers.GlobalMaxPooling2D())

    model.add(K.layers.Dense(1028, activation='relu'))
    if dropout_rate:
        model.add(K.layers.Dropout(dropout_rate))

    model.add(K.layers.Dense(512, activation='relu'))
    if dropout_rate:
        model.add(K.layers.Dropout(dropout_rate))

    model.add(K.layers.Dense(264, activation='softmax'))

    return model

In [7]:
model = _construct_milsed_block(7)

In [8]:
# load the pretrained weights
#model.load_weights('/kaggle/input/pretrained-bird-vocalization-cnn/milsed_7block_dense-birdsongs_2_1618704934_e5b73727.h5')
model.load_weights('input/pretrained-bird-vocalization-cnn/milsed_7block_dense-birdsongs_2_1618704934_e5b73727.h5')

In [9]:
# inspect layers
model.layers

[<kapre.time_frequency.STFT at 0x159ee1430>,
 <kapre.time_frequency.Magnitude at 0x159ee1730>,
 <kapre.time_frequency.ApplyFilterbank at 0x159ee17f0>,
 <kapre.time_frequency.MagnitudeToDecibel at 0x159ee1ac0>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x15a170640>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x15a1bc610>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x15a0e4b20>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x15a11e5e0>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x15a16c8b0>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x15a1b2790>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x15a16c1c0>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x15a1b2070>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x15a156760>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x15a1b2b80>,
 <ten

In [10]:
# Freeze all but the last 3 layers
for layerid in range(len(model.layers) - 3):
    model.layers[layerid].trainable = False

In [11]:
# make sure correct layers are frozen.
for layer in model.layers:
    print(layer.trainable, layer.name)

False stft
False magnitude
False apply_filterbank
False magnitude_to_decibel
False batch_normalization
False conv2d
False batch_normalization_1
False conv2d_1
False batch_normalization_2
False max_pooling2d
False conv2d_2
False batch_normalization_3
False conv2d_3
False batch_normalization_4
False max_pooling2d_1
False conv2d_4
False batch_normalization_5
False conv2d_5
False batch_normalization_6
False max_pooling2d_2
False conv2d_6
False batch_normalization_7
False conv2d_7
False batch_normalization_8
False max_pooling2d_3
False conv2d_8
False batch_normalization_9
False conv2d_9
False batch_normalization_10
False max_pooling2d_4
False conv2d_10
False batch_normalization_11
False conv2d_11
False batch_normalization_12
False max_pooling2d_5
False conv2d_12
False batch_normalization_13
False conv2d_13
False batch_normalization_14
False max_pooling2d_6
False global_max_pooling2d
True dense
True dense_1
True dense_2


Now I need to create an input generator for the current dataset.  The datagenerator for the project from which this model came is very inefficient so I will use the example generator provided by kaggle.

In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from matplotlib import pyplot as plt
import soundfile as sf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import io
import os
import time

#count = 0
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        count += 1
#print('counted %d files.' % count)

In [13]:
short_audio_dir = 'input/birdclef-2021/train_short_audio'
train_file_pattern = os.path.join(short_audio_dir, '*/*.ogg')
audio_files = glob(train_file_pattern)
DATASET_SIZE = len(audio_files)

In [14]:
# compile model
model.compile(optimizer=K.optimizers.Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=[K.metrics.CategoricalAccuracy()])

# Simple pipeline.

- Split in train test by class.
- Load files in dataset.
- Pass to model.


In [15]:
# Train Val split
def split_from_df(df, class_col, val_prop, test_prop=0):

    train = {'files': [], 'labels':[]}
    val = {'files': [], 'labels':[]}
    test = {'files': [], 'labels':[]}
    
    grouped = df.groupby(class_col)
    for name, group in grouped:
        
        # randomly select test rows
        test_rows = group.sample(frac=test_prop, replace=False)
        test['files'] += test_rows['files'].tolist()
        test['labels'] += [name] * len(test_rows)
        
        # remove the test rows
        group = group.drop(test_rows.index)
        
        #randomly select validation rows
        val_rows = group.sample(frac=val_prop, replace=False)
        val['files'] += val_rows['files'].tolist()
        val['labels'] += [name] * len(val_rows)
        
        # remove the validation rows
        group = group.drop(val_rows.index)
        
        # train is everything left over
        train_rows = group
        train['files'] += train_rows['files'].tolist()
        train['labels'] += [name] * len(train_rows)
        
    return train, val, test

In [16]:
from glob import glob
files = glob('input/birdclef-2021/train_short_audio/*/*.ogg')

In [17]:
files[:5]

['input/birdclef-2021/train_short_audio/acafly/XC109605.ogg',
 'input/birdclef-2021/train_short_audio/acafly/XC11209.ogg',
 'input/birdclef-2021/train_short_audio/acafly/XC127032.ogg',
 'input/birdclef-2021/train_short_audio/acafly/XC129974.ogg',
 'input/birdclef-2021/train_short_audio/acafly/XC129981.ogg']

In [18]:
labels = [f.split('/')[-2] for f in files]
labels[:5]

['acafly', 'acafly', 'acafly', 'acafly', 'acafly']

In [19]:
files_df = pd.DataFrame({'files':files,'labels':labels})
files_df

Unnamed: 0,files,labels
0,input/birdclef-2021/train_short_audio/acafly/X...,acafly
1,input/birdclef-2021/train_short_audio/acafly/X...,acafly
2,input/birdclef-2021/train_short_audio/acafly/X...,acafly
3,input/birdclef-2021/train_short_audio/acafly/X...,acafly
4,input/birdclef-2021/train_short_audio/acafly/X...,acafly
...,...,...
62869,input/birdclef-2021/train_short_audio/yetvir/X...,yetvir
62870,input/birdclef-2021/train_short_audio/yetvir/X...,yetvir
62871,input/birdclef-2021/train_short_audio/yetvir/X...,yetvir
62872,input/birdclef-2021/train_short_audio/yetvir/X...,yetvir


In [20]:
train, val, test = split_from_df(files_df, 'labels', 0.15)

In [21]:
len(train['labels']), len(val['labels']), len(test['labels'])

(53442, 9432, 0)

In [22]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def get_labels_from_path(filename):
    label = tf.strings.split(filename, sep='/')[-2]
    return filename, label
    
def get_file_dataset(file_paths):
#     file_paths = 'input/birdclef-2021/train_short_audio/*/*.ogg'
    file_paths_ds = tf.data.Dataset.list_files(file_paths, shuffle=False)
    ds = file_paths_ds.map(get_labels_from_path, num_parallel_calls=3) #AUTOTUNE)
    
    return ds

# def load_audio(file_path, label):
#     audio = tfio.audio.AudioIOTensor(file_path, dtype=tf.int32)
#     return audio[0], label

def load_audio(filename, label):
    def _soundfile_read(filename):
        with open(filename.numpy(), 'br') as audio_file:
            tmp = io.BytesIO(audio_file.read())
            audio, rate = sf.read(tmp, dtype='float32')
        return audio
    [audio,] = tf.py_function(_soundfile_read, [filename], [tf.float32]))
    return audio, label

In [23]:
def prepare_for_training(ds, shuffle_buffer_size=128, batch_size=32):
    # Randomly shuffle
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    
    # load and decode audio from file paths
    ds = ds.map(load_audio, num_parallel_calls=AUTOTUNE)
    
    # repeat dataset forever
    ds = ds.repeat()
    
    # Prepare batches
    ds.batch(batch_size)
    
    # Prefetch
    ds = ds.prefetch(buffer_size=128) # AUTOTUNE
    
    return ds

In [24]:
ds = get_file_dataset(train['files'])
ds

<ParallelMapDataset shapes: ((), ()), types: (tf.string, tf.string)>

In [25]:
ds = prepare_for_training(ds)
ds

<PrefetchDataset shapes: ((None,), ()), types: (tf.int32, tf.string)>

In [26]:
# ds_iter = iter(ds)

In [27]:
# next(ds_iter)

In [28]:
# for row in ds:
#     row[0]
#     break