In [1]:
import os, sys
import pathlib
from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_io as tfio
from tensorflow.keras import models, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input, Conv2D, Dense, Dropout, Flatten, MaxPooling2D, BatchNormalization, Activation
from tensorflow.keras.layers.experimental.preprocessing import Resizing, Normalization
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# The list is empty on a CPU-only machine, so iterate rather than indexing [0],
# which would raise IndexError there. set_memory_growth returns None, so its
# result is not kept.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)
print(f'Running on Python {sys.version}, Tensorflow {tf.__version__}.')

Running on Python 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)], Tensorflow 2.5.0.


In [2]:

# Data loading
seed = 69
AUTOTUNE = tf.data.AUTOTUNE
tf.random.set_seed(seed)
np.random.seed(seed)

data_dir = pathlib.Path('s1_release')
# Each entry of data_dir is treated as one class. Sort the names so that label ids
# (positions in `labels`) do not depend on the order listdir happens to return.
labels = np.array(sorted(tf.io.gfile.listdir(str(data_dir))))
print('Commands:', labels)

# load given train set
# Sort before shuffling so the seeded shuffle starts from the same order on every platform.
filenames = sorted(tf.io.gfile.glob(str(data_dir) + '/*/*'))
if not filenames:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f'No audio files found under {data_dir}')
filenames = tf.random.shuffle(filenames)
num_samples = len(filenames)
print('Number of total examples:', num_samples)
print('Number of examples per label:', len(tf.io.gfile.listdir(str(data_dir/labels[0]))))
print('Example file tensor:', filenames[0])

# Convert the shuffled string tensor back to plain Python strings, then split 80/20.
filenames = [file.decode('utf-8') for file in filenames.numpy()]
split_at = round(num_samples*0.8)
train_files = filenames[:split_at]  # first 80%
val_files = filenames[split_at:]  # last 20%

print('Training set size', len(train_files))
print('Validation set size', len(val_files))

# load given test set
# NOTE: data_dir is rebound to the test folder from here on.
data_dir = pathlib.Path('s1_test_release')
test_files = tf.io.gfile.glob(str(data_dir) + '/*')  # provided
print('Test set size', len(test_files))

Commands: ['bird' 'eight' 'falcon' 'five' 'four' 'nine' 'one' 'seven' 'six' 'snake'
 'three' 'two' 'zero']
Number of total examples: 2600
Number of examples per label: 200
Example file tensor: tf.Tensor(b's1_release\\falcon\\train_falcon_0967.wav', shape=(), dtype=string)
Training set size 2080
Validation set size 520
Test set size 649


In [3]:
# Sanity check on a single clip: decode it, play it back, and preview the
# [-1, 1] scaling that is applied before the VGGish model.
audio = tfio.audio.AudioIOTensor('s1_release/bird/train_bird_8401.wav', dtype=tf.int16)
audio_tensor = tf.squeeze(audio.to_tensor())
print(audio)
print(audio_tensor)
display.display(display.Audio(audio_tensor.numpy(), rate=audio.rate.numpy()))
# Reuse the samples decoded above instead of reading the file a second time.
audio = tf.cast(audio_tensor, tf.float32)
# Min-max scale the waveform into [-1, 1].
audio_min, audio_max = tf.reduce_min(audio), tf.reduce_max(audio)
audio = 2* tf.divide(tf.subtract(audio, audio_min), tf.subtract(audio_max, audio_min)) -1
# Use the TF reductions: the builtin max()/min() walk the tensor element by element in Python.
print(audio, tf.reduce_max(audio), tf.reduce_min(audio))
vggish = hub.load('https://tfhub.dev/google/vggish/1')

def get_waveform_and_label_id(file_path):
    """Load one audio file and return its VGGish embedding and integer label id.

    Despite the name, the first returned value is an embedding, not the waveform.

    Args:
        file_path: Path to an audio file whose parent directory name is the class
            label, i.e. ``<root>/<label>/<file>``.

    Returns:
        A tuple ``(embedding, label_id)``. ``embedding`` is the VGGish output
        averaged over its frame axis, so it is a flat vector (128 wide per the
        TF Hub model documentation) matching the classifier's ``Input(128)``.
        ``label_id`` is the position of the label in the global ``labels`` array.
    """
    audio = tf.cast(tf.squeeze(tfio.audio.AudioIOTensor(file_path, dtype=tf.int16).to_tensor()), tf.float32)
    # Min-max scale into [-1, 1]. divide_no_nan keeps a flat clip (max == min)
    # finite (it maps to a constant -1) instead of producing NaN from 0/0.
    audio_min = tf.reduce_min(audio)
    audio_range = tf.subtract(tf.reduce_max(audio), audio_min)
    audio = 2 * tf.math.divide_no_nan(tf.subtract(audio, audio_min), audio_range) - 1
    # VGGish is documented to return one row per ~0.96 s frame, i.e. a
    # (frames, 128) matrix. Average over frames so each example is a flat
    # vector; for a clip that yields a single frame this equals that frame.
    # NOTE(review): a clip too short to yield any frame would average to NaN;
    # confirm all clips are at least ~1 s long.
    embedding = tf.reduce_mean(vggish(audio), axis=0)
    # The label is the name of the directory that contains the file.
    parts = tf.strings.split(file_path, os.path.sep)
    label = parts[-2]
    # Position of the matching entry in `labels`; argmax yields 0 if nothing matches.
    label_id = tf.argmax(label == labels)
    return embedding, label_id

def preprocess_dataset(files):
    """Build a tf.data pipeline that maps audio file paths to (embedding, label id) pairs.

    Args:
        files: Collection of audio file paths.

    Returns:
        An unbatched ``tf.data.Dataset`` in which every path has been passed
        through ``get_waveform_and_label_id``, with the degree of parallelism
        left to ``AUTOTUNE``.
    """
    return tf.data.Dataset.from_tensor_slices(files).map(
        get_waveform_and_label_id,
        num_parallel_calls=AUTOTUNE,
    )

# Build the training and validation pipelines from their respective file lists.
train_ds, val_ds = (preprocess_dataset(split) for split in (train_files, val_files))
# audio_tensor, label = get_waveform_and_label('s1_release/s1_release/bird/train_bird_8400.wav')
# tensor = tf.cast(audio_tensor, tf.float32) / 32768.0
# plt.figure()
# plt.plot(tensor.numpy())

<AudioIOTensor: shape=[16000     1], dtype=<dtype: 'int16'>, rate=16000>
tf.Tensor([-245 -257 -263 ... -301 -326 -310], shape=(16000,), dtype=int16)


tf.Tensor([0.1926763  0.19153202 0.19095981 ... 0.18733609 0.18495214 0.1864779 ], shape=(16000,), dtype=float32) tf.Tensor(1.0, shape=(), dtype=float32) tf.Tensor(-1.0, shape=(), dtype=float32)


In [4]:
# Training
batch_size = 32

# Batch each pipeline, then cache the batched result and prefetch it.
# NOTE: train_ds / val_ds are rebound in place, so re-running this cell without
# re-running the cell that created them would batch them a second time.
train_ds = train_ds.batch(batch_size)
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.batch(batch_size)
val_ds = val_ds.cache().prefetch(AUTOTUNE)

# NOTE(review): presumably the raw clip length in samples (the example clip is
# 16000 samples long); the network input itself is the 128-wide tensor below.
input_shape = 16000
print('Input shape:', input_shape)
num_labels = len(labels)

# Currently disabled: an adapt()-ed Normalization layer.
# norm_layer = Normalization()
# norm_layer.adapt(train_ds.map(lambda x, _: x))

# Classifier head on top of the VGGish embeddings: three Dense+ReLU stages
# followed by a linear layer with one logit per label.
xInput = Input(128)
x = xInput
for units in (512, 256, 128):
    x = Dense(units)(x)
    x = Activation('relu')(x)
xOutput = Dense(num_labels)(x)  # raw logits; the loss is configured with from_logits=True
model = Model(inputs=xInput, outputs=xOutput)

Input shape: 16000


In [5]:
# Training configuration.
epoch = 100
metrics = ['accuracy']
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = keras.optimizers.Adam(learning_rate=1e-3, epsilon=1e-6)

# Stop once validation accuracy has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)
# Cut the learning rate tenfold after 4 epochs without validation-accuracy improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.1,
    patience=4,
    verbose=1,
)
# Currently disabled:
# tf.keras.callbacks.ModelCheckpoint('./best_model',monitor='val_accuracy',save_best_only=True),
callbacks = [early_stopping, reduce_lr]

model.compile(optimizer=opt, loss=loss, metrics=metrics)
# Write the architecture diagram to disk, then print the layer table.
plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, to_file='SC1v2.png')
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
dense (Dense)                (None, 512)               66048     
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0     

In [6]:
# Train the classifier head; the callbacks handle early stopping and learning-rate decay.
# use_multiprocessing is not passed: it applies only to generator / keras.utils.Sequence
# inputs, so it has no effect on a tf.data.Dataset, and newer Keras releases reject it.
history = model.fit(train_ds, epochs=epoch, validation_data=val_ds, callbacks=callbacks, verbose=1)
# model.save('sc1')

Epoch 1/100












Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 00016: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 17/100
Epoch 18/100
Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping
