In [1]:
import os, sys
import pathlib
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras
from tensorflow.image import ResizeMethod
from tensorflow.keras import models, Model
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow_addons as tfa
physical_devices = tf.config.experimental.list_physical_devices('GPU')
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
print(f'Running on Python {sys.version}, Tensorflow {tf.__version__}.')

Running on Python 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)], Tensorflow 2.5.0.


In [2]:
# TODO BEFORE SUBMITTING CODE:
# remove val as test codes from data loading, see if need remove test files folder
# uncomment test_preprocess func and codes in data loading

# Data loading
seed = 69
AUTOTUNE = tf.data.AUTOTUNE
tf.random.set_seed(seed)
np.random.seed(seed)
sample_rate = 16000

# load normalized train set
# data_dir = pathlib.Path('s1_release')
data_dir = pathlib.Path('s1_release_normalized')
labels = np.array(tf.io.gfile.listdir(str(data_dir)))
num_labels = len(labels)
print('Commands:', labels)

# Original
# filenames = tf.io.gfile.glob(str(data_dir) + '/*/*')
# filenames = tf.random.shuffle(filenames)
# num_samples = len(filenames)
# print('Number of total examples:', num_samples)
# print('Number of examples per label:', len(tf.io.gfile.listdir(str(data_dir/labels[0]))))
# print('Example file tensor:', filenames[0])
#
# train_files = filenames[:round(num_samples*0.8)]  # first 80%
# val_files = filenames[round(num_samples*0.8):]  # last 20%
#
# print('Training set size', len(train_files))
# print('Validation set size', len(val_files))

# load test as val
train_files = tf.io.gfile.glob(str(data_dir) + '/*/*')
train_files = tf.random.shuffle(train_files)
num_samples = len(train_files)
print('Number of train examples:', num_samples)
print('Number of train examples per label:', len(tf.io.gfile.listdir(str(data_dir/labels[0]))))

# val
val_file_ans = pd.read_csv('answers.csv', names=['file', 'label'])
data_dir = pathlib.Path('s1_test_release_normalized')
val_files = tf.io.gfile.glob(str(data_dir) + '/*')
print('Val set size', len(val_files))
for file in val_files:
    row = val_file_ans[val_file_ans['file'] == os.path.basename(file).replace('_normalized', '')].index[0]
    label = val_file_ans['label'][int(row)]
    os.makedirs('test', exist_ok=True)
    os.makedirs(os.path.join('test', label), exist_ok=True)
    shutil.copy(file, os.path.join('test', label))
data_dir = pathlib.Path('test')
val_files = tf.io.gfile.glob(str(data_dir) + '/*/*')

# load given test set
data_dir = pathlib.Path('s1_test_release_normalized')
test_files = tf.io.gfile.glob(str(data_dir) + '/*')  # provided
print('Test set size', len(test_files))

Commands: ['bird' 'eight' 'falcon' 'five' 'four' 'nine' 'one' 'seven' 'six' 'snake'
 'three' 'two' 'zero']
Number of train examples: 3250
Number of train examples per label: 250
Val set size 649
Test set size 649


In [3]:
stft_params = [{'nfft': 512, 'window': 501, 'stride': 21}, {'nfft': 1024, 'window': 1001, 'stride': 31}, {'nfft': 2048, 'window': 1600, 'stride': 61}]  # stride = hop size
# stft_params = [{'nfft': 256, 'window': 240, 'stride': 80}, {'nfft': 512, 'window': 480, 'stride': 160}, {'nfft': 1024, 'window': 960, 'stride': 320}]  # stride = hop size
image_size = 64

def train_preprocess(file_path):
    # Read file into AudioIOTensor, need to specify dtype as running it inside a tf.data function without eager execution CHECKED
    audio_tensor = tf.squeeze(tf.cast(tfio.audio.decode_wav(tf.io.read_file(file_path), dtype=tf.int16), tf.float32) / 32768.0, axis=-1)

    # Trim noise (TODO: try with different gate, default 0.1)
    # position = tfio.audio.trim(audio_tensor, axis=0, epsilon=0.1)  # epsilon = noise gate (how loud between 0 to 1)
    # audio_tensor = audio_tensor[position[0]:position[1]]
    # faded_audio_tensor = tfio.audio.fade(trimmed_audio_tensor, fade_in=1000, fade_out=1000, mode="logarithmic")

    # zero pad to 16000  CHECKED
    zero_padding = tf.zeros([sample_rate] - tf.shape(audio_tensor), dtype=tf.float32)
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    padded_audio_tensor = tf.concat([audio_tensor, zero_padding], 0)

    # spectrogram CHECKED
    spectrogram = tfio.audio.spectrogram(padded_audio_tensor, nfft=1024, window=1001, stride=31)
    spectrogram = tf.abs(spectrogram)

    spectrogram = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[0]['nfft'], window=stft_params[0]['window'], stride=stft_params[0]['stride']))
    spectrogram1 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[0]['nfft'], window=stft_params[0]['window'], stride=stft_params[0]['stride']))
    spectrogram2 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[1]['nfft'], window=stft_params[1]['window'], stride=stft_params[1]['stride']))
    spectrogram3 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[2]['nfft'], window=stft_params[2]['window'], stride=stft_params[2]['stride']))

    # Convert to mel-spectrogram
    mel_spectrogram = tfio.audio.melscale(spectrogram, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram1 = tfio.audio.melscale(spectrogram1, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram2 = tfio.audio.melscale(spectrogram2, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram3 = tfio.audio.melscale(spectrogram3, rate=sample_rate, mels=80, fmin=20, fmax=8000)

    # Convert to db scale mel-spectrogram (tested causes loss to nan)
    # mel_spectrogram = tfio.audio.dbscale(mel_spectrogram, top_db=80)
    # mel_spectrogram1 = tfio.audio.dbscale(mel_spectrogram1, top_db=80)
    # mel_spectrogram2 = tfio.audio.dbscale(mel_spectrogram2, top_db=80)
    # mel_spectrogram3 = tfio.audio.dbscale(mel_spectrogram3, top_db=80)

    # Convert to log-mel spectrogram
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
    log_mel_spectrogram1 = tf.math.log(mel_spectrogram1 + 1e-6)
    log_mel_spectrogram2 = tf.math.log(mel_spectrogram2 + 1e-6)
    log_mel_spectrogram3 = tf.math.log(mel_spectrogram3 + 1e-6)

    # Freq masking
    freq_mask = tfio.audio.freq_mask(log_mel_spectrogram, param=7)
    freq_mask1 = tfio.audio.freq_mask(log_mel_spectrogram1, param=7)
    freq_mask2 = tfio.audio.freq_mask(log_mel_spectrogram2, param=7)
    freq_mask3 = tfio.audio.freq_mask(log_mel_spectrogram3, param=7)
    # double aug
    # freq_mask1 = tfio.audio.freq_mask(freq_mask1, param=7)
    # freq_mask2 = tfio.audio.freq_mask(freq_mask2, param=7)
    # freq_mask3 = tfio.audio.freq_mask(freq_mask3, param=7)

    # Time masking
    time_mask = tfio.audio.time_mask(freq_mask, param=25)
    time_mask1 = tfio.audio.time_mask(freq_mask1, param=25)
    time_mask2 = tfio.audio.time_mask(freq_mask2, param=25)
    time_mask3 = tfio.audio.time_mask(freq_mask3, param=25)
    # double aug
    # time_mask1 = tfio.audio.time_mask(time_mask1, param=25)
    # time_mask2 = tfio.audio.time_mask(time_mask2, param=25)
    # time_mask3 = tfio.audio.time_mask(time_mask3, param=25)

    # MFCC
    # time_mask = tf.signal.mfccs_from_log_mel_spectrograms(time_mask)
    # time_mask1 = tf.signal.mfccs_from_log_mel_spectrograms(time_mask1)
    # time_mask2 = tf.signal.mfccs_from_log_mel_spectrograms(time_mask2)
    # time_mask3 = tf.signal.mfccs_from_log_mel_spectrograms(time_mask3)

    # label id extraction
    label = tf.strings.split(file_path, os.path.sep)[-2]
    label_id = tf.argmax(label == labels)

    time_mask1 = tf.expand_dims(time_mask1, -1)
    time_mask1 = tf.image.resize(time_mask1, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    time_mask2 = tf.expand_dims(time_mask2, -1)
    time_mask2 = tf.image.resize(time_mask2, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    time_mask3 = tf.expand_dims(time_mask3, -1)
    time_mask3 = tf.image.resize(time_mask3, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram = tf.squeeze(tf.stack([time_mask1, time_mask2, time_mask3], axis=-1))
    log_mel_spectrogram.set_shape([image_size, image_size, 3])
    return log_mel_spectrogram, label_id  # Switch output here, log, dbscale, freq mask, time mask, mfccs

def val_preprocess(file_path):
    # Read file into AudioIOTensor, need to specify dtype as running it inside a tf.data function without eager execution CHECKED
    audio_tensor = tf.squeeze(tf.cast(tfio.audio.decode_wav(tf.io.read_file(file_path), dtype=tf.int16), tf.float32) / 32768.0, axis=-1)

    # Trim noise (TODO: try with different gate, default 0.1)
    # position = tfio.audio.trim(audio_tensor, axis=0, epsilon=0.1)  # epsilon = noise gate (how loud between 0 to 1)
    # audio_tensor = audio_tensor[position[0]:position[1]]
    # faded_audio_tensor = tfio.audio.fade(trimmed_audio_tensor, fade_in=1000, fade_out=1000, mode="logarithmic")

    # zero pad to 16000  CHECKED
    zero_padding = tf.zeros([sample_rate] - tf.shape(audio_tensor), dtype=tf.float32)
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    padded_audio_tensor = tf.concat([audio_tensor, zero_padding], 0)

     # spectrogram CHECKED
    spectrogram = tfio.audio.spectrogram(padded_audio_tensor, nfft=1024, window=1001, stride=31)
    spectrogram = tf.abs(spectrogram)

    spectrogram = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[0]['nfft'], window=stft_params[0]['window'], stride=stft_params[0]['stride']))
    spectrogram1 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[0]['nfft'], window=stft_params[0]['window'], stride=stft_params[0]['stride']))
    spectrogram2 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[1]['nfft'], window=stft_params[1]['window'], stride=stft_params[1]['stride']))
    spectrogram3 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[2]['nfft'], window=stft_params[2]['window'], stride=stft_params[2]['stride']))

    # Convert to mel-spectrogram
    mel_spectrogram = tfio.audio.melscale(spectrogram, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram1 = tfio.audio.melscale(spectrogram1, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram2 = tfio.audio.melscale(spectrogram2, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram3 = tfio.audio.melscale(spectrogram3, rate=sample_rate, mels=80, fmin=20, fmax=8000)

    # Convert to db scale mel-spectrogram (tested causes loss to nan)
    # mel_spectrogram = tfio.audio.dbscale(mel_spectrogram, top_db=80)
    # mel_spectrogram1 = tfio.audio.dbscale(mel_spectrogram1, top_db=80)
    # mel_spectrogram2 = tfio.audio.dbscale(mel_spectrogram2, top_db=80)
    # mel_spectrogram3 = tfio.audio.dbscale(mel_spectrogram3, top_db=80)

    # Convert to log-mel spectrogram
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
    log_mel_spectrogram1 = tf.math.log(mel_spectrogram1 + 1e-6)
    log_mel_spectrogram2 = tf.math.log(mel_spectrogram2 + 1e-6)
    log_mel_spectrogram3 = tf.math.log(mel_spectrogram3 + 1e-6)

    # label id extraction
    label = tf.strings.split(file_path, os.path.sep)[-2]
    label_id = tf.argmax(label == labels)

    # MFCC
    # log_mel_spectrogram = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)
    # log_mel_spectrogram1 = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram1)
    # log_mel_spectrogram2 = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram2)
    # log_mel_spectrogram3 = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram3)

    log_mel_spectrogram1 = tf.expand_dims(log_mel_spectrogram1, -1)
    log_mel_spectrogram1 = tf.image.resize(log_mel_spectrogram1, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram2 = tf.expand_dims(log_mel_spectrogram2, -1)
    log_mel_spectrogram2 = tf.image.resize(log_mel_spectrogram2, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram3 = tf.expand_dims(log_mel_spectrogram3, -1)
    log_mel_spectrogram3 = tf.image.resize(log_mel_spectrogram3, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram = tf.squeeze(tf.stack([log_mel_spectrogram1, log_mel_spectrogram2, log_mel_spectrogram3], axis=-1))
    log_mel_spectrogram.set_shape([image_size, image_size, 3])
    return log_mel_spectrogram, label_id  # switch output here to db or log

def test_preprocess(file_path):
    # Read file into AudioIOTensor, need to specify dtype as running it inside a tf.data function without eager execution CHECKED
    audio_tensor = tf.squeeze(tf.cast(tfio.audio.decode_wav(tf.io.read_file(file_path), dtype=tf.int16), tf.float32) / 32768.0, axis=-1)

    # Trim noise (TODO: try with different gate, default 0.1)
    # position = tfio.audio.trim(audio_tensor, axis=0, epsilon=0.1)  # epsilon = noise gate (how loud between 0 to 1)
    # audio_tensor = audio_tensor[position[0]:position[1]]
    # faded_audio_tensor = tfio.audio.fade(trimmed_audio_tensor, fade_in=1000, fade_out=1000, mode="logarithmic")

    # zero pad to 16000  CHECKED
    zero_padding = tf.zeros([sample_rate] - tf.shape(audio_tensor), dtype=tf.float32)
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    padded_audio_tensor = tf.concat([audio_tensor, zero_padding], 0)

     # spectrogram CHECKED
    spectrogram = tfio.audio.spectrogram(padded_audio_tensor, nfft=1024, window=1001, stride=31)
    spectrogram = tf.abs(spectrogram)

    spectrogram = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[0]['nfft'], window=stft_params[0]['window'], stride=stft_params[0]['stride']))
    spectrogram1 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[0]['nfft'], window=stft_params[0]['window'], stride=stft_params[0]['stride']))
    spectrogram2 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[1]['nfft'], window=stft_params[1]['window'], stride=stft_params[1]['stride']))
    spectrogram3 = tf.abs(tfio.audio.spectrogram(padded_audio_tensor, nfft=stft_params[2]['nfft'], window=stft_params[2]['window'], stride=stft_params[2]['stride']))

    # Convert to mel-spectrogram
    mel_spectrogram = tfio.audio.melscale(spectrogram, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram1 = tfio.audio.melscale(spectrogram1, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram2 = tfio.audio.melscale(spectrogram2, rate=sample_rate, mels=80, fmin=20, fmax=8000)
    mel_spectrogram3 = tfio.audio.melscale(spectrogram3, rate=sample_rate, mels=80, fmin=20, fmax=8000)

    # Convert to db scale mel-spectrogram (tested causes loss to nan)
    # mel_spectrogram = tfio.audio.dbscale(mel_spectrogram, top_db=80)
    # mel_spectrogram1 = tfio.audio.dbscale(mel_spectrogram1, top_db=80)
    # mel_spectrogram2 = tfio.audio.dbscale(mel_spectrogram2, top_db=80)
    # mel_spectrogram3 = tfio.audio.dbscale(mel_spectrogram3, top_db=80)

    # Convert to log-mel spectrogram
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)
    log_mel_spectrogram1 = tf.math.log(mel_spectrogram1 + 1e-6)
    log_mel_spectrogram2 = tf.math.log(mel_spectrogram2 + 1e-6)
    log_mel_spectrogram3 = tf.math.log(mel_spectrogram3 + 1e-6)

    # MFCC
    # log_mel_spectrogram = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)
    # log_mel_spectrogram1 = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram1)
    # log_mel_spectrogram2 = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram2)
    # log_mel_spectrogram3 = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram3)

    log_mel_spectrogram1 = tf.expand_dims(log_mel_spectrogram1, -1)
    log_mel_spectrogram1 = tf.image.resize(log_mel_spectrogram1, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram2 = tf.expand_dims(log_mel_spectrogram2, -1)
    log_mel_spectrogram2 = tf.image.resize(log_mel_spectrogram2, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram3 = tf.expand_dims(log_mel_spectrogram3, -1)
    log_mel_spectrogram3 = tf.image.resize(log_mel_spectrogram3, (image_size, image_size), method=ResizeMethod.LANCZOS5)
    log_mel_spectrogram = tf.squeeze(tf.stack([log_mel_spectrogram1, log_mel_spectrogram2, log_mel_spectrogram3], axis=-1))
    log_mel_spectrogram.set_shape([image_size, image_size, 3])
    return log_mel_spectrogram  # switch output here to db or log


train_ds = tf.data.Dataset.from_tensor_slices(train_files).map(train_preprocess, num_parallel_calls=AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices(val_files).map(val_preprocess, num_parallel_calls=AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices(test_files).map(test_preprocess, num_parallel_calls=AUTOTUNE)

for spectrogram, label in train_ds.take(1):
    input_shape = spectrogram.shape

In [4]:
def create_ffn(hidden_units, dropout_rate):
    ffn_layers = []
    for units in hidden_units[:-1]:
        ffn_layers.append(layers.Dense(units, activation=tf.nn.gelu))

    ffn_layers.append(layers.Dense(units=hidden_units[-1]))
    ffn_layers.append(layers.Dropout(dropout_rate))

    ffn = keras.Sequential(ffn_layers)
    return ffn

class Patches(layers.Layer):
    def __init__(self, patch_size):
        super(Patches, self).__init__()
        self.patch_size = patch_size

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

class PatchEncoder(layers.Layer):
    def __init__(self, num_patches, projection_dim):
        super(PatchEncoder, self).__init__()
        self.num_patches = num_patches
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patches):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patches) + self.position_embedding(positions)
        return encoded

def create_cross_attention_module(
    latent_dim, data_dim, projection_dim, ffn_units, dropout_rate
):

    inputs = {
        # Recieve the latent array as an input of shape [1, latent_dim, projection_dim].
        "latent_array": layers.Input(shape=(latent_dim, projection_dim)),
        # Recieve the data_array (encoded image) as an input of shape [batch_size, data_dim, projection_dim].
        "data_array": layers.Input(shape=(data_dim, projection_dim)),
    }

    # Apply layer norm to the inputs
    latent_array = layers.LayerNormalization(epsilon=1e-6)(inputs["latent_array"])
    data_array = layers.LayerNormalization(epsilon=1e-6)(inputs["data_array"])

    # Create query tensor: [1, latent_dim, projection_dim].
    query = layers.Dense(units=projection_dim)(latent_array)
    # Create key tensor: [batch_size, data_dim, projection_dim].
    key = layers.Dense(units=projection_dim)(data_array)
    # Create value tensor: [batch_size, data_dim, projection_dim].
    value = layers.Dense(units=projection_dim)(data_array)

    # Generate cross-attention outputs: [batch_size, latent_dim, projection_dim].
    attention_output = layers.Attention(use_scale=True, dropout=0.1)(
        [query, key, value], return_attention_scores=False
    )
    # Skip connection 1.
    attention_output = layers.Add()([attention_output, latent_array])

    # Apply layer norm.
    attention_output = layers.LayerNormalization(epsilon=1e-6)(attention_output)
    # Apply Feedforward network.
    ffn = create_ffn(hidden_units=ffn_units, dropout_rate=dropout_rate)
    outputs = ffn(attention_output)
    # Skip connection 2.
    outputs = layers.Add()([outputs, attention_output])

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

def create_transformer_module(
    latent_dim,
    projection_dim,
    num_heads,
    num_transformer_blocks,
    ffn_units,
    dropout_rate,
):

    # input_shape: [1, latent_dim, projection_dim]
    inputs = layers.Input(shape=(latent_dim, projection_dim))

    x0 = inputs
    # Create multiple layers of the Transformer block.
    for _ in range(num_transformer_blocks):
        # Apply layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(x0)
        # Create a multi-head self-attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, x0])
        # Apply layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # Apply Feedforward network.
        ffn = create_ffn(hidden_units=ffn_units, dropout_rate=dropout_rate)
        x3 = ffn(x3)
        # Skip connection 2.
        x0 = layers.Add()([x3, x2])

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=x0)
    return model

class Perceiver(keras.Model):
    def __init__(
        self,
        patch_size,
        data_dim,
        latent_dim,
        projection_dim,
        num_heads,
        num_transformer_blocks,
        ffn_units,
        dropout_rate,
        num_iterations,
        classifier_units,
    ):
        super(Perceiver, self).__init__()

        self.latent_dim = latent_dim
        self.data_dim = data_dim
        self.patch_size = patch_size
        self.projection_dim = projection_dim
        self.num_heads = num_heads
        self.num_transformer_blocks = num_transformer_blocks
        self.ffn_units = ffn_units
        self.dropout_rate = dropout_rate
        self.num_iterations = num_iterations
        self.classifier_units = classifier_units

    def build(self, input_shape):
        # Create latent array.
        self.latent_array = self.add_weight(
            shape=(self.latent_dim, self.projection_dim),
            initializer="random_normal",
            trainable=True,
        )

        # Create patching module.
        self.patcher = Patches(self.patch_size)

        # Create patch encoder.
        self.patch_encoder = PatchEncoder(self.data_dim, self.projection_dim)

        # Create cross-attenion module.
        self.cross_attention = create_cross_attention_module(
            self.latent_dim,
            self.data_dim,
            self.projection_dim,
            self.ffn_units,
            self.dropout_rate,
        )

        # Create Transformer module.
        self.transformer = create_transformer_module(
            self.latent_dim,
            self.projection_dim,
            self.num_heads,
            self.num_transformer_blocks,
            self.ffn_units,
            self.dropout_rate,
        )

        # Create global average pooling layer.
        self.global_average_pooling = layers.GlobalAveragePooling1D()

        # Create a classification head.
        self.classification_head = create_ffn(
            hidden_units=self.classifier_units, dropout_rate=self.dropout_rate
        )

        super(Perceiver, self).build(input_shape)

    def call(self, inputs):
        # Create patches.
        patches = self.patcher(inputs)
        # Encode patches.
        encoded_patches = self.patch_encoder(patches)
        # Prepare cross-attention inputs.
        cross_attention_inputs = {
            "latent_array": tf.expand_dims(self.latent_array, 0),
            "data_array": encoded_patches,
        }
        # Apply the cross-attention and the Transformer modules iteratively.
        for _ in range(num_iterations):
            # Apply cross-attention from the latent array to the data array.
            latent_array = self.cross_attention(cross_attention_inputs)
            # Apply self-attention Transformer to the latent array.
            latent_array = self.transformer(latent_array)
            # Set the latent array of the next iteration.
            cross_attention_inputs["latent_array"] = latent_array

        # Apply global average pooling to generate a [batch_size, projection_dim] repesentation tensor.
        representation = self.global_average_pooling(latent_array)
        # Generate logits.
        logits = self.classification_head(representation)
        return logits

In [5]:
num_classes = 13
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 32
num_epochs = 120
dropout_rate = 0.2
patch_size = 2  # Size of the patches to be extract from the input images.
num_patches = (image_size // patch_size) ** 2  # Size of the data array.
latent_dim = 256  # Size of the latent array.
projection_dim = 256  # Embedding size of each element in the data and latent arrays.
num_heads = 8  # Number of Transformer heads.
ffn_units = [
    projection_dim,
    projection_dim,
]  # Size of the Transformer Feedforward network.
num_transformer_blocks = 4
num_iterations = 2  # Repetitions of the cross-attention and Transformer modules.
classifier_units = [
    projection_dim,
    num_classes,
]  # Size of the Feedforward network of the final classifier.

# Training
train_ds = train_ds.batch(batch_size).cache().prefetch(AUTOTUNE)
val_ds = val_ds.batch(batch_size).cache().prefetch(AUTOTUNE)

print('Input shape:', input_shape)
norm_layer = Normalization()
norm_layer.adapt(train_ds.map(lambda x, _: x))

xInput = Input(input_shape, dtype=tf.uint8)
x = norm_layer(xInput)
x = tf.cast(x, tf.float32)
perceiver = Perceiver(
    patch_size,
    num_patches,
    latent_dim,
    projection_dim,
    num_heads,
    num_transformer_blocks,
    ffn_units,
    dropout_rate,
    num_iterations,
    classifier_units,
)
xOutput = perceiver(x)
model = Model(xInput, xOutput, name='Perceiver')
plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, to_file='SC1 Perceiver.png') # SC1_ensemble.png
model.summary()

Input shape: (64, 64, 3)
Model: "Perceiver"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64, 64, 3)]       0         
_________________________________________________________________
normalization (Normalization (None, 64, 64, 3)         7         
_________________________________________________________________
tf.cast (TFOpLambda)         (None, 64, 64, 3)         0         
_________________________________________________________________
perceiver (Perceiver)        (None, 13)                9675278   
Total params: 9,675,285
Trainable params: 9,675,278
Non-trainable params: 7
_________________________________________________________________


In [6]:
# Create LAMB optimizer with weight decay.
optimizer = tfa.optimizers.LAMB(
    learning_rate=learning_rate, weight_decay_rate=weight_decay)

# Compile the model.
model.compile(
    optimizer=optimizer,
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['acc'],
)

callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_acc', min_delta=0, patience=7, verbose=1,
                                     mode='auto', baseline=None, restore_best_weights=True),
    # tf.keras.callbacks.ModelCheckpoint('./best_model',monitor='val_accuracy',save_best_only=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=4, verbose=1)]

In [7]:
history = model.fit(train_ds, epochs=num_epochs, validation_data=val_ds, callbacks=callbacks, use_multiprocessing=True, verbose=1)
model.save('sc1v2')

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120

Epoch 00022: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 23/120
Epoch 24/120
Epoch 25/120
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping


AttributeError: 'NoneType' object has no attribute 'replace'

In [None]:
# Generate prediction csv
# print('Loading model...')
# model = models.load_model('best models/densenet my 95.68')
print('Model loaded. Predicting...')
i = 0
predicted_labels, filenames = [], []
for spectrogram in test_ds.batch(1):
    filenames.append(os.path.basename(test_files[i].replace('_normalized', '')))
    prediction = model(spectrogram)
    prediction_value = tf.nn.softmax(prediction[0]).numpy()
    predicted_label = labels[np.argmax(prediction_value)]
    predicted_labels.append(predicted_label)
    i+=1

df = pd.DataFrame(list(zip(filenames, predicted_labels)))
df.to_csv('challenge_2_team_Tensor is not flowing.csv', index=False, header=False)  # tested submission file format passed