Here, we build the CNN described in our reference paper by Xu and Zhou (2020). We chose to implement the model in TensorFlow / Keras, as this is the deep learning library we are most comfortable with.

In [4]:
import tensorflow as tf
import numpy as np
from typing import List, Union

## Building the model

### Define Squeeze-Excitation block

In [5]:
class SqueezeLayer(tf.keras.layers.Layer):
    """Squeeze step of a Squeeze-Excitation block: channel-wise average.

    Averages the input over its third-to-last and second-to-last axes and
    keeps the last axis, so each channel is reduced to a single value.
    """

    def __init__(self, **kwargs):
        # Forward the standard layer keyword arguments (e.g. ``name``).
        super().__init__(**kwargs)

    def call(self, input_tensor):
        """Return the mean of ``input_tensor`` over axes -3 and -2.

        ``tf.reduce_mean`` is used rather than a sum divided by the static
        shape so the layer also works when those two dimensions are unknown
        (``None``) at graph-construction time.
        """
        return tf.reduce_mean(input_tensor, axis=[-3, -2])

class SqueezeExcitationBlock(tf.keras.Model):
    """Squeeze-Excitation block that re-weights the channels of its input.

    The input is averaged per channel (squeeze), passed through two
    bias-free dense layers (excitation: ReLU hidden layer, then sigmoid),
    and the resulting per-channel weights multiply the input.

    Args:
        reduction_ratio: Fraction of the channel count used as the width of
            the hidden layer, i.e. the hidden layer has
            ``round(n_channels * reduction_ratio)`` units (at least 1).
        name: Optional model name forwarded to Keras.
    """

    def __init__(self, reduction_ratio: float, name=None):
        super().__init__(name=name)
        self.reduction_ratio = reduction_ratio

    def build(self, input_shape):
        """Create the sub-layers once the number of channels is known.

        Args:
            input_shape: Shape of the input (``TensorShape``, tuple or list).
                A tensor is also accepted, in which case its ``shape``
                attribute is used.

        Raises:
            ValueError: If the last (channel) dimension is not statically
                known.
        """
        # Keras hands `build` a shape, but tolerate a tensor as well.
        shape = tf.TensorShape(getattr(input_shape, "shape", input_shape))
        n_channels = tf.compat.dimension_value(shape[-1])
        if n_channels is None:
            raise ValueError(
                "SqueezeExcitationBlock requires a statically known number "
                "of channels (last dimension of the input)"
            )

        # A Dense layer needs at least one unit, even for tiny ratios.
        n_hidden = max(1, round(n_channels * self.reduction_ratio))

        self.squeeze_layer = SqueezeLayer()

        # hidden layer
        self.layer1 = tf.keras.layers.Dense(
            n_hidden, activation=tf.keras.activations.relu, use_bias=False
        )

        # importance weights
        self.layer2 = tf.keras.layers.Dense(
            n_channels, activation=tf.keras.activations.sigmoid, use_bias=False
        )

    def call(self, input_tensor):
        """Return ``input_tensor`` scaled channel-wise by the learned weights."""
        # squeeze
        w = self.squeeze_layer(input_tensor)

        # excitation
        w = self.layer1(w)
        w = self.layer2(w)

        # channel-wise multiplication
        return tf.einsum("...k,...ijk->...ijk", w, input_tensor)

### Define convolutional block

In the original paper, a dropout layer was added after each convolutional block

In [6]:

class ConvolutionalBlock(tf.keras.Model):
    """Convolutional block: Conv2D, then 2x2 max pooling, then dropout.

    Args:
        filters: Number of filters of the convolution.
        kernel_size: Kernel size of the convolution.
        padding: Padding mode of the convolution.
        dropout_rate: Rate passed to the dropout layer.
        name: Optional model name forwarded to Keras.
    """

    def __init__(self, filters, kernel_size=3, padding='same', dropout_rate=0.25, name=None):
        super().__init__(name=name)
        layers = tf.keras.layers
        self.conv = layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            padding=padding,
        )
        self.max_pool = layers.MaxPooling2D(pool_size=(2, 2), padding='valid')
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, input_tensor):
        """Apply convolution, pooling and dropout, in that order."""
        features = input_tensor
        for stage in (self.conv, self.max_pool, self.dropout):
            features = stage(features)
        return features

### Define the whole model

In [None]:
class SqueezeExcitationCNN(tf.keras.Model):
    """CNN alternating convolutional blocks and Squeeze-Excitation blocks.

    The input goes through ``n_blocks`` pairs of (``ConvolutionalBlock``,
    ``SqueezeExcitationBlock``), is turned into a feature vector by
    flattening or global average pooling, and is optionally classified by a
    softmax dense layer.

    Per-block hyperparameters (``filters``, ``kernel_size``, ``padding``,
    ``dropout_rate``, ``reduction_ratios``) are either a scalar, applied to
    every block, or a list/tuple with exactly ``n_blocks`` entries.

    Args:
        n_blocks: Number of (conv, SE) block pairs.
        filters: Number of convolution filters per block.
        kernel_size: Convolution kernel size per block.
        padding: Convolution padding mode per block.
        dropout_rate: Dropout rate per block.
        pooling: ``'flatten'`` or ``'gap'`` (global average pooling); how the
            last feature map is turned into a feature vector.
        return_features: If True, return the feature vector instead of the
            softmax output (no classifier layer is created).
        n_genres: Number of output classes of the classifier.
        name: Optional model name forwarded to Keras.
        reduction_ratios: Reduction ratio per SE block, passed as
            ``reduction_ratio`` to ``SqueezeExcitationBlock``.

    Raises:
        ValueError: If a per-block parameter is a list/tuple whose length is
            not ``n_blocks``, or if ``pooling`` is not a supported value.
    """

    def __init__(
        self,
        n_blocks: int = 5,
        filters: Union[List[int], tuple, int] = (32, 64, 128, 256, 128),
        kernel_size: Union[List[int], int] = 3,
        padding: Union[List[str], str] = 'same',
        dropout_rate: Union[List[float], float] = 0.25,
        pooling: str = 'flatten',  # or 'gap',
        return_features: bool = False,  # if True, returns the feature vector, not the softmax output
        n_genres: int = 10,
        name=None,
        *,
        reduction_ratios: Union[List[float], float] = 1 / 32,
    ):
        super().__init__(name=name)

        if pooling not in ('flatten', 'gap'):
            raise ValueError(
                "pooling should be 'flatten' or 'gap', got %r" % (pooling,)
            )

        self.n_blocks = n_blocks
        self.filters = self._per_block(filters, n_blocks, "filters")
        self.kernel_size = self._per_block(kernel_size, n_blocks, "kernel_size")
        self.padding = self._per_block(padding, n_blocks, "padding")
        self.dropout_rate = self._per_block(dropout_rate, n_blocks, "dropout_rate")
        self.reduction_ratios = self._per_block(
            reduction_ratios, n_blocks, "reduction_ratios"
        )
        self.pooling = pooling
        self.return_features = return_features
        self.n_genres = n_genres

        # None of the sub-layers depends on the input shape, so they are
        # created here rather than in `build`.
        # Alternating Conv and SE blocks
        self.conv_blocks = [
            ConvolutionalBlock(
                filters=self.filters[k],
                kernel_size=self.kernel_size[k],
                padding=self.padding[k],
                dropout_rate=self.dropout_rate[k],
                name=(self.name + "_ConvBlock%d" % k),
            )
            for k in range(n_blocks)
        ]
        self.se_blocks = [
            SqueezeExcitationBlock(
                reduction_ratio=self.reduction_ratios[k],
                name=(self.name + "_SE_Block%d" % k),
            )
            for k in range(n_blocks)
        ]

        # flatten or gap to feature vector
        if pooling == 'flatten':
            self.vectorize = tf.keras.layers.Flatten()
        else:
            self.vectorize = tf.keras.layers.GlobalAveragePooling2D()

        if not return_features:
            # classifier as described in the paper
            self.classifier = tf.keras.layers.Dense(
                n_genres, activation=tf.keras.activations.softmax
            )

    @staticmethod
    def _per_block(value, n_blocks, label):
        """Return ``value`` as a list with one entry per block.

        A scalar is repeated ``n_blocks`` times; a list/tuple must already
        have ``n_blocks`` entries.
        """
        if not isinstance(value, (list, tuple)):
            return [value] * n_blocks
        if len(value) != n_blocks:
            raise ValueError(
                "All parameters should be scalars or lists of length equal to "
                "the number of blocks: got %d values for '%s' but n_blocks=%d"
                % (len(value), label, n_blocks)
            )
        return list(value)

    def call(self, input_tensor):
        """Run the network and return features or class probabilities."""
        x = input_tensor

        # alternating conv and se blocks
        for conv, se in zip(self.conv_blocks, self.se_blocks):
            x = conv(x)
            x = se(x)

        # flatten or gap to feature vector
        x = self.vectorize(x)

        if self.return_features:
            return x
        return self.classifier(x)

## Hyperparameter tuning

### Model architecture hyperparameters

Depth, number of neurons, number of SE blocks (ref paper)

### Training hyperparameters

Learning rate, optimizer algorithm, ways to mitigate overfitting (regularization, dropout)
Batch norm (should it be in model architecture ?)

### Reduction ratio in SE Blocks
In the original paper, the reduction ratio $r$ is defined such that the weight matrices of an SE block have shape $C/r \times C$, where $C$ is the number of channels.
After Bayesian optimization over the search space $r \in [8, 32]$, they settle on an optimal value of 31.43 (so basically 32?).
Apparently, they use a single value of the reduction ratio for the whole network, whereas we could use a different reduction ratio for each of the 5 SE blocks.
Furthermore, with their definition, the same ratio does not have the same effect in every SE block: it reduces the number of channels to 1 in the 1st SE block, 2 in the 2nd block, 4 in the 3rd, then 8 and 4.

## Training the model