In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import librosa
from keras.utils import to_categorical

In [69]:
num_classes = 11
input_shape = (128, 384, 3)

In [4]:
train_data = pd.read_csv("train_data.csv")

In [5]:
val_data = pd.read_csv("val_data.csv")

In [6]:
class AudioDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, csv_file, batch_size=32, num_classes=11, sample_rate=22050, duration=10.0, shuffle=True, n_mels=128):
        self.csv_file = csv_file
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.sample_rate = sample_rate
        self.duration = duration
        self.shuffle = shuffle
        self.n_mels  = n_mels
        
        # Read the CSV file
        self.data = pd.read_csv(csv_file)
        
        # Get the unique class labels
        self.classes = sorted(self.data['class_label'].unique())
        
        # Create a dictionary to map class labels to integers
        self.class_to_int = dict(zip(self.classes, range(len(self.classes))))
        
        # Shuffle the data if requested
        if self.shuffle:
            self.data = self.data.sample(frac=1).reset_index(drop=True)
    
    def __len__(self):
        # Return the number of batches
        return int(np.ceil(len(self.data) / float(self.batch_size)))
    
    def __getitem__(self, idx):
        # Get the batch of file paths and labels
        batch_data = self.data[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_data = batch_data.reset_index()
        # Initialize the arrays for the audio data and labels
        batch_x = np.zeros((len(batch_data), self.n_mels, 431,1))
        batch_y = np.zeros((len(batch_data), self.num_classes))
        
        # Load the audio files and their corresponding labels
        for i, row in batch_data.iterrows():
            file_path = row['file_path']
            class_label = row['class_label']
            
            # Load the audio file
            signal, sr = librosa.load(file_path, sr=self.sample_rate, mono=True)
            
            # Pad or truncate the signal to the desired length
            signal = librosa.util.fix_length(signal, size=self.sample_rate * self.duration)
            
            # Convert the audio file to spectrogram
            S = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=self.n_mels)
            S_dB = np.array(librosa.power_to_db(S, ref=np.max))
            S_dB = S_dB.reshape(S_dB.shape[0],S_dB.shape[1],1)
            
            # Save the audio data and label to the batch arrays
            batch_x[i, :] = S_dB
            batch_y[i, :] = to_categorical(self.class_to_int[class_label], num_classes=self.num_classes)
            

        return batch_x, batch_y

In [7]:
train_generator = AudioDataGenerator('train_data.csv', batch_size=32, num_classes=11, sample_rate=22050, duration=10, shuffle=True, n_mels=128)

In [8]:
val_generator = AudioDataGenerator('val_data.csv', batch_size=32, num_classes=11, sample_rate=22050, duration=10, shuffle=True, n_mels=128)

# Model creation

In [9]:
image_size_y=128
image_size_x = 384
patch_size = 64  # Size of the patches to be extract from the input images
num_patches = (image_size_y*image_size_x) // ((patch_size) ** 2)
print(num_patches)

12


In [24]:
# Hyperparameters

learning_rate = 0.01
weight_decay = 0.01
batch_size = 32
num_epochs = 60
image_size_y = 128  # We'll resize input images to this size
image_size_x = 384
patch_size = 64  # Size of the patches to be extract from the input images
num_patches = (image_size_y*image_size_x) // ((patch_size) ** 2)
projection_dim = 128
num_heads = 12
transformer_units = [
    projection_dim * 2,
    projection_dim,
]  # Size of the transformer layers
transformer_layers = 8
mlp_head_units = [2048, 1024]  # Size of the dense layers of the final classifier

In [45]:
# Data Augmentation

data_augmentation_resize = keras.Sequential(
    [
        layers.Resizing(image_size_y, image_size_x),
        layers.Lambda(lambda x: tf.image.grayscale_to_rgb(x)),
    ],
    name="data_augmentation",
)

In [26]:
def mlp(x, hidden_units, dropout_rate):
    for units in hidden_units:
        x = layers.Dense(units, activation=tf.nn.gelu)(x)
        x = layers.Dropout(dropout_rate)(x)
    return x

In [27]:
class Patches(layers.Layer):
    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

In [28]:
class PatchEncoder(layers.Layer):
    def __init__(self, num_patches, projection_dim):
        super().__init__()
        self.num_patches = num_patches
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )
        self.reshape_patches = layers.Reshape((num_patches, projection_dim))

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.reshape_patches(self.projection(patch)) + self.position_embedding(positions)
        return encoded

In [29]:
def create_vit_classifier():
    inputs = layers.Input(shape=input_shape)
    # Augment data.
    augmented = data_augmentation(inputs)
    # Create patches.
    patches = Patches(patch_size)(augmented)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    # Add MLP.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.2)
    # Classify outputs.
    logits = layers.Dense(num_classes)(features)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model

In [62]:
def create_vit_classifier(base_model, num_classes):
    # Freeze the pre-trained model.
    base_model.trainable = False

    inputs = layers.Input(shape=input_shape)
    # Augment data.
    augmented = data_augmentation(inputs)
    # Create patches.
    patches = Patches(patch_size)(augmented)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(augmented)
#     representation = layers.Flatten()(representation)
#     representation = layers.Dropout(0.5)(representation)

    # Create the transfer learning model.
    transfer_model = create_transfer_learning_model(base_model, num_classes)
    
    # Pass the representation through the transfer learning model.
    logits = transfer_model(representation)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model


In [63]:
# Defining metrics

from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [64]:
def create_transfer_learning_model(base_model, num_classes):
    # Freeze the pre-trained model.
    base_model.trainable = False
    
    # Get the output of the last layer of the pre-trained model.
    last_layer = base_model.layers[-1].output
    
    # Flatten the output.
    flattened = layers.Flatten()(last_layer)
    
    # Add a fully connected layer with dropout.
    x = layers.Dense(128, activation='relu')(flattened)
    x = layers.Dropout(0.5)(x)
    
    # Add a final output layer.
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    
    # Create the transfer learning model.
    model = keras.Model(inputs=base_model.inputs, outputs=outputs)
    
    return model

In [65]:
def run_experiment(model):
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=[
            'accuracy',
            f1_m,
            precision_m, 
            recall_m
        ],
    )


    history = model.fit(
        train_generator,
        batch_size=batch_size,
        epochs=num_epochs
    )

    loss, accuracy, f1_score, precision, recall = model.evaluate(val_generator)
    print('Validation accuracy:', accuracy)
    print('Validation loss:', loss)
    print('Validation f1:', f1_score)
    print('Validation precision:', precision)
    print('Validation recall:', recall)


    return history

In [66]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="-1" 

In [67]:
base_model = keras.applications.ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=input_shape
)

In [70]:
vit_classifier = create_vit_classifier(base_model,11)



ValueError: Exception encountered when calling layer "model_5" (type Functional).

Input 0 of layer "dense_114" is incompatible with the layer: expected axis -1 of input shape to have value 114688, but received input with shape (None, 98304)

Call arguments received by layer "model_5" (type Functional):
  • inputs=tf.Tensor(shape=(None, 128, 384, 3), dtype=float32)
  • training=False
  • mask=None

In [61]:
vit_classifier.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 128, 431, 1  0           []                               
                                )]                                                                
                                                                                                  
 data_augmentation (Sequential)  (None, 128, 384, No  0          ['input_2[0][0]']                
                                ne)                                                               
                                                                                                  
 patches_1 (Patches)            (None, None, 4096)   0           ['data_augmentation[0][0]']      
                                                                                            

 ormalization)                                                                                    
                                                                                                  
 multi_head_attention_11 (Multi  (None, 12, 128)     791168      ['layer_normalization_23[0][0]', 
 HeadAttention)                                                   'layer_normalization_23[0][0]'] 
                                                                                                  
 add_22 (Add)                   (None, 12, 128)      0           ['multi_head_attention_11[0][0]',
                                                                  'add_21[0][0]']                 
                                                                                                  
 layer_normalization_24 (LayerN  (None, 12, 128)     256         ['add_22[0][0]']                 
 ormalization)                                                                                    
          

                                                                                                  
 dense_34 (Dense)               (None, 12, 128)      32896       ['dropout_31[0][0]']             
                                                                                                  
 dropout_32 (Dropout)           (None, 12, 128)      0           ['dense_34[0][0]']               
                                                                                                  
 add_29 (Add)                   (None, 12, 128)      0           ['dropout_32[0][0]',             
                                                                  'add_28[0][0]']                 
                                                                                                  
 layer_normalization_31 (LayerN  (None, 12, 128)     256         ['add_29[0][0]']                 
 ormalization)                                                                                    
          

In [35]:
history = run_experiment(vit_classifier)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60

KeyboardInterrupt: 