In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Root folder of the audio dataset.
# NOTE(review): the loading cell reads from "../data" rather than this
# constant — confirm which location is intended.
DATASET_PATH = "data"

# Class names; each one is used both as a sub-folder name and as the label
# attached to every clip found in that folder.
classes = [
    "dog",
    "cat",
    "bird",
]

def load_mel_spectrogram(file_path, n_mels=128, max_len=128):
    """Load an audio file and return a fixed-width log-mel spectrogram.

    The clip is read at its native sampling rate (``sr=None``), turned into a
    mel power spectrogram and converted to decibels relative to the clip's own
    peak (``ref=np.max``), so the loudest bin is 0 dB and every other bin is
    negative.

    Args:
        file_path: Path of the audio file to read.
        n_mels: Number of mel bands, i.e. rows of the returned array.
        max_len: Number of time frames (columns) of the returned array.
            Longer clips are truncated; shorter clips are right-padded.

    Returns:
        A 2-D array of shape ``(n_mels, max_len)`` holding dB values.
    """
    y, sr = librosa.load(file_path, sr=None)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    n_frames = mel_db.shape[1]
    if n_frames >= max_len:
        # Long enough already: keep only the first max_len frames.
        return mel_db[:, :max_len]

    # Pad with the quietest level found in the clip. Zero-padding would be
    # wrong here: on this dB scale 0 is the *peak*, so zeros would look like a
    # block of maximum-energy signal instead of silence.
    # -80.0 mirrors librosa's default top_db floor and is only used when the
    # spectrogram has no frames at all.
    floor_db = mel_db.min() if mel_db.size else -80.0
    return np.pad(
        mel_db,
        ((0, 0), (0, max_len - n_frames)),
        mode="constant",
        constant_values=floor_db,
    )

In [4]:
# Build the dataset: one log-mel "image" per audio file, labelled with the
# name of the class folder it was found in.
# NOTE(review): this reads from "../data" although DATASET_PATH is "data" —
# confirm which location is intended.
spectrograms = []
labels = []

for label in classes:
    folder = os.path.join("../data", label)
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/validation split) reproducible.
    for file in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file)

        # Skip hidden files (e.g. .DS_Store) and sub-directories, which would
        # otherwise be handed to the audio loader.
        if file.startswith(".") or not os.path.isfile(file_path):
            continue

        mel = load_mel_spectrogram(file_path)
        mel = np.expand_dims(mel, axis=-1)  # add channel axis: (128, 128, 1)

        spectrograms.append(mel)
        labels.append(label)

X = np.array(spectrograms)
y = np.array(labels)

In [5]:
# Turn the string class names into integer ids; the fitted encoder is kept so
# predictions can be mapped back to names later.
label_encoder = LabelEncoder()
y_int = label_encoder.fit_transform(y)

# Hold out 20% of the samples for validation. Stratifying on the labels keeps
# the class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_val, y_train_int, y_val_int = train_test_split(
    X,
    y_int,
    stratify=y_int,
    test_size=0.2,
    random_state=42,
)

In [6]:
class PatchLayer(layers.Layer):
    """Cut a batch of images into non-overlapping square patches.

    Each patch is flattened, so the output has shape
    ``(batch, num_patches, patch_size * patch_size * channels)``.

    Args:
        patch_size: Side length, in pixels, of every square patch.
        **kwargs: Standard ``Layer`` arguments such as ``name`` or ``dtype``.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the flattened patches of ``images`` (a 4-D image batch)."""
        # Window size equal to stride plus VALID padding gives a grid of
        # non-overlapping patches; border pixels that do not fill a whole
        # patch are dropped.
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID"
        )
        # The flattened patch length is read from the static shape, while the
        # batch size is taken dynamically because it may be unknown when the
        # graph is built.
        patch_dim = patches.shape[-1]
        patches = tf.reshape(patches, [tf.shape(images)[0], -1, patch_dim])
        return patches

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

In [7]:
class PatchEmbedding(layers.Layer):
    """Project flattened patches and add a learned position embedding.

    Args:
        num_patches: Size of the position-embedding table; inputs must not
            contain more patches than this.
        projection_dim: Width of the vector each patch is projected to.
        **kwargs: Standard ``Layer`` arguments such as ``name`` or ``dtype``.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config can report how the layer was built.
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.projection = layers.Dense(projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch_inputs):
        """Return projected patches plus their position embeddings.

        ``patch_inputs`` is indexed as (batch, patch, features); the result
        has ``projection_dim`` features per patch.
        """
        # One position index per patch actually present in the input; the
        # resulting (patches, projection_dim) table broadcasts over the batch.
        positions = tf.range(start=0, limit=tf.shape(patch_inputs)[1])
        return self.projection(patch_inputs) + self.position_embedding(positions)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config