In [8]:
import os
import numpy as np
import librosa

def load_and_preprocess(filepath, sr=44100, n_mels=128, duration=6, hop_length=512):
    """
    Load an audio file and return a fixed-length, normalized log-Mel spectrogram.

    The clip is resampled to `sr`, truncated or zero-padded to exactly
    `duration` seconds, turned into a Mel power spectrogram, converted to dB
    relative to its own peak, and rescaled to [0, 1].

    Args:
        filepath: Path of the audio file, passed straight to `librosa.load`.
        sr: Target sampling rate in Hz.
        n_mels: Number of Mel bands (rows of the result).
        duration: Clip length in seconds; fractional values are allowed.
        hop_length: Hop between successive analysis frames, in samples.

    Returns:
        2-D numpy array with `n_mels` rows and one column per frame. 1.0 is the
        loudest bin of the clip, 0.0 is 80 dB or more below it.
    """
    # Dynamic range kept by power_to_db. The same value is used for the
    # normalization below so the result is guaranteed to stay within [0, 1].
    top_db = 80.0

    # librosa.load only truncates to `duration`; short files come back short
    # and are zero-padded below.
    y, sr = librosa.load(filepath, sr=sr, duration=duration)

    # Sample counts must be integers: without the cast a fractional duration
    # (e.g. 2.5 s) yields a float that neither np.pad nor slicing accepts.
    target_length = int(round(sr * duration))
    if len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)), mode='constant')
    else:
        y = y[:target_length]

    # Mel power spectrogram -> dB relative to the clip's own maximum (peak = 0 dB).
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max, top_db=top_db)

    # Map [-top_db, 0] dB onto [0, 1].
    mel_spec_norm = (mel_spec_db + top_db) / top_db

    return mel_spec_norm


# Smoke test on a single clip: with the default settings the result is
# 128 Mel bands by 517 time frames.
sample_path = 'samples/zelda_lullaby/sample_0.wav'
spectrogram = load_and_preprocess(sample_path)
print(spectrogram.shape)


(128, 517)


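Next, build the full dataset: each sub-directory of `samples/` is treated as one class, and every WAV file inside it is converted to a fixed-size Mel spectrogram.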

In [10]:
import glob

def load_dataset(directories, sr=44100, n_mels=128, duration=6, hop_length=512, target_width=128):
    """
    Build a spectrogram dataset from a list of class directories.

    Every `*.wav` file directly inside `directories[i]` is converted with
    `load_and_preprocess` and labelled `i`.

    Args:
        directories: Sequence of directory paths, one per class. The position
            of a directory in this sequence is its integer label.
        sr, n_mels, duration, hop_length: Forwarded to `load_and_preprocess`.
        target_width: Number of time frames kept per spectrogram. Narrower
            spectrograms are zero-padded on the right, wider ones are cropped
            to their first `target_width` frames. With the default settings
            (hop 512 at 44.1 kHz) 128 frames cover roughly the first 1.5 s of
            each clip. Pass None to keep the width unchanged.

    Returns:
        Tuple `(data, labels)` of numpy arrays: the stacked spectrograms and
        the matching integer labels. Both are empty if no WAV file is found.
    """
    data = []
    labels = []
    for label, dir_path in enumerate(directories):
        # glob returns matches in arbitrary, OS-dependent order; sorting keeps
        # the sample order (and therefore any seeded split) reproducible.
        file_list = sorted(glob.glob(os.path.join(dir_path, '*.wav')))
        for filepath in file_list:
            spec = load_and_preprocess(filepath, sr=sr, n_mels=n_mels, duration=duration, hop_length=hop_length)

            if target_width is not None:
                n_frames = spec.shape[1]
                if n_frames < target_width:
                    # Zero corresponds to the quietest level after normalization.
                    spec = np.pad(spec, ((0, 0), (0, target_width - n_frames)), mode='constant')
                else:
                    spec = spec[:, :target_width]
            data.append(spec)
            labels.append(label)
    # Explicit integer dtype so that an empty label list does not become float.
    return np.array(data), np.array(labels, dtype=int)

# Each sub-directory of samples/ is one class; its position in `directories`
# becomes the integer label. os.listdir returns entries in arbitrary order and
# also lists plain files and hidden folders (e.g. .ipynb_checkpoints), so keep
# only visible directories and sort them to make the labels reproducible.
directories = sorted(
    os.path.join('samples', d)
    for d in os.listdir('samples')
    if not d.startswith('.') and os.path.isdir(os.path.join('samples', d))
)
X, y = load_dataset(directories)

print("Dataset shape:", X.shape)  # Expected shape: (num_samples, 128, 128)
print("Labels shape:", y.shape)

Dataset shape: (40, 128, 517)
Labels shape: (40,)


In [11]:
from sklearn.model_selection import train_test_split

# Conv2D expects a trailing channel axis. The guard keeps this cell idempotent:
# without it every re-run would append another axis to X.
if X.ndim == 3:
    X = X[..., np.newaxis]  # (num_samples, 128, 128) -> (num_samples, 128, 128, 1)

# Hold out 20% of the samples; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
import tensorflow as tf
from tensorflow.keras import layers, models

input_shape = (128, 128, 1)  # Height (Mel bands), Width (time frames), Channels
num_classes = len(directories)  # one softmax unit per class directory

# Three conv/pool stages followed by a small dense classifier head.
model = models.Sequential([
    # Declare the input with an explicit Input layer; passing input_shape to
    # the first Conv2D makes Keras emit a warning.
    layers.Input(shape=input_shape),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Train for 20 epochs in mini-batches of 16; the held-out split is evaluated
# after every epoch and reported as val_loss / val_accuracy.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=16,
    epochs=20,
)

Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 282ms/step - accuracy: 0.4167 - loss: 0.6900 - val_accuracy: 0.3750 - val_loss: 0.6613
Epoch 2/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.7083 - loss: 0.6234 - val_accuracy: 0.8750 - val_loss: 0.4711
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.8750 - loss: 0.4469 - val_accuracy: 0.7500 - val_loss: 0.3618
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.8750 - loss: 0.2826 - val_accuracy: 0.7500 - val_loss: 0.4874
Epoch 5/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.9375 - loss: 0.1969 - val_accuracy: 0.8750 - val_loss: 0.3510
Epoch 6/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.9792 - loss: 0.0622 - val_accuracy: 0.8750 - val_loss: 0.1540
Epoch 7/20
[1m2/2[0m [32m━━━━━━━━━━━━