In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import layers, models
import numpy as np

In [2]:
# Load data
version = 3 # just 3 is available
ds = tfds.load(f'speech_commands:0.0.{version}')
train = ds['train']

2024-06-25 19:58:08.377605: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-06-25 19:58:08.377692: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-06-25 19:58:08.377701: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-06-25 19:58:08.377742: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-25 19:58:08.377767: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
#create mel spectrogram using tfio
import tensorflow_io as tfio

def create_mel_spectrogram(audio):
    """
    Create a mel spectrogram from an audio tensor.
    
    Args:
    audio (tf.Tensor): A 1D tensor containing the audio samples.
    
    Returns:
    tf.Tensor: A 2D tensor containing the mel spectrogram.
    """
    # audio = tf.cast(audio, tf.float32)
    # audio = tfio.audio.resample(audio, 16000, 8000)
    
    audio = tf.signal.stft(audio, frame_length=255, frame_step=128)
    audio = tf.abs(audio)
    audio = tfio.audio.melscale(audio, rate=16000, mels=128, fmin=0, fmax=8000)
    audio = tf.math.log(audio + 1e-6)
    audio = tf.transpose(audio)
    
    return audio

In [4]:
# Preprocess data
def preprocess_data(sample):
    """
    Preprocess a sample from the dataset.
    
    Args:
    sample (dict): A sample from the dataset.
    
    Returns:
    tuple: A tuple containing the mel spectrogram and the label.
    """
    audio = sample['audio']
    #convert to float32
    audio = tf.cast(audio, tf.float32)
    audio = audio[:16000]
    zero_padding = tf.zeros([16000] - tf.shape(audio), dtype=tf.float32)
    audio = tf.concat([audio, zero_padding], 0)
    label = sample['label']
    spectrogram = create_mel_spectrogram(audio)
    
    return spectrogram, label

In [5]:
# Test preprocess_data
sample = train.take(1)
sample = sample.map(preprocess_data)

In [6]:
# Create dataset
train_samples = train.take(1000)
train_data = train_samples.map(preprocess_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [7]:
print(train_data)

<_ParallelMapDataset element_spec=(TensorSpec(shape=(128, None), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>


In [8]:
# batch and prefetch
train_data = train_data.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [17]:
#evaluate input shape
for spectrogram, label in train_data.take(1):
    print(spectrogram.shape)
    print(label.shape)

(32, 128, 124)
(32,)


2024-06-25 20:04:02.086029: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [53]:
# Create model
input_shape = (128, 124, 1)  # Shape of each spectrogram sample

model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Conv2D(32, 3, activation='relu'),
    layers.Dense(10, activation='softmax')  # Adjust output shape as needed
])

In [54]:
# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [55]:
model.summary()

In [56]:
# Train the model
EPOCHS = 3
history = model.fit(train_data, epochs=EPOCHS)

Epoch 1/3


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None,), output.shape=(None, 126, None, 10)