In [1]:
import tensorflow as tf
import numpy as np
import librosa
import random

# Simulate 3 command words: "yes", "no", "stop"

In [2]:
def generate_command_data(classes=3, samples_per_class=50, max_len=100, n_mfcc=13):
    X, y = [], []
    for label in range(classes):
        for _ in range(samples_per_class):
            base_freq = 400 + label * 100  # Slightly different tone per command
            signal = np.sin(np.linspace(0, 2 * np.pi * base_freq, 16000))  # 1-second audio
            mfcc = librosa.feature.mfcc(y=signal.astype(np.float32), sr=16000, n_mfcc=n_mfcc)
            mfcc = mfcc.T[:max_len]
            if mfcc.shape[0] < max_len:
                pad = max_len - mfcc.shape[0]
                mfcc = np.pad(mfcc, ((0, pad), (0, 0)), mode='constant')
            X.append(mfcc)
            y.append(label)
    return np.array(X), tf.keras.utils.to_categorical(y, num_classes=classes)

# Generate dataset

In [3]:
X, y = generate_command_data()
X = X[..., np.newaxis]  # Add channel dim for Conv2D

# Train-test split

In [4]:
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Build CNN model

In [5]:
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=X.shape[1:]),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')  # Command class prediction
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# Compile and train

In [6]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 124ms/step - accuracy: 0.2565 - loss: 1.3751 - val_accuracy: 0.0000e+00 - val_loss: 3.3539
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.7799 - loss: 0.5235 - val_accuracy: 0.1667 - val_loss: 5.3174
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 1.0000 - loss: 0.3315 - val_accuracy: 0.1667 - val_loss: 6.6314
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 1.0000 - loss: 0.2039 - val_accuracy: 0.1667 - val_loss: 7.6894
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 1.0000 - loss: 0.1176 - val_accuracy: 0.1667 - val_loss: 8.8146
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 1.0000 - loss: 0.0670 - val_accuracy: 0.1667 - val_loss: 10.0012
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1b2206e31a0>

# Evaluate

In [7]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"🎙️ Speech Command Recognition Accuracy: {acc:.2f}")

🎙️ Speech Command Recognition Accuracy: 0.00
