In [15]:
import librosa
import numpy as np
import os

In [16]:
positive_dataset = os.listdir(os.path.join('..', 'samples', 'positive'))
negative_dataset = os.listdir(os.path.join('..', 'samples', 'negative'))

In [17]:
SAMPLE_RATE = 16000
DURATION = 1.5

In [18]:
def preprocess_dataset():
    print("Processing dataset...")
    all_mfccs = []
    all_labels = []

    expected_samples = int(SAMPLE_RATE * DURATION)

    for label, dataset in enumerate([negative_dataset, positive_dataset]):
        for index, file_name in enumerate(dataset):
            file_path = os.path.join('..', 'samples', 'positive' if label == 1 else 'negative', file_name)

            try:
                signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
                if len(signal) > expected_samples:
                        signal = signal[:expected_samples]
                elif len(signal) < expected_samples:
                        signal = np.pad(signal, (0, expected_samples - len(signal)), 'constant')

                mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)

                mfcc = mfcc.T

                all_mfccs.append(mfcc)
                all_labels.append(label)

                print(f'Processed data {index}:{file_name} success')
            except Exception as e:
                print(f'Failed to process {file_name}: {e}')

    return np.array(all_mfccs), np.array(all_labels)

In [19]:
X_data, y_data = preprocess_dataset()

print(f"Shape of X_data (MFCCs): {X_data.shape}")
print(f"Shape of y_data (labels): {y_data.shape}")

Processing dataset...
Processed data 0:90_sample.wav success
Processed data 1:6_sample.wav success
Processed data 2:165_sample.wav success
Processed data 3:399_sample.wav success
Processed data 4:422_sample.wav success
Processed data 5:79_sample.wav success
Processed data 6:349_sample.wav success
Processed data 7:74_sample.wav success
Processed data 8:324_sample.wav success
Processed data 9:125_sample.wav success
Processed data 10:332_sample.wav success
Processed data 11:467_sample.wav success
Processed data 12:118_sample.wav success
Processed data 13:232_sample.wav success
Processed data 14:459_sample.wav success
Processed data 15:242_sample.wav success
Processed data 16:497_sample.wav success
Processed data 17:114_sample.wav success
Processed data 18:237_sample.wav success
Processed data 19:412_sample.wav success
Processed data 20:4_sample.wav success
Processed data 21:476_sample.wav success
Processed data 22:266_sample.wav success
Processed data 23:222_sample.wav success
Processed d

In [20]:
X_data = X_data[..., np.newaxis]

In [21]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

2025-09-05 00:07:44.981536: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42, stratify=y_data
)

Data split into training and testing sets

In [23]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (702, 47, 13, 1)
X_test shape: (176, 47, 13, 1)


In [24]:
input_shape = (X_train.shape[1], X_train.shape[2], 1)

In [25]:
model = models.Sequential([
    # Input Layer
    layers.Input(shape=input_shape),

    # First Convolutional Block
    layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Second Convolutional Block
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Flatten the features to feed into the dense layer
    layers.Flatten(),

    # Dense Layer for classification
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5), # Dropout helps prevent overfitting

    # Output Layer
    # Sigmoid is used for binary (2-class) classification
    layers.Dense(1, activation='sigmoid')
])

In [26]:
model.summary()

In [27]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy', # Perfect for Yes/No (1/0) classification
    metrics=['accuracy']
)

Model Training

In [28]:
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test)
)

Epoch 1/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 0.6652 - loss: 1.8361 - val_accuracy: 0.8807 - val_loss: 0.3795
Epoch 2/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.8063 - loss: 0.4705 - val_accuracy: 0.8920 - val_loss: 0.3124
Epoch 3/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8618 - loss: 0.3483 - val_accuracy: 0.8807 - val_loss: 0.2771
Epoch 4/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.8960 - loss: 0.2878 - val_accuracy: 0.9148 - val_loss: 0.1791
Epoch 5/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9117 - loss: 0.2529 - val_accuracy: 0.9432 - val_loss: 0.1418
Epoch 6/20
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9245 - loss: 0.2094 - val_accuracy: 0.9318 - val_loss: 0.1415
Epoch 7/20
[1m22/22[0m [32m━━━━

Training Complete

In [30]:
model.save('wake_word.keras')