# 3. Training and testing a simple speech recognition model on a toy dataset
The input audio signals are a subset of Google's Speech Commands Dataset.

Download the input files from this [Google Drive link](https://drive.google.com/file/d/1LWM97dl8ZIr7k05kVeVKLqje1DUyJjfh/view?usp=sharing) and extract them into this directory ("part2").

Imports

In [None]:
import json
import os

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### Preprocess the input dataset
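Extracts 13 MFCC features for every frame of each audio signal. Signals shorter than 22050 samples are skipped, and longer ones are truncated to their first 22050 samples.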

Saves the labels per audio signal and MFCC feature vectors per frame in a .json file.

In [None]:
# Root folder of the raw dataset; every sub-folder name is recorded as a class label.
DATASET_PATH = "dataset"
# JSON file that receives the label mapping, labels, MFCC features, and file paths.
PREPROCESSED_DATA_PATH = "preprocessed_data.json"
# Number of audio samples used per clip: shorter clips are skipped and longer
# clips are truncated to this length.
N_SAMPLES_TO_CONSIDER = 22050

In [None]:
# MFCC extraction settings passed to librosa.feature.mfcc.
n_mfcc = 13
n_fft = 2048
hop_length = 512

# Everything collected here is written to PREPROCESSED_DATA_PATH:
#   mapping - class names; a label is an index into this list
#   labels  - one class index per kept clip
#   MFCCs   - one transposed MFCC matrix (as nested lists) per kept clip
#   files   - source path of each kept clip
preprocessed_data = {
    "mapping": [],
    "labels": [],
    "MFCCs": [],
    "files": []
}

for dir_path, dir_names, filenames in os.walk(DATASET_PATH):
    # Sorting in place makes os.walk visit sub-folders in a fixed order, so the
    # label -> index mapping is reproducible across machines and runs.
    dir_names.sort()

    # Skip the dataset root. Compare by value: `is not` tests object identity
    # and only worked because os.walk happens to yield the same str object.
    if dir_path == DATASET_PATH:
        continue

    # basename is portable, unlike splitting on "/".
    label = os.path.basename(dir_path)
    preprocessed_data["mapping"].append(label)
    # Derive the label from the mapping itself so the two can never drift
    # apart, regardless of how many directories os.walk has visited.
    label_index = len(preprocessed_data["mapping"]) - 1
    print(f"Processing audios for the word: '{label}'")

    for filename in tqdm(filenames):
        if not filename.endswith(".wav"):
            continue

        file_path = os.path.join(dir_path, filename)
        signal, sample_rate = librosa.load(file_path)

        # Clips that are too short are dropped so every example has the same
        # number of frames.
        if len(signal) < N_SAMPLES_TO_CONSIDER:
            continue

        signal = signal[:N_SAMPLES_TO_CONSIDER]
        MFCCs = librosa.feature.mfcc(
            y=signal,
            sr=sample_rate,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length
        )

        # Transposed so that each row is one frame; converted to lists because
        # NumPy arrays are not JSON serialisable.
        preprocessed_data["MFCCs"].append(MFCCs.T.tolist())
        preprocessed_data["labels"].append(label_index)
        preprocessed_data["files"].append(file_path)

with open(PREPROCESSED_DATA_PATH, "w") as f:
    json.dump(preprocessed_data, f, indent=4)


### Train a CNN-based speech recognition model

In [None]:
# Maximum number of training epochs passed to model.fit.
EPOCHS = 20
# Mini-batch size passed to model.fit.
BATCH_SIZE = 32
# Number of epochs without improvement that EarlyStopping tolerates.
PATIENCE = 5
# File the trained model is saved to and later reloaded from for inference.
MODEL_PATH = "model.h5"

In [None]:
def load_data(data_path):
    """Read the preprocessed dataset from a JSON file.

    Args:
        data_path: Path to a JSON file containing "MFCCs" and "labels" entries.

    Returns:
        A tuple (X, y) of NumPy arrays built from the "MFCCs" and "labels"
        entries, in that order.
    """
    with open(data_path, "r") as json_file:
        dataset = json.load(json_file)

    features, targets = (np.array(dataset[key]) for key in ("MFCCs", "labels"))
    return features, targets


def split_data(X, y, test_size=0.2, validation_size=0.2):
    """Split the dataset into train, validation, and test subsets.

    The test subset is carved out first; the validation subset is then taken
    from what remains. Each feature array gets a trailing axis of size 1.

    Args:
        X: Feature array.
        y: Label array aligned with X.
        test_size: Fraction of the whole dataset held out for testing.
        validation_size: Fraction of the remaining data held out for validation.

    Returns:
        X_train, y_train, X_validation, y_validation, X_test, y_test.
    """
    # First cut: hold out the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=test_size)
    # Second cut: hold out the validation set from the remainder.
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_rest, y_rest, test_size=validation_size
    )

    # Append the trailing axis to every feature array.
    X_train, X_validation, X_test = (
        features[..., np.newaxis] for features in (X_train, X_validation, X_test)
    )

    return X_train, y_train, X_validation, y_validation, X_test, y_test


def build_model(input_shape, learning_rate=0.0001, num_classes=4):
    """Build and compile the CNN used for keyword classification.

    The network stacks three Conv2D -> BatchNormalization -> MaxPooling2D
    stages, followed by a dense layer with dropout and a softmax output.

    Args:
        input_shape: Shape of a single input example (without the batch axis).
        learning_rate: Learning rate for the Adam optimiser.
        num_classes: Number of units in the softmax output layer. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        The compiled tf.keras Sequential model.
    """
    layers = tf.keras.layers
    model = tf.keras.models.Sequential()

    # Convolutional stage 1
    model.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))

    # Convolutional stage 2
    model.add(layers.Conv2D(32, (3, 3), activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))

    # Convolutional stage 3
    model.add(layers.Conv2D(32, (2, 2), activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))

    # Classifier head
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    # The dropout layer was previously constructed but never added to the
    # model, so it had no effect; it is now part of the network.
    model.add(layers.Dropout(0.3))

    model.add(layers.Dense(num_classes, activation='softmax'))

    optimiser = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # Sparse loss: labels are integer class indices, not one-hot vectors.
    model.compile(optimizer=optimiser, loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.summary()

    return model


def train_model(model, X_train, y_train, X_validation, y_validation, monitor="val_accuracy"):
    """Fit the model with early stopping.

    Training runs for at most EPOCHS epochs with batches of BATCH_SIZE and is
    stopped early once the monitored metric has not improved by at least 0.001
    for PATIENCE consecutive epochs.

    Args:
        model: Compiled Keras model to train.
        X_train: Training features.
        y_train: Training labels.
        X_validation: Validation features.
        y_validation: Validation labels.
        monitor: Metric watched by EarlyStopping. Defaults to "val_accuracy";
            the previous behaviour monitored the training "accuracy", which
            ignores the validation data and therefore cannot detect overfitting.

    Returns:
        The History object returned by model.fit.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor=monitor,
        min_delta=0.001,
        patience=PATIENCE,
    )

    history = model.fit(
        X_train,
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_validation, y_validation),
        callbacks=[early_stopping],
    )

    return history


def plot_history(history):
    """Plot accuracy and loss curves for training and validation.

    Draws two stacked panels: accuracy on top and loss below, each with the
    training curve and its "val_" counterpart.

    Args:
        history: Object whose `history` attribute maps "accuracy",
            "val_accuracy", "loss", and "val_loss" to per-epoch values.
    """
    curves = history.history
    _, (accuracy_ax, loss_ax) = plt.subplots(2, figsize=(15, 10))

    # (axis, metric key, y-axis label, legend position, panel title)
    panels = (
        (accuracy_ax, "accuracy", "Accuracy", "lower right", "Accuracy vs. epoch"),
        (loss_ax, "loss", "Loss", "upper right", "Loss vs. epoch"),
    )

    for ax, metric, y_label, legend_loc, title in panels:
        validation_metric = f"val_{metric}"
        ax.plot(curves[metric], label=metric)
        ax.plot(curves[validation_metric], label=validation_metric)
        ax.set_ylabel(y_label)
        ax.legend(loc=legend_loc)
        ax.set_title(title)

    # Only the bottom panel carries the x-axis label.
    loss_ax.set_xlabel("Epoch")

    plt.show()

In [None]:
# Load the preprocessed features and labels, then split them into
# train / validation / test subsets.
X, y = load_data(PREPROCESSED_DATA_PATH)
X_train, y_train, X_validation, y_validation, X_test, y_test = split_data(X, y)

# Shape of a single example: the two feature axes plus one trailing channel axis.
input_shape = (X_train.shape[1], X_train.shape[2], 1)
model = build_model(input_shape)

history = train_model(model, X_train, y_train, X_validation, y_validation)

# Persist the trained model so the inference class below can reload it.
model.save(MODEL_PATH)

In [None]:
# Show the accuracy and loss curves recorded during training.
plot_history(history)

In [None]:
# Evaluate on the held-out test split and report the accuracy as a percentage
# rounded to two decimals.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {round(test_acc * 100, 2)}%")

### Test the model with your own speech
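You can record your own voice as a .wav file, for example with https://voice-recorder-online.com/. The recording must contain at least 22050 samples once loaded (about one second); only its first 22050 samples are used.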

Try to pronounce one of the words that our model can recognize: "up", "down", "right", "left".

Look at what our simple speech recognition model predicts.

In [None]:
class SpeechRecognitionModel:
    """Keyword classifier backed by the trained Keras model.

    On construction the model is loaded from MODEL_PATH and the list of class
    names ("mapping") is read from PREPROCESSED_DATA_PATH.
    """

    def __init__(self):
        """Load the saved model and the class-name mapping from disk."""
        self.model = tf.keras.models.load_model(MODEL_PATH)
        with open(PREPROCESSED_DATA_PATH, "r") as f:
            self.mapping = json.load(f)["mapping"]

    @staticmethod
    def preprocess(file_path, num_mfcc=13, n_fft=2048, hop_length=512):
        """Extract MFCC features from an audio file.

        Args:
            file_path: Path of the audio file to load.
            num_mfcc: Number of MFCC coefficients to extract.
            n_fft: FFT window length in samples.
            hop_length: Hop between successive frames in samples.

        Returns:
            The transposed MFCC matrix computed from the first
            N_SAMPLES_TO_CONSIDER samples, or None when the clip contains
            fewer samples than that.
        """
        signal, sample_rate = librosa.load(file_path)
        if len(signal) < N_SAMPLES_TO_CONSIDER:
            return None

        # Truncate so the feature matrix has the same size as the training data.
        signal = signal[:N_SAMPLES_TO_CONSIDER]
        MFCCs = librosa.feature.mfcc(
            y=signal,
            sr=sample_rate,
            n_mfcc=num_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
        )
        return MFCCs.T

    def predict(self, file_path):
        """Predict the keyword spoken in an audio file.

        Args:
            file_path: Path of the audio file to classify.

        Returns:
            The class name from the mapping with the highest predicted score.

        Raises:
            ValueError: If the clip has fewer than N_SAMPLES_TO_CONSIDER
                samples and therefore cannot be turned into model input.
        """
        MFCCs = self.preprocess(file_path)
        if MFCCs is None:
            # Previously this fell through and failed with an opaque TypeError
            # when indexing None.
            raise ValueError(
                f"Audio file '{file_path}' is too short: at least "
                f"{N_SAMPLES_TO_CONSIDER} samples are required."
            )

        # Add the batch axis in front and the channel axis at the end.
        MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

        predictions = self.model.predict(MFCCs)
        predicted_index = int(np.argmax(predictions))
        predicted_keyword = self.mapping[predicted_index]

        return predicted_keyword

In [None]:
# Instantiate the recogniser; this loads the saved model and the label mapping from disk.
speech_recognition_model = SpeechRecognitionModel()

In [None]:
# Path of the recording to classify.
audio_file = "test/example1.wav"  # Replace the filepath accordingly

# As the last expression of the cell, this renders an audio player for the file.
ipd.Audio(audio_file)

In [None]:
# Classify the recording and print the predicted keyword.
prediction = speech_recognition_model.predict(audio_file)
print(f"You said {prediction}!")