# Audio Classification with TensorFlow

Train a CNN-based classifier with __TensorFlow__ on the Spoken Digit dataset

### Typical Audio Classification Approach

A typical approach to audio classification looks like this:

1. Gather audio data
2. Convert the audio to a frequency-domain representation such as MFCCs or a Mel spectrogram
3. Train a CNN on the frequency-domain features
4. Deploy the model

### Setup

Install required packages: `teal` and `pydub`

`pydub` is needed for the dataset downloaded from TFDS, and `teal` is a library of audio-specific layers for TensorFlow

If you're using Colab, these are the only two packages you need to install, because TensorFlow and TFDS are already installed on Google Colab. Otherwise, TensorFlow and TFDS must be installed as well

In [None]:
# Install teal straight from its GitHub repository, plus pydub; -q keeps pip's output quiet
!pip install git+https://github.com/am1tyadav/teal pydub -q

Restart the kernel for the installation to take effect

In [None]:
import IPython

# Ask the running kernel to shut down; the True argument requests a restart
ipython_app = IPython.Application.instance()
ipython_app.kernel.do_shutdown(True)

Import TensorFlow and Teal after the kernel restarts

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from IPython.display import Audio
import tensorflow_datasets as tfds
import teal

### Download the Dataset

In [None]:
# Load the Spoken Digit dataset through TFDS, keeping its files under ./tmp
dataset = tfds.load(
    name="spoken_digit",
    data_dir="./tmp",
)

Very few samples in this dataset are longer than 1 second, so we crop longer clips to 1 second and zero-pad shorter ones

In [None]:
SAMPLE_RATE = 8000  # samples per second
DURATION = 1  # clip length in seconds
SAMPLE_LEN = DURATION * SAMPLE_RATE  # samples per clip after crop/pad


def process_example(example):
    """Turn one dataset example into a fixed-length ``(audio, label)`` pair.

    Args:
        example: Mapping with an ``"audio"`` tensor (1-D waveform) and a
            ``"label"`` tensor, as yielded by the loaded dataset.

    Returns:
        A tuple ``(audio, label)`` where ``audio`` is a float32 tensor of
        exactly ``SAMPLE_LEN`` samples (cropped or zero-padded at the end)
        and ``label`` is the label cast to float32.
    """
    # Presumably the source audio is 16-bit PCM, so dividing by 32768 scales
    # it to roughly [-1, 1) -- TODO confirm against the dataset spec.
    audio = tf.cast(example["audio"], dtype=tf.float32) / 32768.
    label = tf.cast(example["label"], dtype=tf.float32)

    # Crop first (a no-op for clips that are already short enough) ...
    audio = audio[:SAMPLE_LEN]

    # ... then zero-pad the tail up to SAMPLE_LEN (a no-op when padding == 0).
    # Doing both unconditionally avoids data-dependent Python branching on a
    # tensor, so the function behaves the same eagerly and inside Dataset.map.
    padding = SAMPLE_LEN - tf.shape(audio)[0]
    audio = tf.pad(audio, [[0, padding]])

    # Pin the static shape so downstream batching/models see a known length.
    audio = tf.ensure_shape(audio, [SAMPLE_LEN])

    return audio, label

In [None]:
BATCH_SIZE = 8
TOTAL_EXAMPLES = len(dataset["train"])
NUM_TRAIN = int(0.7 * TOTAL_EXAMPLES)  # 70% of the examples go to training
NUM_VALID = TOTAL_EXAMPLES - NUM_TRAIN  # the remainder is used for validation

# Only the "train" split is used; it is divided into train/validation below
dataset = dataset["train"]
dataset = dataset.map(process_example)

print(f"Splitting dataset into {NUM_TRAIN} training examples and {NUM_VALID} validation examples")

# Shuffle ONCE with a fixed order. With the default
# reshuffle_each_iteration=True the dataset is reshuffled every time it is
# iterated, so take()/skip() would pick different examples on each pass and
# validation examples would leak into the training set.
dataset = dataset.shuffle(
    buffer_size=TOTAL_EXAMPLES,
    seed=42,
    reshuffle_each_iteration=False,
)

# The training subset is reshuffled on every epoch; the split itself stays fixed
train_ds = dataset.take(NUM_TRAIN).shuffle(buffer_size=NUM_TRAIN).batch(BATCH_SIZE)
valid_ds = dataset.skip(NUM_TRAIN).batch(BATCH_SIZE)

In [None]:
# Pull a single batch from the training pipeline for inspection
train_batches = iter(train_ds)
audio, label = next(train_batches)

In [None]:
# Show the labels of the whole batch, then render a player for its first clip
print(label)

first_clip = audio[0]
Audio(first_clip, rate=SAMPLE_RATE)

## Models

### Feature Model - Log Mel Spectrogram

In [None]:
# Spectrogram settings handed to teal.AudioToMelSpectrogram below
N_FFT = 1024
HOP_LEN = 256
N_MELS = 28


# Waveform -> normalized log-mel spectrogram, assembled one layer at a time
feature_model = models.Sequential(name="feature_model")
feature_model.add(layers.Input(shape=(SAMPLE_LEN, )))
feature_model.add(teal.NormalizeAudio())
feature_model.add(teal.AudioToMelSpectrogram(SAMPLE_RATE, N_FFT, HOP_LEN, N_MELS))
feature_model.add(teal.PowerToDb())
feature_model.add(teal.NormalizeSpectrum())

feature_model.summary()

### Augmentation Model

In [None]:
# Waveform-level augmentations, applied in the order listed
augmentation_layers = [
    layers.Input(shape=(SAMPLE_LEN, )),
    teal.InversePolarity(0.5),
    teal.RandomGain(0.2),
    teal.RandomNoise(0.4),
    teal.PitchShift(0.5, shift=50),
    teal.RandomGain(0.2),
]

aug_model = models.Sequential(augmentation_layers, name="augmentation_model")

aug_model.summary()

### CNN Model

In [None]:
# Small CNN classifier over 28x28 spectrograms with a 10-way softmax output.
# NOTE(review): the (28, 28) input presumably has to match feature_model's
# per-example output shape -- confirm if N_FFT/HOP_LEN/N_MELS change.
cnn = models.Sequential([
    layers.Input(shape=(28, 28)),
    # Add the trailing channel axis Conv2D expects. A built-in Reshape is used
    # instead of a Lambda wrapping a Python lambda, because Lambda layers are
    # serialized as Python bytecode and are not portable when saving/loading.
    layers.Reshape((28, 28, 1)),
    layers.Conv2D(32, 3, padding="same", strides=2, activation="relu"),
    layers.BatchNormalization(),
    layers.Conv2D(64, 3, padding="same", strides=2, activation="relu"),
    layers.BatchNormalization(),
    layers.Flatten(),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(10, activation="softmax")
], name="cnn")

cnn.summary()

### Composite Model

In [None]:
# Chain the three sub-models: augmentation -> feature extraction -> classifier
_input = layers.Input(shape=(SAMPLE_LEN, ))
_data = aug_model(_input)
_feature = feature_model(_data)
_output = cnn(_feature)

model = models.Model(inputs=_input, outputs=_output, name="composite_model")

# Compile settings gathered in one place; the model runs eagerly
compile_options = {
    "loss": "sparse_categorical_crossentropy",
    "optimizer": "adam",
    "metrics": ["accuracy"],
    "run_eagerly": True,
}
model.compile(**compile_options)

model.summary()

## Training

In [None]:
EPOCHS = 2

# NOTE(review): with patience=4 and only 2 epochs this callback cannot fire;
# it only matters if EPOCHS is raised.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(patience=4)

# Assign to _ so the returned History object is not echoed by the notebook
_ = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[lr_schedule],
)

## Saving Model for Production

You probably don't want the augmentation model/layers in your deployed model!

In [None]:
# Rebuild the model without the augmentation stage: input -> features -> CNN.
_input = model.input

# Look the sub-models up by name rather than by position in model.layers.
# Positional indices silently depend on how the composite model was wired,
# and because this cell rebinds `model`, index-based access breaks when the
# cell is re-run; name lookup works on both the composite and the prod model.
_feature = model.get_layer("feature_model")(_input)
_output = model.get_layer("cnn")(_feature)

model = models.Model(_input, _output, name="prod_model")
model.summary()

## Get Some Predictions

In [None]:
# Take one validation batch and run it through the model
valid_batches = iter(valid_ds)
examples, labels = next(valid_batches)

preds = model.predict(examples)

# Index of the highest-scoring class for each example in the batch
tf.math.argmax(preds, axis=-1)

In [None]:
# Display the labels of the validation batch fetched above, for comparison with the predictions
labels