# Transfer Learning Audio Recognizer

In [1]:
# install
!pip install tensorflow tensorflow_hub tensorflow_io



In [2]:
# upload the dataset
# Prompts for a file through google.colab's files.upload() and keeps the
# returned value in `uploaded`.
# NOTE: when a file with the same name already exists, Colab saves the new
# upload under a suffixed name (e.g. "mini_speech_commands (1).zip"), so the
# on-disk filename may differ from the name of the file that was selected.
from google.colab import files
uploaded = files.upload()

Saving mini_speech_commands.zip to mini_speech_commands (1).zip


In [3]:
# extract dataset
import io
import zipfile
import os

# Colab stores a re-uploaded file under a suffixed name (for example
# "mini_speech_commands (1).zip"), so opening the hard-coded filename can
# silently extract a stale archive. Prefer the bytes returned by the upload
# step; fall back to the on-disk file when the upload cell was skipped or
# returned no zip archive.
uploaded_files = globals().get("uploaded") or {}
uploaded_zips = [
    payload for name, payload in uploaded_files.items()
    if name.lower().endswith(".zip")
]
archive_source = io.BytesIO(uploaded_zips[-1]) if uploaded_zips else "mini_speech_commands.zip"

with zipfile.ZipFile(archive_source, 'r') as zip_ref:
    zip_ref.extractall(".")

# Directory name expected inside the archive; later cells read from it.
data_dir = "mini_speech_commands"
print("Dataset extracted to:", data_dir)

Dataset extracted to: mini_speech_commands


In [4]:
import os
import pathlib
import random
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models

caused by: ['/usr/local/lib/python3.11/dist-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl8str_util8EndsWithESt17basic_string_viewIcSt11char_traitsIcEES4_']
caused by: ['/usr/local/lib/python3.11/dist-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZN3tsl8str_util9LowercaseB5cxx11ESt17basic_string_viewIcSt11char_traitsIcEE']


In [5]:
# load YAMNet model from TF Hub
import csv

yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

# load YAMNet class map
class_map_path = tf.keras.utils.get_file(
    'yamnet_class_map.csv',
    'https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv'
)

# Parse with the csv module rather than str.split(','): a quoted field that
# itself contains a comma would otherwise be cut short. The `with` block also
# closes the file handle, which the bare open(...).readlines() never did.
with open(class_map_path, newline='') as class_map_file:
    class_map_rows = csv.reader(class_map_file)
    next(class_map_rows, None)  # skip the header row
    # The third column holds the class name; blank rows are ignored.
    yamnet_classes = [row[2].strip() for row in class_map_rows if row]

In [6]:
# load a WAV file as a single-channel waveform
def load_wav_file(file_path):
    """Read and decode a WAV file.

    Args:
        file_path: Path of the WAV file, handed to tf.io.read_file.

    Returns:
        A (waveform, sample_rate) pair: the audio decoded with
        desired_channels=1 and its trailing channel axis squeezed away, plus
        the sample rate reported by tf.audio.decode_wav.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded_audio, rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    return tf.squeeze(decoded_audio, axis=-1), rate

In [7]:
# label mapping
# Each class is a sub-directory of data_dir. Keeping only directories drops
# README.md and any other stray file, and sorting fixes the label -> index
# assignment so it does not depend on the filesystem's listing order.
commands = np.array(sorted(
    entry for entry in tf.io.gfile.listdir(data_dir)
    if tf.io.gfile.isdir(os.path.join(data_dir, entry))
))
label_to_index = {label: idx for idx, label in enumerate(commands)}

In [8]:
# extract embeddings
# X collects one averaged YAMNet embedding per clip; y collects the matching
# integer label. Both are reset here so re-running the cell does not duplicate
# samples.
X = []
y = []

YAMNET_SAMPLE_RATE = 16000  # rate every clip is brought to before YAMNet
dataset_root = pathlib.Path(data_dir)  # loop-invariant, built once

for label in tqdm(commands):
    # Named `wav_paths` (not `files`) so the google.colab `files` module
    # imported by the upload cell is not shadowed; sorted so the order of
    # X / y is reproducible across runs.
    wav_paths = sorted(tf.io.gfile.glob(str(dataset_root / label / '*.wav')))
    for f in wav_paths:
        try:
            waveform, sr = load_wav_file(f)
            if sr != YAMNET_SAMPLE_RATE:
                # tf.audio.decode_wav reports the rate as int32; the TensorFlow
                # YAMNet tutorial casts it to int64 before tfio.audio.resample.
                # Without the cast the call fails and the except below would
                # silently drop every clip that needs resampling.
                waveform = tfio.audio.resample(
                    waveform,
                    rate_in=tf.cast(sr, tf.int64),
                    rate_out=YAMNET_SAMPLE_RATE,
                )
            # run YAMNet; only the embeddings output is used
            _, embeddings, _ = yamnet_model(waveform)
            # average over axis 0 to get a single vector per clip
            mean_embedding = tf.reduce_mean(embeddings, axis=0)
            X.append(mean_embedding.numpy())
            y.append(label_to_index[label])
        except Exception as e:
            # best effort: report the failing file and carry on with the rest
            print(f"Error processing {f}: {e}")

100%|██████████| 8/8 [00:38<00:00,  4.75s/it]


In [9]:
# prepare train/test splits
# Convert the collected Python lists into arrays before splitting.
X, y = np.array(X), np.array(y)

n_samples = len(X)
print("Total samples:", n_samples)
if n_samples == 0:
    # Nothing was extracted, so there is nothing to split or train on.
    raise RuntimeError("No embeddings extracted. Please check your audio files.")

# Hold out 20% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Total samples: 8000


In [10]:
# build classifier
# A small dense head over 1024-wide input vectors, ending in one softmax unit
# per entry of `commands`.
classifier_layers = [
    layers.Input(shape=(1024,)),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(len(commands), activation='softmax'),
]
model = models.Sequential(classifier_layers)

In [11]:
# compile the model
# Adam optimizer, sparse categorical cross-entropy loss, accuracy as the
# tracked metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# print the summary of the compiled classifier
model.summary()

In [13]:
# train the model
# validation_split=0.2 asks fit() to hold back a fifth of the training data
# for the per-epoch validation metrics; the returned history is kept.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
)

Epoch 1/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.2272 - loss: 2.0500 - val_accuracy: 0.3836 - val_loss: 1.7217
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3881 - loss: 1.6975 - val_accuracy: 0.4109 - val_loss: 1.5921
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4317 - loss: 1.5540 - val_accuracy: 0.4492 - val_loss: 1.5105
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4609 - loss: 1.4916 - val_accuracy: 0.4570 - val_loss: 1.4749
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4933 - loss: 1.4079 - val_accuracy: 0.4609 - val_loss: 1.4766
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5071 - loss: 1.3467 - val_accuracy: 0.4695 - val_loss: 1.4177
Epoch 7/10
[1m160/160[0m 

In [14]:
# evaluate metrics
# Score the trained classifier on the held-out test split; the result unpacks
# into the loss and the accuracy.
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print(f"Test Accuracy: {acc:.4f}")

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5077 - loss: 1.3659
Test Accuracy: 0.4881


In [15]:
# predict results
# Invert label_to_index so a class index maps back to its label.
idx_to_label = {position: name for name, position in label_to_index.items()}

# Slice (rather than index) so the single sample keeps its leading batch axis.
sample = X_test[:1]
pred = model.predict(sample)

# Index of the highest score in the prediction.
best_idx = np.argmax(pred)
pred_label = idx_to_label[best_idx]

print("Predicted label:", pred_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
Predicted label: go
