In [None]:
# Small GPU Bugfix and all imports
import warnings
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import tensorflow
# Turn on on-demand memory allocation for every visible GPU. Iterating over
# the device list (instead of indexing element 0) keeps this cell runnable on
# CPU-only machines, where the list is empty, and configures all GPUs alike.
physical_devices = tensorflow.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tensorflow.config.experimental.set_memory_growth(gpu_device, enable=True)

import tensorflow_datasets as tfds
import numpy as np
from tensorflow import  numpy_function
from tensorflow import convert_to_tensor
from tensorflow import float32
import tensorflow.keras as tf
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Convolution1D, Flatten, Dense
from tensorflow.keras.activations import softmax
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.models import load_model
import librosa
import os

In [None]:
class TensorBoardWithLR(TensorBoard):
    """TensorBoard callback that additionally logs the optimizer's learning rate.

    At the end of every epoch the current value of ``model.optimizer.lr`` is
    written into the epoch logs under the key ``'lr'`` before the logs are
    handed to the regular TensorBoard callback.
    """

    def __init__(self, log_dir, **kwargs):
        """Forward ``log_dir`` and any extra keyword arguments to ``TensorBoard``."""
        super().__init__(log_dir=log_dir, **kwargs)

    def on_epoch_end(self, epoch, logs=None):
        """Insert the evaluated learning rate into ``logs`` and delegate to the parent."""
        if not logs:
            logs = {}
        # K.eval turns the optimizer's learning-rate variable into a plain value.
        logs['lr'] = K.eval(self.model.optimizer.lr)
        super().on_epoch_end(epoch, logs)

In [None]:
# Loading Dataset
# Training split, shuffled through a 1024-element buffer.
dataset = tfds.load(name="speech_commands", split="train").shuffle(1024)
# Validation split, kept in its stored order.
dataset_valid = tfds.load(name="speech_commands", split="validation")

In [None]:
import simpleaudio as sa

# Playback rate in Hz. It matches the sr=16000 used by the feature extraction
# in this notebook and is defined here so that the cell does not depend on a
# variable that is only created by a later cell.
PLAYBACK_SAMPLE_RATE = 16000

# Play one training example through the speakers as a sanity check.
for x in dataset.take(1):
    audio = x['audio'].numpy()
    print(x['label'])
    # Normalise to the full int16 range; an all-zero clip is left untouched
    # to avoid dividing by zero.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio * (2**15 - 1) / peak
    audio = audio.astype(np.int16)
    # Arguments: buffer, 1 channel, 2 bytes per sample, sample rate.
    play_obj = sa.play_buffer(audio, 1, 2, PLAYBACK_SAMPLE_RATE)
    play_obj.wait_done()


In [None]:
# Dataset Prep Functions
@tensorflow.function
def pad_spectrogram(spec, len):
    """Left-pad a 2-D spectrogram with zero columns until it has ``len`` columns.

    Args:
        spec: 2-D array/tensor; it is concatenated with float32 zeros, so it
            is presumably float32 as well (TODO confirm against callers).
        len: target number of columns (axis 1). Note that this parameter name
            shadows the ``len`` builtin inside the function.

    Returns:
        A tensor with the same number of rows as ``spec`` and ``len`` columns,
        where the zero columns come first and ``spec`` occupies the last ones.

    NOTE(review): the caller in this notebook passes the output of
    ``power_to_db(..., ref=np.max)``, where 0 appears to be the loudest value,
    so zero padding may not represent silence. Confirm this is intended.
    """
    missing_cols = len - spec.shape[1]
    padding = tensorflow.zeros((spec.shape[0], missing_cols), dtype=tensorflow.float32)
    return tensorflow.concat([padding, spec], 1)


def to_spectrogram(x):
    """Convert one dataset example into a (log-mel spectrogram, one-hot label) pair.

    Args:
        x: dataset element providing an ``'audio'`` and a ``'label'`` entry.

    Returns:
        A tuple ``(audios, labels)`` of float32 tensors whose static shapes are
        set to ``(41, 40)`` (frames x mel bands) and ``(12,)`` respectively.
    """
    def mapping_function(audio, label):
        # Runs eagerly on NumPy values via numpy_function.
        # The signal is passed as `y=`: recent librosa releases accept the
        # audio only by keyword, and older releases accept the keyword too.
        S = librosa.feature.melspectrogram(y=audio.astype(np.float32), sr=16000, n_fft=160,
                                           hop_length=400, n_mels=40)
        S_DB = librosa.power_to_db(S, ref=np.max)
        # Clips that produce a different number of frames are padded to 41 columns.
        if S_DB.shape[1] != 41:
            S_DB = pad_spectrogram(S_DB, 41)
        # (mel, frames) -> (frames, mel); the reshape pins the expected shape.
        S_DB = tensorflow.reshape(tensorflow.transpose(S_DB), (41, 40))
        features = convert_to_tensor(S_DB)
        one_hot = convert_to_tensor(to_categorical(label, 12).astype(np.float32))
        return features, one_hot

    audios, labels = numpy_function(mapping_function, [x['audio'], x['label']], [float32, float32])
    # numpy_function loses static shape information, so restore it explicitly.
    audios.set_shape((41, 40))
    labels.set_shape((12,))
    return audios, labels

def filter_predicate(x):
    """Return a boolean tensor that is True when the example's label differs from 11."""
    return tf.backend.not_equal(x['label'], 11)

In [None]:
# Apply Operations to Dataset
# Training data: drop the examples rejected by filter_predicate, then derive
# the spectrogram/one-hot pairs into `dn`.
dataset = dataset.filter(filter_predicate)
dn = dataset.map(to_spectrogram)
# Validation data goes through the same two steps.
dataset_valid = dataset_valid.filter(filter_predicate).map(to_spectrogram)


In [None]:
# Visualize Dataset
from librosa.display import specshow
import matplotlib.pyplot as plt

for x, y in dn.take(1):
    # to_spectrogram emits (frames, mel) arrays, while specshow draws rows
    # along the y-axis and columns along the x-axis. Transpose back to
    # (mel, frames) so the 'time' and 'mel' axis labels describe the right axes.
    specshow(x.numpy().T, sr=16000, hop_length=400, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.show()

In [None]:
# Train Run prep
RUN_NAME = 'conv1d_nounknowns_largermodel'
# TensorBoard logs (including the learning rate) go to logs/<RUN_NAME>.
tbcallback = TensorBoardWithLR(log_dir='logs/' + RUN_NAME)
# Multiply the learning rate by 0.2 after 10 epochs without val_loss improvement.
lr_reduction_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
# Create the checkpoint directory if needed, without a separate existence check.
os.makedirs('saved_models', exist_ok=True)
# A checkpoint is written every epoch (save_best_only=False). The monitor is
# set to 'acc', the metric name the model is compiled with, so that enabling
# save_best_only later refers to a metric that actually exists.
checkpointcallback = ModelCheckpoint('saved_models/'+RUN_NAME + '.h5', monitor='acc', save_best_only=False)
# NOTE: both datasets are re-bound to their batched versions, so running this
# cell twice batches them twice; restart the kernel before re-running it.
dn = dn.batch(64)
dataset_valid = dataset_valid.batch(64)

In [None]:
# Model Definition
# Three stacked 1-D convolutions over the (41, 40) spectrogram, followed by
# two dense layers and a 12-way softmax output.
model = tf.Sequential([
    Convolution1D(64, 7, input_shape=(41, 40), activation='relu'),
    Convolution1D(32, 5, activation='relu'),
    Convolution1D(16, 3, activation='relu'),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(128, activation='relu'),
    Dense(12, activation=softmax),
])
model.summary()
model.compile(
    tf.optimizers.Adam(0.001),
    loss=tf.losses.categorical_crossentropy,
    metrics=['acc'],
)

In [None]:
# Model Training
# Up to 512 silent epochs; progress is observable through the TensorBoard
# callback and the per-epoch checkpoint.
model.fit(
    dn,
    epochs=512,
    validation_data=dataset_valid,
    callbacks=[tbcallback, checkpointcallback, lr_reduction_callback],
    verbose=False,
)

In [None]:
# Model Testing
# Confusion matrix indexed as [predicted class, true class].
# It is computed on the validation batches rather than on `dn`: `dn` is the
# data the model was fitted on, so scoring it would give an optimistic picture.
conf_matrix = np.zeros((12, 12), np.uint64)
for features, targets in dataset_valid:
    # verbose=0 suppresses the per-batch progress output of predict.
    preds = model.predict(features, verbose=0)
    for p, t in zip(np.argmax(preds, axis=-1), np.argmax(targets, axis=-1)):
        conf_matrix[p, t] += 1
print(conf_matrix)



In [None]:
# Load Model
# Restore the checkpoint written for this run under saved_models/.
checkpoint_path = 'saved_models/' + RUN_NAME + '.h5'
model = load_model(checkpoint_path)

In [None]:
#Record a 1 second chunk of audio
import sounddevice as sd
from scipy.io.wavfile import write
import matplotlib.pyplot as plt
from librosa.display import specshow

fs = 16000  # Sample rate
seconds = 1  # Duration of recording
print("START!")
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()
print("DONE!")
write('output.wav', fs, myrecording)  # Save as WAV file

# Same feature extraction as used for the training data. The (samples, 1)
# recording is flattened without hard-coding its length, and the signal is
# passed as `y=` because recent librosa releases accept it only by keyword.
S = librosa.feature.melspectrogram(y=myrecording.astype(np.float32).reshape(-1), sr=fs, n_fft=160,
                                   hop_length=400, n_mels=40)
S_DB = librosa.power_to_db(S, ref=np.max)

# One name per model output: the network has 12 outputs, so the list needs 12
# entries or the lookup raises IndexError for classes 10 and 11.
# NOTE(review): the last two names follow the TFDS speech_commands label
# order (silence, unknown); confirm against the dataset metadata.
labels = ["Down", "Go", "Left", "No", "Off", "On", "Right", "Stop", "Up", "Yes", "Silence", "Unknown"]
# The model expects a batch of (frames, mel) arrays, hence the transpose.
print(labels[np.argmax(model.predict(np.asarray([S_DB.T]))[0])])