In [None]:
import librosa
import numpy as np
import tensorflow.compat.v1 as tf
from IPython import embed  # noqa
from tensorflow.compat.v1.keras import backend as K
from tensorflow.compat.v1.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.compat.v1.keras.callbacks import TensorBoard

from callbacks import ConfusionMatrixCallback
from classes import get_classes
from input_data import AudioProcessor, prepare_words_list
from model import speech_model, prepare_model_settings
from utils import data_gen

# Switch TensorFlow to TF1-style behavior: the setup cell that follows builds an
# explicit tf.Session / tf.ConfigProto and hands the session to the Keras backend.
tf.disable_v2_behavior()

In [None]:
# --- Tunable configuration ---------------------------------------------------
data_dirs = ['data/train/audio']
output_representation = 'raw'
sample_rate = 16000
batch_size = 384

# --- TensorFlow session shared with the Keras backend ------------------------
# Limit this process to 70% of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)
K.set_session(sess)

# --- Vocabulary --------------------------------------------------------------
classes = get_classes(wanted_only=False, extend_reversed=False)
# Alternative kept for reference: a fixed, hand-written vocabulary.
#classes =  'sheila nine stop bed four six down bird marvin cat off right seven eight up three happy go zero on wow dog yes five one tree house two left no' # noqa
#classes = classes.split(' ')

# --- Model / feature settings ------------------------------------------------
label_count = len(prepare_words_list(classes))
model_settings = prepare_model_settings(
    label_count=label_count,
    sample_rate=sample_rate,
    clip_duration_ms=1000,
    window_size_ms=30.0,
    window_stride_ms=10.0,
    dct_coefficient_count=80,
    num_log_mel_features=60,
    output_representation=output_representation)

# --- Data pipeline -----------------------------------------------------------
ap = AudioProcessor(
    data_dirs=data_dirs,
    wanted_words=classes,
    silence_percentage=13.0,
    unknown_percentage=60.0,
    validation_percentage=10.0,
    testing_percentage=0.0,
    model_settings=model_settings,
    output_representation=output_representation)

# The two generators differ only in the split they read and in
# pseudo_frequency (0.6 for training, 0.0 for validation).
train_gen = data_gen(
    ap, sess, batch_size=batch_size, mode='training', pseudo_frequency=0.6)
val_gen = data_gen(
    ap, sess, batch_size=batch_size, mode='validation', pseudo_frequency=0.0)

# --- Model -------------------------------------------------------------------
# Raw-waveform input is sized by 'desired_samples'; every other representation
# is sized by 'fingerprint_size'.
if output_representation == 'raw':
    input_size = model_settings['desired_samples']
else:
    input_size = model_settings['fingerprint_size']

model = speech_model(
    'conv_1d_time_sliced_with_attention',
    input_size,
    num_classes=model_settings['label_count'],
    **model_settings)

In [None]:
# Show the AudioProcessor's own summary of the loaded data
# (presumably per-split / per-label counts — confirm in input_data.AudioProcessor).
ap.summary()

In [None]:
# Number of whole validation batches the confusion-matrix callback consumes.
validation_steps = ap.set_size('validation') // batch_size

# Confusion matrix over the validation generator, restricted to the wanted words.
confusion_matrix_cb = ConfusionMatrixCallback(
    val_gen, validation_steps,
    wanted_words=prepare_words_list(get_classes(wanted_only=True)),
    all_words=prepare_words_list(classes),
    label2int=ap.word_to_index)

# Halve the learning rate after 4 epochs without a validation-accuracy gain,
# never going below 1e-5.
reduce_lr_cb = ReduceLROnPlateau(
    monitor='val_categorical_accuracy', mode='max',
    factor=0.5, patience=4, verbose=1, min_lr=1e-5)

tensorboard_cb = TensorBoard(log_dir='logs_210')

# Keep only checkpoints that improve validation accuracy.
# NOTE(review): the filename embeds `loss` (training loss) after the "vl" tag —
# confirm whether `val_loss` was intended.
checkpoint_cb = ModelCheckpoint(
    'ep-{epoch:03d}-vl-{loss:.4f}.hdf5',
    save_best_only=True, monitor='val_categorical_accuracy',
    mode='max')

callbacks = [confusion_matrix_cb, reduce_lr_cb, tensorboard_cb, checkpoint_cb]

In [None]:
# Print the architecture of the model built by speech_model(...)
# (assumes it returns a Keras model — confirm in model.py).
model.summary()

In [None]:
# Train from the generator for 100 epochs; each epoch covers the number of whole
# batches in the training split.
# `callbacks` is already a list of callback objects, so it is passed as-is:
# wrapping it again (`[callbacks]`) hands Keras a list whose only element is a list.
model.fit(train_gen,
          steps_per_epoch=ap.set_size('training') // batch_size,
          epochs=100, verbose=1, callbacks=callbacks)

In [None]:
# Legacy generator-based training call with the same arguments as the
# `model.fit(...)` cell. Run only one of the two: executing both trains the
# model for a further 100 epochs.
# `callbacks` is already a list of callback objects, so it is passed as-is
# rather than wrapped in a second list.
model.fit_generator(train_gen,
                    steps_per_epoch=ap.set_size('training') // batch_size,
                    epochs=100, verbose=1, callbacks=callbacks)

In [None]:
# Evaluate on the validation generator for the number of whole batches in the
# validation split. The step count must be passed by keyword: the second
# positional parameter of Model.evaluate is `y` (targets), not `steps`.
eval_res = model.evaluate(val_gen,
                          steps=ap.set_size('validation') // batch_size)
print(eval_res)

In [None]:
# Write the trained model to "speech_model.hdf5" (path is relative to the
# notebook's working directory).
model.save("speech_model.hdf5")

In [None]:
# Full label list, built the same way as the `label_count` / `all_words` inputs
# used earlier; the prediction cell indexes it with the argmax of the model output
# (assumes list order matches the model's output order — confirm in input_data).
all_classes = prepare_words_list(classes)

In [None]:
# Load the clip at the sample rate the model was configured with (`sample_rate`).
# The clip is deliberately NOT down-sampled to 8 kHz: the model input is built
# from raw audio at `sample_rate`, so resampling would feed it a signal at the
# wrong rate and of the wrong length.
sample, _ = librosa.load("audios/check.wav", sr=sample_rate)

# Zero-pad or trim to exactly the number of samples the raw-input model expects
# (valid while output_representation == 'raw', where the model input size is
# model_settings['desired_samples']).
predictData = librosa.util.fix_length(
    sample, size=model_settings['desired_samples'])

In [None]:
# Add a leading batch axis of size 1. The sample count comes from the model
# settings rather than a hard-coded 16000, so it stays in sync with the
# configured clip length and sample rate.
prob = model.predict(
    predictData.reshape(1, model_settings['desired_samples']))

# Locate the top-scoring class once and reuse the index for both the score
# and the label.
best_index = int(np.argmax(prob[0]))
maxProb = prob[0][best_index]
probClass = all_classes[best_index]

In [None]:
# Report the winning score and its label, one per line.
print(maxProb, probClass, sep="\n")