OM **IPYNB**


---



#Imports

In [0]:
!pip install Keras==2.2.0
!pip install pandas==0.22.0
!pip install pandas-ml==0.5.0
!pip install tensorflow>=1.14.0
!pip install tensorflow-gpu>=1.14.0
!pip install wget==3.2


In [4]:
!pip install Keras==2.2.0

Collecting Keras==2.2.0
  Using cached https://files.pythonhosted.org/packages/68/12/4cabc5c01451eb3b413d19ea151f36e33026fc0efb932bf51bcaf54acbf5/Keras-2.2.0-py2.py3-none-any.whl
Collecting keras-applications==1.0.2
  Using cached https://files.pythonhosted.org/packages/e2/60/c557075e586e968d7a9c314aa38c236b37cb3ee6b37e8d57152b1a5e0b47/Keras_Applications-1.0.2-py2.py3-none-any.whl
Collecting keras-preprocessing==1.0.1
  Using cached https://files.pythonhosted.org/packages/f8/33/275506afe1d96b221f66f95adba94d1b73f6b6087cfb6132a5655b6fe338/Keras_Preprocessing-1.0.1-py2.py3-none-any.whl
[31mERROR: tensorflow 2.2.0 has requirement keras-preprocessing>=1.1.0, but you'll have keras-preprocessing 1.0.1 which is incompatible.[0m
[31mERROR: tensorflow-gpu 2.2.0 has requirement keras-preprocessing>=1.1.0, but you'll have keras-preprocessing 1.0.1 which is incompatible.[0m
Installing collected packages: keras-applications, keras-preprocessing, Keras
  Found existing installation: Keras-Applic

In [1]:
import keras as pdml

print(pdml.__version__)

Using TensorFlow backend.


2.2.0


#Download Dataset

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import wget
import tarfile

from shutil import rmtree

DATASET_URL = 'http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz'
ARCHIVE = os.path.basename(DATASET_URL)

wget.download(DATASET_URL)

if os.path.exists('data'):
  rmtree('data')

os.makedirs('data/train')

with tarfile.open(ARCHIVE, 'r:gz') as tar:
  tar.extractall(path='data/train')

os.remove(ARCHIVE)


#Callbacks

In [0]:
import numpy as np
import sklearn.impute
from sklearn.metrics import confusion_matrix
#from pandas_ml import ConfusionMatrix
from keras.callbacks import Callback


def log_loss(y_true, y_pred, eps=1e-12):
  y_pred = np.clip(y_pred, eps, 1. - eps)
  ce = -(np.sum(y_true * np.log(y_pred), axis=1))
  mce = ce.mean()
  return mce


class ConfusionMatrixCallback(Callback):

  def __init__(self, validation_data, validation_steps, wanted_words, all_words,
               label2int):
    self.validation_data = validation_data
    self.validation_steps = validation_steps
    self.wanted_words = wanted_words
    self.all_words = all_words
    self.label2int = label2int
    self.int2label = {v: k for k, v in label2int.items()}
    with open('confusion_matrix.txt', 'w'):
      pass
    with open('wanted_confusion_matrix.txt', 'w'):
      pass

  def accuracies(self, confusion_val):
    accuracies = []
    for i in range(confusion_val.shape[0]):
      num = confusion_val[i, :].sum()
      if num:
        accuracies.append(confusion_val[i, i] / num)
      else:
        accuracies.append(0.0)
    accuracies = np.float32(accuracies)
    return accuracies

  def accuracy(self, confusion_val):
    num_correct = 0
    for i in range(confusion_val.shape[0]):
      num_correct += confusion_val[i, i]
    accuracy = float(num_correct) / confusion_val.sum()
    return accuracy

  def on_epoch_end(self, epoch, logs=None):
    y_true, y_pred = [], []
    for i in range(self.validation_steps):
      X_batch, y_true_batch = next(self.validation_data)
      y_pred_batch = self.model.predict(X_batch)

      y_true.extend(y_true_batch)
      y_pred.extend(y_pred_batch)

    y_true = np.float32(y_true)
    y_pred = np.float32(y_pred)
    val_loss = log_loss(y_true, y_pred)
    # map integer labels to strings
    y_true = list(y_true.argmax(axis=-1))
    y_pred = list(y_pred.argmax(axis=-1))
    y_true = [self.int2label[y] for y in y_true]
    y_pred = [self.int2label[y] for y in y_pred]
    confusion = ConfusionMatrix(y_true, y_pred)
    accs = self.accuracies(confusion._df_confusion.values)
    acc = self.accuracy(confusion._df_confusion.values)
    # same for wanted words
    y_true = [y if y in self.wanted_words else '_unknown_' for y in y_true]
    y_pred = [y if y in self.wanted_words else '_unknown_' for y in y_pred]
    wanted_words_confusion = ConfusionMatrix(y_true, y_pred)
    wanted_accs = self.accuracies(wanted_words_confusion._df_confusion.values)
    acc_line = ('\n[%03d]: val_categorical_accuracy: %.2f, '
                'val_mean_categorical_accuracy_wanted: %.2f') % (
                    epoch, acc, wanted_accs.mean())  # noqa
    with open('confusion_matrix.txt', 'a') as f:
      f.write('%s\n' % acc_line)
      f.write(confusion.to_dataframe().to_string())

    with open('wanted_confusion_matrix.txt', 'a') as f:
      f.write('%s\n' % acc_line)
      f.write(wanted_words_confusion.to_dataframe().to_string())

    logs['val_loss'] = val_loss
    logs['val_categorical_accuracy'] = acc
    logs['val_mean_categorical_accuracy_all'] = accs.mean()
    logs['val_mean_categorical_accuracy_wanted'] = wanted_accs.mean()


#Utils

In [0]:
import tensorflow.compat.v1 as tf


def data_gen(audio_processor,
             sess,
             batch_size=128,
             background_frequency=0.3,
             background_volume_range=0.15,
             foreground_frequency=0.3,
             foreground_volume_range=0.15,
             time_shift_frequency=0.3,
             time_shift_range=[-500, 0],
             mode='validation',
             flip_frequency=0.0,
             silence_volume_range=0.3):
  ep_count = 0
  offset = 0
  if mode != 'training':
    background_frequency = 0.0
    background_volume_range = 0.0
    foreground_frequency = 0.0
    foreground_volume_range = 0.0
    time_shift_frequency = 0.0
    time_shift_range = [0, 0]
    flip_frequency = 0.0
    # silence_volume_range: stays the same for validation
  while True:
    X, y = audio_processor.get_data(
        how_many=batch_size,
        offset=0 if mode == 'training' else offset,
        background_frequency=background_frequency,
        background_volume_range=background_volume_range,
        foreground_frequency=foreground_frequency,
        foreground_volume_range=foreground_volume_range,
        time_shift_frequency=time_shift_frequency,
        time_shift_range=time_shift_range,
        mode=mode,
        sess=sess,
        flip_frequency=flip_frequency,
        silence_volume_range=silence_volume_range)
    offset += batch_size
    if offset > audio_processor.set_size(mode) - batch_size:
      offset = 0
      print('\n[Ep:%03d: %s-mode]' % (ep_count, mode))
      ep_count += 1
    yield X, y


def tf_roll(a, shift, a_len=16000):
  # https://stackoverflow.com/questions/42651714/vector-shift-roll-in-tensorflow
  def roll_left(a, shift, a_len):
    shift %= a_len
    rolled = tf.concat([a[a_len - shift:, :], a[:a_len - shift, :]], axis=0)
    return rolled

  def roll_right(a, shift, a_len):
    shift = -shift
    shift %= a_len
    rolled = tf.concat([a[shift:, :], a[:shift, :]], axis=0)
    return rolled

  # https://stackoverflow.com/questions/35833011/how-to-add-if-condition-in-a-tensorflow-graph
  return tf.cond(
      tf.greater_equal(shift, 0),
      true_fn=lambda: roll_left(a, shift, a_len),
      false_fn=lambda: roll_right(a, shift, a_len))


#Generator

In [0]:
import hashlib
import math
import os.path
import random
import re
import sys

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf

MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
SILENCE_LABEL = '_silence_'
SILENCE_INDEX = 0
UNKNOWN_WORD_LABEL = '_unknown_'
UNKNOWN_WORD_INDEX = 1
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
RANDOM_SEED = 59185


def prepare_words_list(wanted_words):
  """Prepends common tokens to the custom word list."""
  return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words


def which_set(filename, validation_percentage, testing_percentage):
  """Determines which data partition the file should belong to."""
  dir_name = os.path.basename(os.path.dirname(filename))
  if dir_name == 'unknown_unknown':
    return 'training'

  base_name = os.path.basename(filename)
  hash_name = re.sub(r'_nohash_.*$', '', base_name)

  hash_name_hashed = hashlib.sha1(tf.compat.as_bytes(hash_name)).hexdigest()
  percentage_hash = ((int(hash_name_hashed, 16) % (MAX_NUM_WAVS_PER_CLASS + 1))
                     * (100.0 / MAX_NUM_WAVS_PER_CLASS))
  if percentage_hash < validation_percentage:
    result = 'validation'
  elif percentage_hash < (testing_percentage + validation_percentage):
    result = 'testing'
  else:
    result = 'training'
  return result


def load_wav_file(filename):
  """Loads an audio file and returns a float PCM-encoded array of samples."""
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = tf.io.read_file(wav_filename_placeholder)
    wav_decoder = tf.audio.decode_wav(wav_loader, desired_channels=1)
    return sess.run(
        wav_decoder, feed_dict={
            wav_filename_placeholder: filename
        }).audio.flatten()


def save_wav_file(filename, wav_data, sample_rate):
  """Saves audio sample data to a .wav audio file."""
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    sample_rate_placeholder = tf.placeholder(tf.int32, [])
    wav_data_placeholder = tf.placeholder(tf.float32, [None, 1])
    wav_encoder = tf.audio.encode_wav(wav_data_placeholder,
                                      sample_rate_placeholder)
    wav_saver = tf.io.write_file(wav_filename_placeholder, wav_encoder)
    sess.run(
        wav_saver,
        feed_dict={
            wav_filename_placeholder: filename,
            sample_rate_placeholder: sample_rate,
            wav_data_placeholder: np.reshape(wav_data, (-1, 1))
        })


class AudioProcessor(object):
  """Handles loading, partitioning, and preparing audio training data."""

  def __init__(self,
               data_dirs,
               silence_percentage,
               unknown_percentage,
               wanted_words,
               validation_percentage,
               testing_percentage,
               model_settings,
               output_representation=False):
    self.data_dirs = data_dirs
    assert output_representation in {'raw', 'spec', 'mfcc', 'mfcc_and_raw'}
    self.output_representation = output_representation
    self.model_settings = model_settings
    for data_dir in self.data_dirs:
      self.maybe_download_and_extract_dataset(data_dir)
    self.prepare_data_index(silence_percentage, unknown_percentage,
                            wanted_words, validation_percentage,
                            testing_percentage)
    self.prepare_background_data()
    self.prepare_processing_graph(model_settings)

  def maybe_download_and_extract_dataset(self, data_dir):
    if not os.path.exists(data_dir):
      print('Please download the dataset!')
      sys.exit(0)

  def prepare_data_index(self, silence_percentage, unknown_percentage,
                         wanted_words, validation_percentage,
                         testing_percentage):
    """Prepares a list of the samples organized by set and label."""
    random.seed(RANDOM_SEED)
    wanted_words_index = {}
    for index, wanted_word in enumerate(wanted_words):
      wanted_words_index[wanted_word] = index + 2
    self.data_index = {'validation': [], 'testing': [], 'training': []}
    unknown_index = {'validation': [], 'testing': [], 'training': []}
    all_words = {}
    # Look through all the subfolders to find audio samples
    for data_dir in self.data_dirs:
      search_path = os.path.join(data_dir, '*', '*.wav')
      for wav_path in tf.io.gfile.glob(search_path):
        word = re.search('.*/([^/]+)/.*.wav', wav_path).group(1).lower()
        # Treat the '_background_noise_' folder as a special case,
        # since we expect it to contain long audio samples we mix in
        # to improve training.
        if word == BACKGROUND_NOISE_DIR_NAME:
          continue
        all_words[word] = True
        set_index = which_set(wav_path, validation_percentage,
                              testing_percentage)
        # If it's a known class, store its detail, otherwise add it to the list
        # we'll use to train the unknown label.
        if word in wanted_words_index:
          self.data_index[set_index].append({'label': word, 'file': wav_path})
        else:
          unknown_index[set_index].append({'label': word, 'file': wav_path})
      if not all_words:
        raise Exception('No .wavs found at ' + search_path)
      for index, wanted_word in enumerate(wanted_words):
        if wanted_word not in all_words:
          raise Exception('Expected to find ' + wanted_word +
                          ' in labels but only found ' +
                          ', '.join(all_words.keys()))
    # We need an arbitrary file to load as the input for the silence samples.
    # It's multiplied by zero later, so the content doesn't matter.
    silence_wav_path = self.data_index['training'][0]['file']
    for set_index in ['validation', 'testing', 'training']:
      set_size = len(self.data_index[set_index])
      silence_size = int(math.ceil(set_size * silence_percentage / 100))
      for _ in range(silence_size):
        self.data_index[set_index].append({
            'label': SILENCE_LABEL,
            'file': silence_wav_path
        })
      # Pick some unknowns to add to each partition of the data set.
      random.shuffle(unknown_index[set_index])
      unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
      self.data_index[set_index].extend(unknown_index[set_index][:unknown_size])
    # Make sure the ordering is random.
    for set_index in ['validation', 'testing', 'training']:
      # not really needed since the indices are chosen by random
      random.shuffle(self.data_index[set_index])
    # Prepare the rest of the result data structure.
    self.words_list = prepare_words_list(wanted_words)
    self.word_to_index = {}
    for word in all_words:
      if word in wanted_words_index:
        self.word_to_index[word] = wanted_words_index[word]
      else:
        self.word_to_index[word] = UNKNOWN_WORD_INDEX
    self.word_to_index[SILENCE_LABEL] = SILENCE_INDEX

  def prepare_background_data(self):
    """Searches a folder for background noise audio and loads it into memory."""
    self.background_data = []
    background_dir = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
      return self.background_data
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [])
      wav_loader = tf.io.read_file(wav_filename_placeholder)
      wav_decoder = tf.audio.decode_wav(wav_loader, desired_channels=1)
      search_path = os.path.join(self.data_dirs[0], BACKGROUND_NOISE_DIR_NAME,
                                 '*.wav')
      for wav_path in tf.io.gfile.glob(search_path):
        wav_data = sess.run(
            wav_decoder, feed_dict={
                wav_filename_placeholder: wav_path
            }).audio.flatten()
        self.background_data.append(wav_data)
      if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)

  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions."""
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name='filename')
    wav_loader = tf.io.read_file(self.wav_filename_placeholder_)
    wav_decoder = tf.audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='foreground_volme')
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
    shifted_foreground = tf_roll(scaled_foreground,
                                 self.time_shift_placeholder_)
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [desired_samples, 1], name='background_data')
    self.background_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='background_volume')
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, shifted_foreground)
    # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
    self.background_clamp_ = background_add
    self.background_clamp_ = tf.reshape(self.background_clamp_,
                                        (1, model_settings['desired_samples']))
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    stfts = tf.signal.stft(
        self.background_clamp_,
        frame_length=model_settings['window_size_samples'],
        frame_step=model_settings['window_stride_samples'],
        fft_length=None)
    self.spectrogram_ = tf.abs(stfts)
    num_spectrogram_bins = self.spectrogram_.shape[-1].value
    lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
    linear_to_mel_weight_matrix = \
        tf.signal.linear_to_mel_weight_matrix(
            model_settings['dct_coefficient_count'],
            num_spectrogram_bins, model_settings['sample_rate'],
            lower_edge_hertz, upper_edge_hertz)
    mel_spectrograms = tf.tensordot(self.spectrogram_,
                                    linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))
    log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
    self.mfcc_ = tf.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[:, :, :
                              model_settings['num_log_mel_features']]  # :13

  def set_size(self, mode):
    """Calculates the number of samples in the dataset partition."""
    return len(self.data_index[mode])

  def get_data(self,
               how_many,
               offset,
               background_frequency,
               background_volume_range,
               foreground_frequency,
               foreground_volume_range,
               time_shift_frequency,
               time_shift_range,
               mode,
               sess,
               flip_frequency=0.0,
               silence_volume_range=0.0):
    """Gather samples from the data set, applying transformations as needed."""
    # Pick one of the partitions to choose samples from.
    model_settings = self.model_settings
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    # Data and labels will be populated and returned.
    if self.output_representation == 'raw':
      data_dim = model_settings['desired_samples']
    elif self.output_representation == 'spec':
      data_dim = model_settings['spectrogram_length'] * model_settings[
          'spectrogram_frequencies']
    elif self.output_representation == 'mfcc':
      data_dim = model_settings['spectrogram_length'] * \
                 model_settings['num_log_mel_features']
    elif self.output_representation == 'mfcc_and_raw':
      data_dim = model_settings['spectrogram_length'] * \
                 model_settings['num_log_mel_features']
      raw_data = np.zeros((sample_count, model_settings['desired_samples']))

    data = np.zeros((sample_count, data_dim))
    labels = np.zeros((sample_count, model_settings['label_count']))
    desired_samples = model_settings['desired_samples']
    use_background = self.background_data and (mode == 'training')
    pick_deterministically = (mode != 'training')
    # Use the processing graph we created earlier to repeatedly to generate the
    # final output sample data we'll use in training.
    for i in xrange(offset, offset + sample_count):
      # Pick which audio sample to use.
      if how_many == -1 or pick_deterministically:
        sample_index = i
        sample = candidates[sample_index]
      else:
        sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]

      # If we're time shifting, set up the offset for this sample.
      if np.random.uniform(0.0, 1.0) < time_shift_frequency:
        time_shift = np.random.randint(time_shift_range[0],
                                       time_shift_range[1] + 1)
      else:
        time_shift = 0
      input_dict = {
          self.wav_filename_placeholder_: sample['file'],
          self.time_shift_placeholder_: time_shift,
      }
      # Choose a section of background noise to mix in.
      if use_background:
        background_index = np.random.randint(len(self.background_data))
        background_samples = self.background_data[background_index]
        background_offset = np.random.randint(
            0,
            len(background_samples) - model_settings['desired_samples'])
        background_clipped = background_samples[background_offset:(
            background_offset + desired_samples)]
        background_reshaped = background_clipped.reshape([desired_samples, 1])
        if np.random.uniform(0, 1) < background_frequency:
          background_volume = np.random.uniform(0, background_volume_range)
        else:
          background_volume = 0.0
          # silence class with all zeros is boring!
          if sample['label'] == SILENCE_LABEL and \
                  np.random.uniform(0, 1) < 0.9:
            background_volume = np.random.uniform(0, silence_volume_range)
      else:
        background_reshaped = np.zeros([desired_samples, 1])
        background_volume = 0.0
      input_dict[self.background_data_placeholder_] = background_reshaped
      input_dict[self.background_volume_placeholder_] = background_volume
      # If we want silence, mute out the main sample but leave the background.
      if sample['label'] == SILENCE_LABEL:
        input_dict[self.foreground_volume_placeholder_] = 0.0
      else:
        # Turn it up or down
        foreground_volume = 1.0
        if np.random.uniform(0, 1) < foreground_frequency:
          foreground_volume = 1.0 + np.random.uniform(-foreground_volume_range,
                                                      foreground_volume_range)
        # flip sign
        if np.random.uniform(0, 1) < flip_frequency:
          foreground_volume *= -1.0
        input_dict[self.foreground_volume_placeholder_] = foreground_volume

      # Run the graph to produce the output audio.
      if self.output_representation == 'raw':
        data[i - offset, :] = sess.run(
            self.background_clamp_, feed_dict=input_dict).flatten()
      elif self.output_representation == 'spec':
        data[i - offset, :] = sess.run(
            self.spectrogram_, feed_dict=input_dict).flatten()
      elif self.output_representation == 'mfcc':
        data[i - offset, :] = sess.run(
            self.mfcc_, feed_dict=input_dict).flatten()
      elif self.output_representation == 'mfcc_and_raw':
        raw_val, mfcc_val = sess.run([self.background_clamp_, self.mfcc_],
                                     feed_dict=input_dict)
        data[i - offset, :] = mfcc_val.flatten()
        raw_data[i - offset, :] = raw_val.flatten()

      label_index = self.word_to_index[sample['label']]
      labels[i - offset, label_index] = 1

    if self.output_representation != 'mfcc_and_raw':
      return data, labels
    else:
      return [data, raw_data], labels

  def get_unprocessed_data(self, how_many, model_settings, mode):
    """Gets sample data without transformations."""
    candidates = self.data_index[mode]
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = how_many
    desired_samples = model_settings['desired_samples']
    words_list = self.words_list
    data = np.zeros((sample_count, desired_samples))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
      wav_filename_placeholder = tf.placeholder(tf.string, [], name='filename')
      wav_loader = tf.io.read_file(wav_filename_placeholder)
      wav_decoder = tf.audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      foreground_volume_placeholder = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      foreground_volume_placeholder)
      for i in range(sample_count):
        if how_many == -1:
          sample_index = i
        else:
          sample_index = np.random.randint(len(candidates))
        sample = candidates[sample_index]
        input_dict = {wav_filename_placeholder: sample['file']}
        if sample['label'] == SILENCE_LABEL:
          input_dict[foreground_volume_placeholder] = 0
        else:
          input_dict[foreground_volume_placeholder] = 1
        data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
        label_index = self.word_to_index[sample['label']]
        labels.append(words_list[label_index])
    return data, labels

  def summary(self):
    """Prints a summary of classes and label distributions."""
    set_counts = {}
    print('There are %d classes.' % (len(self.word_to_index)))
    print("1%% <-> %d samples in 'training'" % int(
        self.set_size('training') / 100))
    for set_index in ['training', 'validation', 'testing']:
      counts = {k: 0 for k in sorted(self.word_to_index.keys())}
      num_total = self.set_size(set_index)
      for data_point in self.data_index[set_index]:
        counts[data_point['label']] += (1.0 / num_total) * 100.0
      set_counts[set_index] = counts

    print('%-13s%-6s%-6s%-6s' % ('', 'Train', 'Val', 'Test'))
    for label_name in sorted(
        self.word_to_index.keys(), key=self.word_to_index.get):
      line = '%02d %-12s: ' % (self.word_to_index[label_name], label_name)
      for set_index in ['training', 'validation', 'testing']:
        line += '%.1f%% ' % (set_counts[set_index][label_name])
      print(line)


#Classes

In [0]:
from collections import OrderedDict

def get_classes(wanted_only=False):
  if wanted_only:
    classes = 'stop down off right up go on yes left no'
    classes = classes.split(' ')
    assert len(classes) == 10
  else:
    classes = ('sheila nine stop bed four six down bird marvin cat off right '
               'seven eight up three happy go zero on wow dog yes five one tree'
               ' house two left no')  # noqa
    classes = classes.split(' ')
    assert len(classes) == 30
  return classes


def get_int2label(wanted_only=False, extend_reversed=False):
  classes = get_classes(
      wanted_only=wanted_only, extend_reversed=extend_reversed)
  classes = prepare_words_list(classes)
  int2label = {i: l for i, l in enumerate(classes)}
  int2label = OrderedDict(sorted(int2label.items(), key=lambda x: x[0]))
  return int2label


def get_label2int(wanted_only=False, extend_reversed=False):
  classes = get_classes(
      wanted_only=wanted_only, extend_reversed=extend_reversed)
  classes = prepare_words_list(classes)
  label2int = {l: i for i, l in enumerate(classes)}
  label2int = OrderedDict(sorted(label2int.items(), key=lambda x: x[1]))
  return label2int


#Model

In [9]:
import keras
from keras.layers import *
from keras.regularizers import l2
from keras.models import Model


def preprocess(x):
  x = (x + 0.8) / 7.0
  x = K.clip(x, -5, 5)
  return x


def preprocess_raw(x):
  return x


Preprocess = Lambda(preprocess)

PreprocessRaw = Lambda(preprocess_raw)


def relu6(x):
  return K.relu(x, max_value=6)


def conv_1d_time_stacked_model(input_size=16000, num_classes=11):
  """ Creates a 1D model for temporal data.

  Note: Use only
  with compute_mfcc = False (e.g. raw waveform data).
  Args:
    input_size: How big the input vector is.
    num_classes: How many classes are to be recognized.

  Returns:
    Compiled keras model
  """
  input_layer = Input(shape=[input_size])
  x = input_layer
  x = Reshape([800, 20])(x)
  x = PreprocessRaw(x)

  def _reduce_conv(x, num_filters, k, strides=2, padding='valid'):
    x = Conv1D(
        num_filters,
        k,
        padding=padding,
        use_bias=False,
        kernel_regularizer=l2(0.00001))(
            x)
    x = BatchNormalization()(x)
    x = Activation(relu6)(x)
    x = MaxPool1D(pool_size=3, strides=strides, padding=padding)(x)
    return x

  def _context_conv(x, num_filters, k, dilation_rate=1, padding='valid'):
    x = Conv1D(
        num_filters,
        k,
        padding=padding,
        dilation_rate=dilation_rate,
        kernel_regularizer=l2(0.00001),
        use_bias=False)(
            x)
    x = BatchNormalization()(x)
    x = Activation(relu6)(x)
    return x

  x = _context_conv(x, 32, 1)
  x = _reduce_conv(x, 48, 3)
  x = _context_conv(x, 48, 3)
  x = _reduce_conv(x, 96, 3)
  x = _context_conv(x, 96, 3)
  x = _reduce_conv(x, 128, 3)
  x = _context_conv(x, 128, 3)
  x = _reduce_conv(x, 160, 3)
  x = _context_conv(x, 160, 3)
  x = _reduce_conv(x, 192, 3)
  x = _context_conv(x, 192, 3)
  x = _reduce_conv(x, 256, 3)
  x = _context_conv(x, 256, 3)

  x = Dropout(0.3)(x)
  x = Conv1D(num_classes, 5, activation='softmax')(x)
  x = Reshape([-1])(x)

  model = Model(input_layer, x, name='conv_1d_time_stacked')
  model.compile(
      optimizer=keras.optimizers.Adam(lr=3e-4),
      loss=keras.losses.categorical_crossentropy,
      metrics=[keras.metrics.categorical_accuracy])
  return model


def speech_model(model_type, input_size, num_classes=11, *args, **kwargs):
  if model_type == 'conv_1d_time_stacked':
    return conv_1d_time_stacked_model(input_size, num_classes)
  else:
    raise ValueError('Invalid model: %s' % model_type)


def prepare_model_settings(label_count,
                           sample_rate,
                           clip_duration_ms,
                           window_size_ms,
                           window_stride_ms,
                           dct_coefficient_count,
                           num_log_mel_features,
                           output_representation='raw'):
  """Calculates common settings needed for all models."""
  desired_samples = int(sample_rate * clip_duration_ms / 1000)
  window_size_samples = int(sample_rate * window_size_ms / 1000)
  window_stride_samples = int(sample_rate * window_stride_ms / 1000)
  length_minus_window = (desired_samples - window_size_samples)
  spectrogram_frequencies = 257
  if length_minus_window < 0:
    spectrogram_length = 0
  else:
    spectrogram_length = 1 + int(length_minus_window / window_stride_samples)

  if output_representation == 'mfcc':
    fingerprint_size = num_log_mel_features * spectrogram_length
  elif output_representation == 'raw':
    fingerprint_size = desired_samples
  elif output_representation == 'spec':
    fingerprint_size = spectrogram_frequencies * spectrogram_length
  elif output_representation == 'mfcc_and_raw':
    fingerprint_size = num_log_mel_features * spectrogram_length
  return {
      'desired_samples': desired_samples,
      'window_size_samples': window_size_samples,
      'window_stride_samples': window_stride_samples,
      'spectrogram_length': spectrogram_length,
      'spectrogram_frequencies': spectrogram_frequencies,
      'dct_coefficient_count': dct_coefficient_count,
      'fingerprint_size': fingerprint_size,
      'label_count': label_count,
      'sample_rate': sample_rate,
      'num_log_mel_features': num_log_mel_features
  }


AttributeError: ignored

# Train

In [14]:
import argparse
import os

import tensorflow.compat.v1 as tf
from keras import backend as K
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.callbacks import TensorBoard

parser = argparse.ArgumentParser(description='set input arguments')

parser.add_argument(
    '-sample_rate',
    action='store',
    dest='sample_rate',
    type=int,
    default=16000,
    help='Sample rate of audio')
parser.add_argument(
    '-batch_size',
    action='store',
    dest='batch_size',
    type=int,
    default=32,
    help='Size of the training batch')
parser.add_argument(
    '-output_representation',
    action='store',
    dest='output_representation',
    type=str,
    default='raw',
    help='raw, spec, mfcc or mfcc_and_raw')
parser.add_argument(
    '-data_dirs',
    '--list',
    dest='data_dirs',
    nargs='+',
    required=True,
    help='<Required> The list of data directories. e.g., data/train')

args = parser.parse_args()
parser.print_help()
print('input args: ', args)

if __name__ == '__main__':
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
  sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
  K.set_session(sess)
  data_dirs = args.data_dirs
  output_representation = args.output_representation
  sample_rate = args.sample_rate
  batch_size = args.batch_size
  classes = get_classes(wanted_only=True)
  model_settings = prepare_model_settings(
      label_count=len(prepare_words_list(classes)),
      sample_rate=sample_rate,
      clip_duration_ms=1000,
      window_size_ms=30.0,
      window_stride_ms=10.0,
      dct_coefficient_count=80,
      num_log_mel_features=60,
      output_representation=output_representation)

  print(model_settings)

  ap = AudioProcessor(
      data_dirs=data_dirs,
      wanted_words=classes,
      silence_percentage=13.0,
      unknown_percentage=60.0,
      validation_percentage=10.0,
      testing_percentage=0.0,
      model_settings=model_settings,
      output_representation=output_representation)
  train_gen = data_gen(ap, sess, batch_size=batch_size, mode='training')
  val_gen = data_gen(ap, sess, batch_size=batch_size, mode='validation')

  model = speech_model(
      'conv_1d_time_stacked',
      model_settings['fingerprint_size']
      if output_representation != 'raw' else model_settings['desired_samples'],
      # noqa
      num_classes=model_settings['label_count'],
      **model_settings)

  # embed()
  checkpoints_path = os.path.join('checkpoints', 'conv_1d_time_stacked_model')
  if not os.path.exists(checkpoints_path):
    os.makedirs(checkpoints_path)

  callbacks = [
      ConfusionMatrixCallback(
          val_gen,
          ap.set_size('validation') // batch_size,
          wanted_words=prepare_words_list(get_classes(wanted_only=True)),
          all_words=prepare_words_list(classes),
          label2int=ap.word_to_index),
      ReduceLROnPlateau(
          monitor='val_categorical_accuracy',
          mode='max',
          factor=0.5,
          patience=4,
          verbose=1,
          min_lr=1e-5),
      TensorBoard(log_dir='logs'),
      ModelCheckpoint(
          os.path.join(checkpoints_path,
                       'ep-{epoch:03d}-vl-{val_loss:.4f}.hdf5'),
          save_best_only=True,
          monitor='val_categorical_accuracy',
          mode='max')
  ]
  model.fit_generator(
      train_gen,
      steps_per_epoch=ap.set_size('training') // batch_size,
      epochs=100,
      verbose=1,
      callbacks=callbacks)

  eval_res = model.evaluate_generator(val_gen,
                                      ap.set_size('validation') // batch_size)
  print(eval_res)


usage: ipykernel_launcher.py [-h] [-sample_rate SAMPLE_RATE]
                             [-batch_size BATCH_SIZE]
                             [-output_representation OUTPUT_REPRESENTATION]
                             -data_dirs DATA_DIRS [DATA_DIRS ...]
ipykernel_launcher.py: error: the following arguments are required: -data_dirs/--list


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
