<a href="https://colab.research.google.com/github/andreasfloros/ARM-ML-Embedded/blob/main/autokeras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare and preprocess data
Referenced from audio_classifier pipeline

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [1]:
DATASET_ROOT_DIR = '/gdrive/MyDrive/audio_data/'

In [3]:
import os
import requests
import tarfile


os.mkdir(DATASET_ROOT_DIR)
url = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
target_path = 'audio_data/dataset.tar.gz'

response = requests.get(url, stream=True)
if response.status_code == 200:
    with open(target_path, 'wb') as f:
        f.write(response.raw.read())

tar = tarfile.open(target_path, "r:gz")
tar.extractall(path='audio_data/')
tar.close()

FileExistsError: ignored

In [4]:
import os
import librosa
import math
import json
import numpy as np

In [None]:
def make_track_correct_size(signal, expected_num_samples_per_track):

    # print('Original track length: {}'.format(len(signal)))
    # if track is shorter than expected, append it with zeros
    if len(signal) < expected_num_samples_per_track:
      num_zeros_to_pad = expected_num_samples_per_track - len(signal)
      zeros = num_zeros_to_pad * [0.]
      extended_signal = np.append(signal, zeros)
      return extended_signal

    # if track is longer than expected, truncate it
    elif len(signal) > expected_num_samples_per_track:
      return signal[:expected_num_samples_per_track]

    # else return the original track 
    else:
      return signal

In [None]:
def audio_track_to_features(signal, processing_method, sample_rate, window_size, window_stride, num_mfcc):
  
  if processing_method == 'fft':
    # perform Fast Fourier Transform (FFT)
    fft = np.fft.fft(signal)

    # calculate abs values on complex numbers to get magnitude
    spectrum = np.abs(fft)

    # the spectrum is symmetrical with respect to sample_rate / 2
    # so take half of the spectrum and frequency arrays
    # therefore len(half_spectrum) = sample_rate / 2
    half_spectrum = spectrum[:int(len(spectrum)/2)]

    # average every 16 samples to reduce size of array to 1 / 16 of its original size
    # e.g. sample_rate = 16k, duration = 1.024s, reduce size from 8192 to 512 
    averaged = np.mean(half_spectrum.reshape(-1, 16), axis=1)
    return averaged


  elif processing_method == 'stft':
    # perform Short Time Fourier Transform (STFT)
    stft = librosa.stft(signal = signal, 
                        n_fft = window_size, 
                        hop_length = window_stride)

    # calculate abs values on complex numbers to get magnitude
    spectrogram = np.abs(stft)

    # transpose and return the spectrogram matrix
    transposed_spectrogram = spectrogram.transpose()
    return transposed_spectrogram.flatten()


  else: # mfcc
    # perform Mel-Frequency Cepstral Coefficients (MFCC)
    mfcc = librosa.feature.mfcc(signal, 
                                sr = sample_rate, 
                                n_fft = window_size, 
                                n_mfcc = num_mfcc,
                                hop_length = window_stride)
    # transpose and return the mfcc matrix
    transposed_mfcc = mfcc.T
    return transposed_mfcc.flatten()

In [None]:
def preprocess_entire_dataset(dataset_path, json_path, processing_method, sample_rate, expected_duration, window_size, window_stride, num_mfcc):
  # expected duration is in seconds
  expected_num_samples_per_track = int(expected_duration * sample_rate)
  
  # dictionary to later be converted to final json file
  data = {
      'mapping' : [],
      'features' : [],
      'labels' : []
  }

  # we will iterate this for each of the visited sub-directorie in order to
  # give a different label for each of them
  visited_directory_index = 0

  # iterate through all subfolders
  for dirpath, dirnames, filenames in os.walk(dataset_path):

    # # ensure we are not at the dataset root directory
    # # (os.walk provides this directory as well)
    if dirpath is not DATASET_ROOT_DIR:
    # if dirpath == 'audio_data/yes' or dirpath == 'audio_data/no':

      # obtain word labels
      dirpath_components = dirpath.split('/') # audio_data/left => ['audio_data', 'left']
      word_label = dirpath_components[-1]
      data['mapping'].append(word_label)
      print('Processing {}'.format(word_label))

      # access and process files for current word
      for f in filenames:
        
        # load audio file
        file_path = os.path.join(dirpath, f)
        signal, sample_rate = librosa.load(file_path, sr=sample_rate)

        # extend or cut signal to be equal to the expected size
        signal_correct_size = make_track_correct_size(signal, expected_num_samples_per_track)

        # obtain the features of the audio track using the function defined above
        track_features = audio_track_to_features(signal = signal_correct_size, 
                                                 processing_method = processing_method,
                                                 sample_rate = sample_rate, 
                                                 window_size = window_size, 
                                                 window_stride = window_stride, 
                                                 num_mfcc = 13)
        
        # append the audio track features to the features field of the dictionary
        data['features'].append(track_features.tolist())

        # append the current directory index as the label of this track
        data['labels'].append(visited_directory_index)
        # print('file_path: {}'.format(file_path))

      # iterate the index before visiting the next directory
      visited_directory_index = visited_directory_index + 1

  print(data['mapping'])
  print(set(data['labels']))
  # create the json file from the dictionary
  with open(json_path, 'w') as fp:
    json.dump(data, fp, indent=4)

In [5]:
rm -rf `find -type d -name .ipynb_checkpoints`

In [None]:
!rm /gdrive/MyDrive/audio_data/_background_noise_/README.md

rm: cannot remove '/gdrive/MyDrive/audio_data/_background_noise_/README.md': No such file or directory


In [6]:
JSON_PATH = DATASET_ROOT_DIR+'data.json'
PROCESSING_METHOD = 'mfcc'
SAMPLE_RATE = 16000
EXPECTED_DURATION = 1.024            # in seconds
WINDOW_SIZE_SAMPLES = 512            # in samples
WINDOW_STRIDE_SAMPLES = 320          # in samples
MFCC_COEFF_NUMBER = 13

In [None]:
preprocess_entire_dataset(dataset_path = DATASET_ROOT_DIR, 
                   json_path = JSON_PATH, 
                   processing_method = PROCESSING_METHOD,
                   sample_rate = SAMPLE_RATE, 
                   expected_duration = EXPECTED_DURATION, 
                   window_size = WINDOW_SIZE_SAMPLES, 
                   window_stride = WINDOW_STRIDE_SAMPLES, 
                   num_mfcc = MFCC_COEFF_NUMBER)

Processing dog


KeyboardInterrupt: ignored

Import libraries for training

In [7]:
!pip install autokeras

Collecting autokeras
[?25l  Downloading https://files.pythonhosted.org/packages/09/12/cf698586ccc8245f08d1843dcafb65b064a2e9e2923b889dc58e1019f099/autokeras-1.0.12-py3-none-any.whl (164kB)
[K     |████████████████████████████████| 174kB 26.4MB/s 
[?25hCollecting keras-tuner>=1.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/20/ec/1ef246787174b1e2bb591c95f29d3c1310070cad877824f907faba3dade9/keras-tuner-1.0.2.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 6.8MB/s 
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Building wheels for collected packages: keras-tuner, terminaltables
  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Created whee

In [8]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Reshape, Activation, BatchNormalization, Conv2D, MaxPooling2D, Flatten
import matplotlib.pyplot as plt
import autokeras as ak
import kerastuner as kt

Load the data and prepare datasets

In [7]:
def load_data(data_path):
    """Loads training dataset from json file.
        :param data_path (str): Path to json file containing data
        :return X (ndarray): Inputs
        :return y (ndarray): Targets
    """

    print('Loading dataset')

    with open(data_path, "r") as fp:
        data = json.load(fp)

    X = np.array(data['features'])
    y = np.array(data['labels'])
    return X, y


def prepare_datasets(test_size, validation_size):
    """Loads data and splits it into train, validation and test sets.
    :param test_size (float): Value in [0, 1] indicating percentage of data set to allocate to test split
    :param validation_size (float): Value in [0, 1] indicating percentage of train set to allocate to validation split
    :return X_train (ndarray): Input training set
    :return X_validation (ndarray): Input validation set
    :return X_test (ndarray): Input test set
    :return y_train (ndarray): Target training set
    :return y_validation (ndarray): Target validation set
    :return y_test (ndarray): Target test set
    """

    print('Splitting dataset into training, validation, and test splits')

    # load data
    X, y = load_data(JSON_PATH)

    # if flatten == False:
    #   Reshape(reshape_shape, input_shape=input_shape)

    # create train, validation and test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    # X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size)

    # add an axis to input sets for conv networks to fit conv2D shape specs
    # currently not needed since we reshape everything anyway
    # X_train = X_train[..., np.newaxis]
    # X_validation = X_validation[..., np.newaxis]
    # X_test = X_test[..., np.newaxis]

    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = prepare_datasets(0.25, 0.2)
print('Finished preparing training, validation, and test data')
print('X_train.shape: {}'.format(X_train.shape))

Splitting dataset into training, validation, and test splits
Loading dataset
Finished preparing training, validation, and test data
X_train.shape: (79376, 676)


In [1]:
X_train.type

NameError: ignored

In [9]:
NUM_OF_SAMPLES = SAMPLE_RATE * EXPECTED_DURATION
NUM_OF_WINDOW_POSITIONS = math.ceil(NUM_OF_SAMPLES / WINDOW_STRIDE_SAMPLES)
NUM_OF_STFT_FREQUENCIES = int(1 + WINDOW_SIZE_SAMPLES / 2)

FFT_ARRAY_SIZE = int((NUM_OF_SAMPLES) / (2 * 16))  # 2 comes from half spectrum, 16 from averaging every 16 samples

if PROCESSING_METHOD == 'fft':
  RESHAPE_SHAPE = (1, FFT_ARRAY_SIZE, 1)
elif PROCESSING_METHOD == 'stft':
  RESHAPE_SHAPE = (NUM_OF_WINDOW_POSITIONS, NUM_OF_STFT_FREQUENCIES, 1)
else: # mfcc
  RESHAPE_SHAPE = (NUM_OF_WINDOW_POSITIONS, MFCC_COEFF_NUMBER, 1)

print('RESHAPE_SHAPE: {}'.format(RESHAPE_SHAPE))
INPUT_SHAPE = X_train.shape[1:]
print('INPUT_SHAPE:{}'.format(INPUT_SHAPE))

RESHAPE_SHAPE: (52, 13, 1)
INPUT_SHAPE:(676,)


Build AutoKera Model

In [19]:
class SingleLayerReshapeBlock(ak.Block):
  def __init__(self, input_shape, reshape_shape):
    super().__init__()
    self.input_shape = input_shape
    self.reshape_shape = reshape_shape

  def build(self, hp, inputs=None):
      # Get the input_node from inputs.
    input_node = tf.nest.flatten(inputs)[0]
    layer = Reshape(self.reshape_shape, input_shape=self.input_shape)
    output_node = layer(input_node)
    return output_node

  # def __call__(self, *args, **kwargs):
  #   return self.build(*args, **kwargs)

def build_conv_model():
  hp = kt.HyperParameters()
  filters = hp.Choice("filters", values=[16,32,64])

  input_node = ak.Input()
  output_node = SingleLayerReshapeBlock(INPUT_SHAPE, RESHAPE_SHAPE)(input_node)
  output_node = ak.ConvBlock(kernel_size=3,filters=filters,num_layers=1, separable=True, max_pooling=True)(output_node)
  output_node = ak.ClassificationHead()(output_node)
  auto_model = ak.AutoModel(
      inputs=input_node, outputs=output_node, objective="accuracy", tuner="greedy", overwrite=True, max_trials=20, max_model_size=60000
  )
  auto_model.fit(X_train, y_train, epochs=10)
  return auto_model

In [22]:
auto_model = build_conv_model()

Trial 20 Complete [00h 01m 32s]
accuracy: 0.3372953534126282

Best accuracy So Far: 0.7040774822235107
Total elapsed time: 00h 29m 46s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./auto_model/best_model/assets


In [23]:
print(auto_model.evaluate(X_test, y_test))

[1.1622689962387085, 0.6576589941978455]


In [24]:
model = auto_model.export_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 676)]             0         
_________________________________________________________________
cast_to_float32 (CastToFloat (None, 676)               0         
_________________________________________________________________
reshape (Reshape)            (None, 52, 13, 1)         0         
_________________________________________________________________
separable_conv2d (SeparableC (None, 50, 11, 16)        41        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 25, 5, 16)         0         
_________________________________________________________________
separable_conv2d_1 (Separabl (None, 25, 5, 16)         416       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 2, 16)         0     

In [28]:
model_name = 'all_words_mfcc'
model.save(model_name+'.h5')


In [30]:
loaded_model = keras.models.load_model('all_words_mfcc.h5')

In [31]:
loaded_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 676)]             0         
_________________________________________________________________
cast_to_float32 (CastToFloat (None, 676)               0         
_________________________________________________________________
reshape (Reshape)            (None, 52, 13, 1)         0         
_________________________________________________________________
separable_conv2d (SeparableC (None, 50, 11, 16)        41        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 25, 5, 16)         0         
_________________________________________________________________
separable_conv2d_1 (Separabl (None, 25, 5, 16)         416       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 2, 16)         0     

In [32]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
# Save the model to disk
open(model_name + '.tflite', "wb").write(tflite_model)

import os
basic_model_size = os.path.getsize(model_name + '.tflite')
print("Model is %d bytes" % basic_model_size)

INFO:tensorflow:Assets written to: /tmp/tmpxswkq7l0/assets


INFO:tensorflow:Assets written to: /tmp/tmpxswkq7l0/assets


'NoneType' object has no attribute 'name'


'NoneType' object has no attribute 'name'


ConverterError: ignored

In [26]:
# Convert the model to the TensorFlow Lite format with quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model)


# converter.optimizations = [tf.lite.Optimize.DEFAULT]

# def representative_dataset_generator():
#   for value in X_test:
#     yield [np.array(value, dtype = np.float32, ndmin=4)]

# converter.representative_dataset = representative_dataset_generator

tflite_model = converter.convert()

# Save the model to disk
open(model_name + '.tflite', "wb").write(tflite_model)

import os
basic_model_size = os.path.getsize(model_name + '.tflite')
print("Model is %d bytes" % basic_model_size)

INFO:tensorflow:Assets written to: /tmp/tmp3pmrssvj/assets
'NoneType' object has no attribute 'name'


ConverterError: ignored

In [None]:
def build_dense_model():
  hp = kt.HyperParameters()
  num_units = hp.Choice("num_units", values=[16,32,64,128,256])
  input_node = ak.Input()
  output_node = ak.DenseBlock(num_units=num_units)(input_node)
  output_node = ak.ClassificationHead(num_classes=36, loss = "categorical_crossentropy")(output_node)
  auto_model = ak.AutoModel(
      inputs=input_node, outputs=output_node, objective="accuracy", tuner="greedy", overwrite=True, max_trials=10
  )
  auto_model.fit(X_train, y_train, epochs=10)
  return auto_model

Trial 10 Complete [00h 01m 28s]
accuracy: 0.7838003635406494

Best accuracy So Far: 0.7838003635406494
Total elapsed time: 00h 11m 28s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./auto_model/best_model/assets


In [18]:
auto_model = build_dense_model()

NameError: ignored

In [None]:
print(auto_model.evaluate(X_test, y_test))

[0.800967276096344, 0.7621225118637085]


In [None]:
model = auto_model.export_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 676)]             0         
_________________________________________________________________
cast_to_float32 (CastToFloat (None, 676)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                21664     
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
re_lu (ReLU)                 (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               16896     
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)               2048  