In [1]:
import os
import pathlib
import numpy as np
import tensorflow as tf

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [2]:
def create_set_list(main_dir, filepath):
    # opening the file in read mode
    file = open(filepath, "r")
  
    # reading the file
    data = file.read()
    initial_list =  data.split("\n")

    data_list = []

    for items in initial_list:
        data_list.append(main_dir + items)
        # data_list.append(items)
    return tf.convert_to_tensor(data_list)

In [3]:
main_dir = '/home/audio_ml.work/data/audio/speech/speech_commands/'
data_dir = pathlib.Path(main_dir)
# train_path = '/ifxhome/mazumderarna/XAI/training_list.txt'
# val_path = '/ifxhome/mazumderarna/XAI/validation_list.txt'
# test_path = '/ifxhome/mazumderarna/XAI/testing_list.txt'
train_path = '/home/audio_ml.work/data/audio/speech/speech_commands/training_list.txt'
val_path = '/home/audio_ml.work/data/audio/speech/speech_commands/validation_list.txt'
test_path = '/home/audio_ml.work/data/audio/speech/speech_commands/testing_list.txt'

In [4]:
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[commands != 'README.md']
commands = commands[commands != 'LICENSE']
commands = commands[commands != 'training_list.txt']
commands = commands[commands != 'validation_list.txt']
commands = commands[commands != 'testing_list.txt']
commands = commands[commands != '_background_noise_']
print('Commands:', commands)

Commands: ['on' 'marvin' 'up' 'two' 'right' 'nine' 'four' 'yes' 'five' 'six' 'zero'
 'dog' 'down' 'house' 'learn' 'seven' 'forward' 'eight' 'follow' 'visual'
 'one' 'off' 'sheila' 'happy' 'stop' 'tree' 'left' 'bird' 'three'
 'backward' 'no' 'wow' 'cat' 'go' 'bed']


In [5]:
train_filenames = create_set_list(main_dir, train_path)
val_filenames = create_set_list(main_dir, val_path)
test_filenames = create_set_list(main_dir, test_path)

In [6]:
train_filenames = train_filenames[:-1]
val_filenames = val_filenames[:-1]
test_filenames = test_filenames[:-1]

In [7]:
print(train_filenames)

tf.Tensor(
[b'/home/audio_ml.work/data/audio/speech/speech_commands/backward/0165e0e8_nohash_0.wav'
 b'/home/audio_ml.work/data/audio/speech/speech_commands/backward/017c4098_nohash_0.wav'
 b'/home/audio_ml.work/data/audio/speech/speech_commands/backward/017c4098_nohash_1.wav'
 ...
 b'/home/audio_ml.work/data/audio/speech/speech_commands/zero/ffd2ba2f_nohash_3.wav'
 b'/home/audio_ml.work/data/audio/speech/speech_commands/zero/ffd2ba2f_nohash_4.wav'
 b'/home/audio_ml.work/data/audio/speech/speech_commands/zero/fffcabd1_nohash_0.wav'], shape=(84842,), dtype=string)


In [8]:
def decode_audio(audio_binary):
    # Decode WAV-encoded audio files to `float32` tensors, normalized
    # to the [-1.0, 1.0] range. Return `float32` audio and a sample rate.
    audio, _ = tf.audio.decode_wav(contents=audio_binary)
    # Since all the data is single channel (mono), drop the `channels`
    # axis from the array.
    return tf.squeeze(audio, axis=-1)

In [9]:
def get_label(file_path):
    parts = tf.strings.split(
        input=file_path,
        sep=os.path.sep)
    # Note: You'll use indexing here instead of tuple unpacking to enable this
    # to work in a TensorFlow graph.
    return parts[-2]

In [10]:
def get_waveform_and_label(file_path):
    label = get_label(file_path)
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    return waveform, label

In [11]:
def get_spectrogram(waveform):
    # Zero-padding for an audio waveform with less than 16,000 samples.
    input_len = 16000
    waveform = waveform[:input_len]
    zero_padding = tf.zeros(
        [16000] - tf.shape(waveform),
        dtype=tf.float32)
    # Cast the waveform tensors' dtype to float32.
    waveform = tf.cast(waveform, dtype=tf.float32)
    # Concatenate the waveform with `zero_padding`, which ensures all audio
    # clips are of the same length.
    equal_length = tf.concat([waveform, zero_padding], 0)
    # Convert the waveform to a spectrogram via a STFT.
    spectrogram = tf.signal.stft(
        equal_length, frame_length=255, frame_step=128)
    # Obtain the magnitude of the STFT.
    spectrogram = tf.abs(spectrogram)
    # Add a `channels` dimension, so that the spectrogram can be used
    # as image-like input data with convolution layers (which expect
    # shape (`batch_size`, `height`, `width`, `channels`).
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram

In [12]:
def get_spectrogram_and_label_id(audio, label):
    spectrogram = get_spectrogram(audio)
    label_id = tf.math.argmax(label == commands)
    return spectrogram, label_id

In [13]:
def preprocess_dataset(files):
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    output_ds = files_ds.map(
        map_func=get_waveform_and_label)
    output_ds = output_ds.map(
        map_func=get_spectrogram_and_label_id)
    return output_ds

train_ds = preprocess_dataset(train_filenames)
val_ds = preprocess_dataset(val_filenames)
test_ds = preprocess_dataset(test_filenames)

In [14]:
def min_max_scaling(x):
    x_min = x.min(axis=(1, 2), keepdims=True)
    x_max = x.max(axis=(1, 2), keepdims=True)
    x = (x - x_min)/(x_max-x_min)
    return x

In [15]:
def gen_numpy_arrays(data):
    
    x = []
    y = []

    for audio, label in data:
        x.append(audio.numpy())
        y.append(label.numpy())

    x = np.array(x)
    x = min_max_scaling(x)
    null = tf.zeros([x.shape[0],x.shape[1],x.shape[2],x.shape[3]], tf.float32)
    x = tf.concat([x,null,null], 3)
    # null = np.zeros((x.shape[0],x.shape[1],x.shape[2],x.shape[3]))
    # x = np.concatenate([x,null,null], axis=3).astype('float32')
    print('Shape of the dataset array: '+repr(x.shape))
    y = np.array(y)
    print('Shape of the label set array: '+repr(y.shape))

    return x, y

In [16]:
x_train, y_train = gen_numpy_arrays(train_ds)
np.save('/home/audio_ml.work/data/audio/speech/speech_commands_arrays/new_split/x_train_full.npy', x_train)
np.save('/home/audio_ml.work/data/audio/speech/speech_commands_arrays/new_split/y_train_full.npy', y_train)

Shape of the dataset array: TensorShape([84842, 124, 129, 3])
Shape of the label set array: (84842,)


In [17]:
x_test, y_test = gen_numpy_arrays(test_ds)
np.save('/home/audio_ml.work/data/audio/speech/speech_commands_arrays/new_split/x_test_full.npy', x_test)
np.save('/home/audio_ml.work/data/audio/speech/speech_commands_arrays/new_split/y_test_full.npy', y_test)

Shape of the dataset array: TensorShape([11005, 124, 129, 3])
Shape of the label set array: (11005,)


In [18]:
x_val, y_val = gen_numpy_arrays(val_ds)
np.save('/home/audio_ml.work/data/audio/speech/speech_commands_arrays/new_split/x_val_full.npy', x_val)
np.save('/home/audio_ml.work/data/audio/speech/speech_commands_arrays/new_split/y_val_full.npy', y_val)

Shape of the dataset array: TensorShape([9981, 124, 129, 3])
Shape of the label set array: (9981,)
