In [1]:
import numpy as np
import tensorflow as tf
#import tensorflow.keras as keras
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET
from pydub import AudioSegment

In [2]:
from tensorflow.python.client import device_lib
def get_available_devices():
    """Return the names of every local device reported by TensorFlow."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names
print(get_available_devices())

['/device:CPU:0', '/device:XLA_CPU:0', '/device:XLA_GPU:0', '/device:GPU:0']


In [3]:
print(tf.__version__)
# Let TensorFlow allocate GPU memory on demand instead of grabbing it all.
# Looping over the list keeps the setting identical on every GPU and makes
# the cell a no-op (instead of an IndexError) on a CPU-only machine.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

2.3.2


In [6]:
# Cut every annotated event out of the recordings and collect the resulting
# audio segments together with their integer class ids.
audio_chunks = []
chunks_classes = []
dataset_path = "/home/andrea/Scrivania/MIVIA_ROAD_DB1/audio/"
folders = ["A/", "B/", "C/", "D/"]
for folder in folders:
    tot_path = os.path.join(dataset_path, folder)
    audio_dir = os.path.join(tot_path, "v2")
    # Every entry of the folder except the "v2" audio sub-directory is treated
    # as an XML descriptor. Filtering by name (rather than deleting by index)
    # cannot drop a real descriptor when "v2" is missing, and using a dedicated
    # name avoids clobbering the global `files` list used later on.
    descriptor_files = sorted(f for f in os.listdir(tot_path) if f != "v2")
    audiofiles = sorted(os.listdir(audio_dir))
    # Descriptors and recordings are paired by sorted position, so a count
    # mismatch would silently attach annotations to the wrong recording.
    if len(descriptor_files) != len(audiofiles):
        raise ValueError(
            "Found %d descriptors but %d audio files in %s"
            % (len(descriptor_files), len(audiofiles), tot_path)
        )
    for descriptor_name, audio_name in zip(descriptor_files, audiofiles):
        descriptor = ET.parse(os.path.join(tot_path, descriptor_name))
        # First child of the XML root holds one element per annotated event.
        # (Element.getchildren() was removed in Python 3.9; index instead.)
        events = descriptor.getroot()[0]
        newAudio = AudioSegment.from_wav(os.path.join(audio_dir, audio_name))
        for event in events:
            cls = event[1].text
            # Presumably start/end are expressed in seconds; they are scaled
            # by 1000 below because AudioSegment slices in milliseconds.
            start = round(float(event[3].text), 3)
            end = round(float(event[4].text), 3)
            audio_chunks.append(newAudio[start*1000:end*1000])
            chunks_classes.append(int(cls))

In [2]:
# Write every chunk to "<index>.wav"; the index is also the position of the
# chunk's class id inside the saved label array.
split_dir = "/home/andrea/Scrivania/MIVIA_ROAD_DB1/splitted_dataset/"
os.makedirs(split_dir, exist_ok=True)
for idx, chunk in enumerate(audio_chunks):
    # export() hands back the open output file; close it explicitly.
    chunk.export(split_dir+str(idx)+".wav", format="wav").close()
np_classes = np.asarray(chunks_classes)
# The labels are stored next to (not inside) the chunk directory: this is the
# path the loading cell reads from, and it keeps the chunk directory free of
# non-audio files.
np.save("/home/andrea/Scrivania/MIVIA_ROAD_DB1/classes.npy", np_classes)

NameError: name 'audio_chunks' is not defined

In [4]:
# Per-chunk class ids, indexed by chunk number.
np_classes = np.load("/home/andrea/Scrivania/MIVIA_ROAD_DB1/classes.npy")
# Sorted distinct class ids; a label's position in this array is its label id.
classes = np.unique(np_classes)
# Tensor copy of the labels so they can be indexed inside tf.data pipelines.
classes_tensor = tf.convert_to_tensor(np_classes)

In [5]:
# Absolute paths of the exported chunks. Only ".wav" entries are kept so that
# stray files in the directory (e.g. a label array) are never fed to the WAV
# decoder, and the listing is sorted to make the order deterministic.
split_dir = "/home/andrea/Scrivania/MIVIA_ROAD_DB1/splitted_dataset/"
files = [
    split_dir + name
    for name in sorted(os.listdir(split_dir))
    if name.endswith(".wav")
]
files_ds = tf.data.Dataset.from_tensor_slices(files)

In [6]:
def decode_audio(audio_binary):
  """Decode the raw bytes of a WAV file into a 1-D waveform tensor.

  Args:
    audio_binary: scalar string tensor holding the contents of a WAV file.

  Returns:
    A rank-1 tensor with exactly 16000 samples.
  """
  # Requesting a single channel guarantees the trailing axis has size 1, so
  # the squeeze below also works for multi-channel recordings.
  audio, _ = tf.audio.decode_wav(
      audio_binary, desired_channels=1, desired_samples=16000)
  return tf.squeeze(audio, axis=-1)

In [7]:
def get_label(file_path):
    """Look up the class id of a chunk from its file path.

    The file name (without extension) is parsed as an integer and used as an
    index into the module-level `classes_tensor`.
    """
    filename = tf.strings.split(file_path, os.path.sep)[-1]
    stem = tf.strings.split(filename, ".")[0]
    chunk_index = tf.strings.to_number(stem, tf.int32)
    return tf.convert_to_tensor(classes_tensor[chunk_index])

In [8]:
def get_waveform_and_label(file_path):
  """Return `(waveform, label)` for the WAV file at `file_path`."""
  waveform = decode_audio(tf.io.read_file(file_path))
  return waveform, get_label(file_path)

def get_waveform_and_label_id(file_path):
  """Return `(waveform, label_id)` for the WAV file at `file_path`.

  The waveform gets a trailing axis of size 1, and the label is converted to
  its position inside the module-level `classes` array.
  """
  waveform, label = get_waveform_and_label(file_path)
  return tf.expand_dims(waveform, -1), tf.argmax(label == classes)

In [8]:
# Decode every chunk lazily; AUTOTUNE (-1) lets tf.data pick the parallelism.
waveform_ds = files_ds.map(
    get_waveform_and_label,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)

In [13]:
def get_spectrogram(waveform, target_samples=128000):
  """Compute the magnitude spectrogram of a waveform of fixed length.

  The waveform is cut or zero-padded to exactly `target_samples` samples so
  that every clip produces a spectrogram of the same size. With the default
  of 128000 samples, a frame length of 255 and a step of 128, the result has
  999 frames of 129 frequency bins each.

  Args:
    waveform: rank-1 tensor of audio samples.
    target_samples: number of samples every clip is brought to.

  Returns:
    A float32 tensor holding the absolute value of the STFT.
  """
  # Drop any samples beyond the target length; without this, a longer clip
  # would request a zero tensor with a negative size below.
  waveform = tf.cast(waveform, tf.float32)[:target_samples]

  # Zero padding for clips shorter than the target length.
  zero_padding = tf.zeros(
      [target_samples] - tf.shape(waveform), dtype=tf.float32)

  # Concatenate audio with padding so that all clips have the same length.
  equal_length = tf.concat([waveform, zero_padding], 0)
  spectrogram = tf.signal.stft(
      equal_length, frame_length=255, frame_step=128)

  # Keep only the magnitude of the complex STFT.
  return tf.abs(spectrogram)

In [11]:
def get_spectrogram_and_label_id(audio, label):
  """Turn `(waveform, label)` into `(spectrogram, label_id)`.

  The spectrogram gets a trailing axis of size 1, and the label is mapped to
  its position inside the module-level `classes` array.
  """
  spectrogram = tf.expand_dims(get_spectrogram(audio), -1)
  return spectrogram, tf.argmax(label == classes)

In [21]:
# Spectrogram view of the whole (unsplit) waveform dataset.
spectrogram_ds = waveform_ds.map(
    get_spectrogram_and_label_id,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)


In [9]:

def prepare_datasets(train_perc=0.8, batch_size=16, seed=None):
    """Split the chunk files at random into training and validation datasets.

    Args:
        train_perc: fraction of the files assigned to the training set.
        batch_size: number of examples per batch in both datasets.
        seed: optional seed for the shuffle. Pass a fixed value to obtain the
            same split on every call; with `None` each call yields a new one.

    Returns:
        A `(train_ds, val_ds)` tuple of batched, cached and prefetched
        datasets whose elements are `(waveform, label_id)` pairs.
    """
    import random

    # Shuffle a copy so the module-level `files` list is left untouched.
    shuffled_files = list(files)
    random.Random(seed).shuffle(shuffled_files)
    boundary = int(len(shuffled_files) * train_perc)

    def build(file_subset):
        # Labels are derived from each file name inside the map, so no
        # separate label array has to be kept aligned with the shuffle.
        subset_ds = tf.data.Dataset.from_tensor_slices(file_subset)
        subset_ds = subset_ds.map(get_waveform_and_label_id, num_parallel_calls=-1)
        return subset_ds.batch(batch_size).cache().prefetch(-1)

    return build(shuffled_files[:boundary]), build(shuffled_files[boundary:])

In [10]:
# Build the batched train/validation waveform datasets with the default split
# (train_perc=0.8).
train_ds, val_ds = prepare_datasets()

In [15]:
# Batch, cache and prefetch the full spectrogram dataset.
# NOTE(review): this rebinds `train_ds`; confirm it is not meant to coexist
# with the train/validation split returned by prepare_datasets().
batch_size = 32
train_ds = spectrogram_ds.batch(batch_size).cache().prefetch(
    tf.data.experimental.AUTOTUNE)

NameError: name 'spectrogram_ds' is not defined

In [11]:
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
# Number of distinct classes found in the label array.
num_labels = len(classes)
# Fit the normalization statistics on the training inputs only (labels are
# dropped by the lambda).
norm_layer = preprocessing.Normalization()
norm_layer.adapt(train_ds.map(lambda features, _label: features))
# Peek at a single batch to report the shape of the model inputs.
for spectrogram, _ in train_ds.take(1):
  input_shape = spectrogram.shape
print('Input shape:', input_shape)

Input shape: (16, 16000, 1)


In [27]:
# Discard the current model, if any, before building a new one. The guard
# keeps the cell runnable on a fresh kernel and when it is executed twice.
try:
    del model
except NameError:
    pass



In [14]:
# Small CNN over spectrograms of shape (999, 129, 1).
# NOTE(review): `norm_layer` must have been adapted on spectrogram inputs for
# the normalization to be meaningful here - confirm which dataset it was
# adapted on before training this model.
model = models.Sequential([
    layers.Input(shape=(999,129,1)),
    # Downsample to keep the convolutional stack small.
    preprocessing.Resizing(32, 32),
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # Softmax output: the compile cell uses SparseCategoricalCrossentropy with
    # its default from_logits=False, which expects probabilities, and the
    # other models in this notebook end with softmax as well.
    layers.Dense(num_labels, activation='softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resizing (Resizing)          (None, 32, 32, 1)         0         
_________________________________________________________________
normalization (Normalization (None, 32, 32, 1)         3         
_________________________________________________________________
conv2d (Conv2D)              (None, 30, 30, 32)        320       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 14, 14, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 12544)             0

In [15]:
# Discard the current model, if any, before building the recurrent one. The
# guard keeps the cell from failing when it is executed more than once.
try:
    del model
except NameError:
    pass

RNN

In [13]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, InputLayer, GRU
from tensorflow import keras

In [None]:
# LSTM classifier over raw waveforms: 16000 time steps with one feature each.
input_shape = (16000, 1)
model = keras.Sequential([
    InputLayer(input_shape=input_shape),
    norm_layer,
    LSTM(128, input_shape=input_shape),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(48, activation='relu'),
    Dropout(0.4),
    Dense(24, activation='softmax'),
])
model.summary()

In [16]:
# GRU classifier over raw waveforms (16000 time steps, one feature each).
# `inputs` is used instead of `input` so the Python builtin is not shadowed.
inputs = tf.keras.Input(shape=(16000,1))
# Normalize the raw waveform first: `norm_layer` is adapted on the model
# inputs, so its statistics do not describe the Dense activations it was
# previously applied to.
x = norm_layer(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
# Apart from `dropout`, the arguments below spell out the Keras defaults;
# presumably they are listed to keep the layer eligible for the cuDNN kernel.
x = tf.keras.layers.GRU(32,recurrent_activation='sigmoid',
                               activation='tanh',
                               recurrent_dropout=0,
                               unroll=False,
                               use_bias=True,
                               reset_after=True,
                               dropout=0.1)(x)
x = tf.keras.layers.Dense(16, activation='relu')(x)
x = tf.keras.layers.Dense(8, activation='relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
# NOTE(review): the output width is hard-coded to 3 - confirm it matches the
# number of distinct classes in the label array.
y = tf.keras.layers.Dense(3, activation='softmax')(x)
model = tf.keras.models.Model(inputs=[inputs], outputs=[y])

In [17]:
# Integer class ids as targets, probability outputs from the model, and Adam
# with a very small learning rate.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=["accuracy"],
)

In [19]:
# Rebuild the train/validation datasets. prepare_datasets() reshuffles the
# file list on each call, so this split differs from any earlier one.
# NOTE(review): if norm_layer was adapted on an earlier train_ds, confirm that
# re-splitting here is intended.
train_ds, val_ds = prepare_datasets()

In [None]:
# Train on the training split and evaluate on the validation split after
# every epoch; the returned History object keeps the per-epoch metrics.
EPOCHS = 200
history = model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/200

In [17]:
# Persist the trained model as an HDF5 file in the working directory.
model.save("audio_model.h5")


TEST

In [12]:
def flat(spectrogram):
    """Append a trailing axis of size 1 to `spectrogram`."""
    return spectrogram[..., tf.newaxis]

In [16]:
# Inference pipeline for one recording:
# path -> bytes -> waveform -> spectrogram -> trailing axis -> batch of 1.
file = ["/home/andrea/Scrivania/audio_only.wav"]
test_audio = tf.data.Dataset.from_tensor_slices(file)
io_audio = test_audio.map(
    tf.io.read_file, num_parallel_calls=tf.data.experimental.AUTOTUNE)
waveform = io_audio.map(
    decode_audio, num_parallel_calls=tf.data.experimental.AUTOTUNE)
spectrogram = waveform.map(
    get_spectrogram, num_parallel_calls=tf.data.experimental.AUTOTUNE)
spectrogram = spectrogram.map(
    flat, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test = spectrogram.batch(1)
test = test.cache().prefetch(tf.data.experimental.AUTOTUNE)

In [9]:
# Reload the trained model from the HDF5 file "audio_model.h5" (relative path).
model = tf.keras.models.load_model("audio_model.h5")

In [17]:
# Predicted label id per example: the index of the largest class score.
# `predict_classes` only exists on Sequential models and is deprecated, so
# the argmax is taken explicitly; this also works for functional models.
np.argmax(model.predict(test), axis=-1)

array([0])