In [1]:
import numpy as np
import tensorflow as tf
#import tensorflow.keras as keras
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET
from pydub import AudioSegment

In [2]:
# Report the TensorFlow build used by this notebook.
print(tf.__version__)
# NOTE: a cell only displays its last expression, so the result of this GPU-only
# query is computed and then discarded.
tf.config.list_physical_devices('GPU')
# List every device TensorFlow can see (the recorded run shows a single CPU device).
tf.config.list_physical_devices()

2.5.0


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [6]:
# Cut every annotated event out of the recordings and collect the clips together
# with their integer class ids.
audio_chunks = []
chunks_classes = []
dataset_path = "/home/andrea/Scrivania/MIVIA_ROAD_DB1/audio/"
folders = ["A/", "B/", "C/", "D/"]
for folder in folders:
    tot_path = os.path.join(dataset_path, folder)
    # Every entry except the "v2" sub-directory is treated as an XML descriptor.
    # Filtering by name (instead of deleting by index) cannot remove a real
    # descriptor when "v2" happens to be missing.
    files = sorted(name for name in os.listdir(tot_path) if name != "v2")
    audio_dir = os.path.join(tot_path, "v2")
    audiofiles = sorted(os.listdir(audio_dir))
    # Descriptors and recordings are paired purely by their position in the two
    # sorted listings.
    for i, file in enumerate(files):
        descriptor = ET.parse(os.path.join(tot_path, file))
        # Element.getchildren() was removed in Python 3.9; indexing the element
        # returns the same first child on every Python version.
        root = descriptor.getroot()[0]
        newAudio = AudioSegment.from_wav(os.path.join(audio_dir, audiofiles[i]))
        for child in root:
            # Fields are read by position: [1] class id, [3] start, [4] end.
            cls = child[1].text
            start = round(float(child[3].text), 3)
            end = round(float(child[4].text), 3)
            # pydub slices in milliseconds, hence the *1000 on the parsed values.
            audio_chunks.append(newAudio[start*1000:end*1000])
            chunks_classes.append(int(cls))

In [7]:
# Write every clip as <index>.wav; the index is what get_label later parses back
# out of the file name to look up the class.
split_dir = "/home/andrea/Scrivania/MIVIA_ROAD_DB1/splitted_dataset/"
os.makedirs(split_dir, exist_ok=True)  # export() does not create missing folders
for idx, chunk in enumerate(audio_chunks):
    chunk.export(split_dir+str(idx)+".wav", format="wav")
np_classes = np.asarray(chunks_classes)
# Store the labels next to (not inside) the clip folder: the loading cell reads
# them from this location, and the clip folder is later listed wholesale with
# every file name parsed as an integer index.
np.save("/home/andrea/Scrivania/MIVIA_ROAD_DB1/classes.npy", np_classes)

In [2]:
# Reload the per-clip class ids saved by the extraction step.
np_classes = np.load("/home/andrea/Scrivania/MIVIA_ROAD_DB1/classes.npy")
# Tensor copy of the labels so get_label can index it from inside a tf.data map.
classes_tensor = tf.convert_to_tensor(np_classes)
# Sorted distinct class values; a value's position here is used as its label id.
classes = np.unique(np_classes)

In [3]:
# Collect the full path of every exported clip.
split_dir = "/home/andrea/Scrivania/MIVIA_ROAD_DB1/splitted_dataset/"
files = [
    split_dir + name
    # os.listdir returns entries in arbitrary order; sort for a reproducible list.
    for name in sorted(os.listdir(split_dir))
    # get_label parses the file stem as an integer, so any stray non-clip file
    # in this folder would break the pipeline.
    if name.endswith(".wav")
]
files_ds = tf.data.Dataset.from_tensor_slices(files)

In [4]:
def decode_audio(audio_binary):
  """Decode the raw bytes of a WAV file into a waveform tensor.

  The decoder is asked for exactly 128000 samples and its sample-rate output is
  ignored. The last axis is squeezed away, which assumes single-channel audio
  (TODO confirm for every clip).
  """
  decoded = tf.audio.decode_wav(audio_binary, desired_samples=128000)
  samples = decoded[0]  # element 1 is the sample rate, unused here
  return tf.squeeze(samples, axis=-1)

In [5]:
def get_label(file_path):
    """Look up the class of a clip from the integer index in its file name.

    The last path component is taken, the text before its first "." is parsed
    as an int32 index, and that index selects an entry of the module-level
    `classes_tensor`.
    """
    filename = tf.strings.split(file_path, os.path.sep)[-1]
    stem = tf.strings.split(filename, ".")[0]
    index = tf.strings.to_number(stem, out_type=tf.int32)
    return tf.convert_to_tensor(classes_tensor[index])

In [6]:
def get_waveform_and_label(file_path):
  """Map a clip path to a `(waveform, label)` pair.

  The file is read and decoded with `decode_audio`; the label comes from the
  file name via `get_label`.
  """
  waveform = decode_audio(tf.io.read_file(file_path))
  return waveform, get_label(file_path)

In [18]:
# Decode every clip lazily; tf.data.AUTOTUNE (== -1) lets tf.data choose the parallelism.
waveform_ds = files_ds.map(
    get_waveform_and_label,
    num_parallel_calls=tf.data.AUTOTUNE,
)

In [7]:
def get_spectrogram(waveform, target_samples=128000):
  """Turn a 1-D waveform into a magnitude spectrogram of fixed size.

  Args:
    waveform: 1-D tensor of audio samples.
    target_samples: Number of samples every clip is clipped or zero-padded to
      before the STFT. The default matches the length requested in
      `decode_audio`.

  Returns:
    Float tensor with the absolute value of the STFT (frame_length=255,
    frame_step=128). With the default length, the recorded run shows 999 frames
    by 129 frequency bins per clip.
  """
  waveform = tf.cast(waveform, tf.float32)
  # Clip longer inputs so the padding length computed below can never be negative.
  waveform = waveform[:target_samples]

  # Zero-pad clips shorter than `target_samples` so all clips share one length.
  zero_padding = tf.zeros([target_samples] - tf.shape(waveform), dtype=tf.float32)
  equal_length = tf.concat([waveform, zero_padding], 0)

  spectrogram = tf.signal.stft(
      equal_length, frame_length=255, frame_step=128)

  # Keep only the magnitude; the phase is discarded.
  return tf.abs(spectrogram)

In [8]:
def get_spectrogram_and_label_id(audio, label):
  """Convert a `(waveform, label)` pair into `(spectrogram, label_id)`.

  The spectrogram gets a trailing axis of size 1, and the label is replaced by
  the position of its value inside the module-level `classes` array.
  """
  spectrogram = tf.expand_dims(get_spectrogram(audio), axis=-1)
  matches = label == classes  # True only at the slot of this label's value
  return spectrogram, tf.argmax(matches)

In [21]:
# Spectrogram + label-id view of the complete (unsplit) clip set.
spectrogram_ds = waveform_ds.map(
    get_spectrogram_and_label_id,
    num_parallel_calls=tf.data.AUTOTUNE,  # same value as the literal -1
)


In [11]:

def _build_batched_split(paths, batch_size):
    """Build the path -> waveform -> spectrogram pipeline for one split, batched and cached."""
    # An explicit string dtype keeps an empty split from being inferred as float32.
    ds = tf.data.Dataset.from_tensor_slices(tf.constant(paths, dtype=tf.string))
    ds = ds.map(get_waveform_and_label, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(get_spectrogram_and_label_id, num_parallel_calls=tf.data.AUTOTUNE)
    return ds.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)


def prepare_datasets(train_perc=0.8, batch_size=32, seed=None):
    """Randomly split the clip files into training and validation datasets.

    Args:
        train_perc: Fraction of the files assigned to the training split.
        batch_size: Number of examples per batch in both returned datasets.
        seed: Optional seed for a reproducible shuffle. When None, the global
            `random` module state is used.

    Returns:
        `(train_ds, val_ds)`: batched, cached and prefetched datasets of
        `(spectrogram, label_id)` pairs built from disjoint sets of files.
    """
    import random

    # Shuffle a copy so repeated calls do not keep reordering the global `files`.
    shuffled = list(files)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    boundary = int(len(shuffled) * train_perc)
    train_files = shuffled[:boundary]
    val_files = shuffled[boundary:]

    # Labels are recovered from the file names inside the pipeline, so no
    # separate label arrays need to be sliced here.
    train_ds = _build_batched_split(train_files, batch_size)
    # The validation split must be built from the held-out files, not from the
    # training pipeline.
    val_ds = _build_batched_split(val_files, batch_size)
    return train_ds, val_ds

In [12]:
# Build the batched training and validation pipelines with the default train_perc.
train_ds, val_ds = prepare_datasets()

In [15]:
# NOTE(review): this cell rebinds train_ds to the full, unsplit spectrogram_ds, so it
# only works when the cell defining spectrogram_ds has run in the current kernel.
# The recorded output below is a NameError because it had not.
batch_size = 32
train_ds = spectrogram_ds.batch(batch_size)
# Cache the batches after the first pass and overlap loading with consumption.
train_ds = train_ds.cache().prefetch(-1)

NameError: name 'spectrogram_ds' is not defined

In [13]:
from tensorflow.keras import layers, models
from tensorflow.keras.layers.experimental import preprocessing

# Fit the normalization statistics on the training spectrograms only (labels dropped).
norm_layer = preprocessing.Normalization()
norm_layer.adapt(train_ds.map(lambda spec, _label: spec))

# Peek at one batch to report the tensor shape, batch axis included.
spectrogram, _ = next(iter(train_ds.take(1)))
input_shape = spectrogram.shape
print('Input shape:', input_shape)

# One output unit per distinct class value.
num_labels = len(classes)

Input shape: (32, 999, 129, 1)


In [27]:
# Discard any model left over from an earlier run. Unlike a bare `del model`, this
# is a no-op (not a NameError) when the notebook is run top to bottom on a fresh
# kernel, where `model` is only defined in a later cell.
globals().pop("model", None)



In [14]:
# Small CNN over down-sampled spectrograms. Layers are created in the same order
# as they are stacked so the automatically generated layer names are unchanged.
spectrogram_input = layers.Input(shape=(999,129,1))

# Shrink every spectrogram to 32x32, then apply the adapted normalization.
input_stage = [
    preprocessing.Resizing(32, 32),
    norm_layer,
]

# Two 3x3 convolutions followed by pooling and light dropout.
conv_stage = [
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
]

# Dense head; the last layer has no activation, so it emits raw logits.
classifier_stage = [
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
]

model = models.Sequential(
    [spectrogram_input, *input_stage, *conv_stage, *classifier_stage]
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resizing (Resizing)          (None, 32, 32, 1)         0         
_________________________________________________________________
normalization (Normalization (None, 32, 32, 1)         3         
_________________________________________________________________
conv2d (Conv2D)              (None, 30, 30, 32)        320       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 14, 14, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 12544)             0

In [15]:
# Labels are integer ids and the final Dense layer has no activation, hence the
# sparse categorical cross-entropy computed from logits.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [32]:
# NOTE(review): prepare_datasets() shuffles the file list on every call, so in
# top-to-bottom order this draws a new random split after norm_layer was already
# adapted on the previous train_ds.
train_ds, val_ds = prepare_datasets()

In [16]:
# Upper bound on the number of passes over train_ds; the recorded run below was
# interrupted by hand before reaching it.
EPOCHS = 100
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100

KeyboardInterrupt: 

In [17]:
# Persist the trained network as a single HDF5 file in the working directory.
model.save("audio_model.h5")