In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import math
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras

caused by: ['/home/ubuntu/anaconda3/envs/tensorflow2_latest_p37/lib/python3.7/site-packages/tensorflow_io/core/python/ops/libtensorflow_io.so: undefined symbol: _ZN10tensorflow2io20InputStreamInterface10SkipNBytesEl']


In [2]:
import IPython.display as ipd
import librosa

In [4]:
def get_paths_and_labels(audio_directory_path):
#     class_names = os.listdir(audio_directory_path)
    class_names = ['english', 'spanish']
#     class_names.remove('.DS_Store')
    label_class_dict = {num:class_ for num, class_ in enumerate(class_names)}
    
    audio_paths = []
    labels = []

    for label, name in enumerate(class_names):
        dir_path = os.path.join(audio_directory_path, name)
        full_paths = [os.path.join(dir_path, filename) for filename in os.listdir(dir_path) if 'ipynb' not in filename and '.DS_Store' not in filename]
        audio_paths += full_paths
        labels += [label] * len(full_paths)
    
    return audio_paths, labels, label_class_dict

def path_to_audio(path):
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, desired_samples=16000*7)
    return audio

def create_dataset(audio_paths, labels):
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))

In [5]:
train_path = '../../data/data/recordings/cleaned_set/cleaned_train_set'
test_path = '../../data/data/recordings/cleaned_set/cleaned_test_set'

In [6]:
batch_size = 12

train_audio_paths, train_labels, train_label_class_dict = get_paths_and_labels(train_path)
train_ds = create_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=len(train_ds)).batch(batch_size)
train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

test_audio_paths, test_labels, test_label_class_dict = get_paths_and_labels(test_path)
test_ds = create_dataset(test_audio_paths, test_labels)
test_ds = test_ds.shuffle(buffer_size=len(test_ds)).batch(batch_size)
test_ds = test_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [7]:
train_label_class_dict

{0: 'english', 1: 'spanish'}

In [8]:
import os
root_logdir = os.path.join(os.curdir, "model_logs")

def get_run_logdir():
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

In [9]:
def residual_block(x, filters, conv_num=3, activation="relu", pool_size=3, strides=3):
    s = keras.layers.Conv1D(filters, 1, padding="same")(x)
    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding="same")(x)
        x = keras.layers.Activation(activation)(x)
    x = keras.layers.Conv1D(filters, 3, padding="same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)
    return keras.layers.MaxPool1D(pool_size=pool_size, strides=strides)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape=input_shape, name="input")

    x = residual_block(inputs, 8, 2, 'relu')
    x = residual_block(x, 16, 2, 'relu')
    x = residual_block(x, 32, 3, 'relu')
    x = residual_block(x, 64, 3, 'relu')
    x = residual_block(x, 64, 3, 'relu')
    x = residual_block(x, 128, 3, 'relu')

    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(128, activation="relu")(x)
    x = keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.2)(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)


model = build_model((16000*7, 1), 2)

model.summary()

model.compile(
    optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=['accuracy']
)


model_save_filename = "model.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=25, restore_best_weights=True)
modelcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor='val_accuracy', save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(get_run_logdir())


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 112000, 1)]  0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 112000, 8)    32          input[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 112000, 8)    0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 112000, 8)    200         activation[0][0]                 
______________________________________________________________________________________________

In [80]:
# cw = {0: 25.595744680851062,
#  1: 23.134615384615383,
#  2: 18.50769230769231,
#  3: 32.513513513513516,
#  4: 25.0625,
#  5: 2.077720207253886,
#  6: 25.0625,
#  7: 19.095238095238095,
#  8: 7.425925925925926,
#  9: 11.794117647058824}

cw = {0:1,
     1:5}

In [10]:
history = model.fit(train_ds, 
                    epochs=100, 
                    validation_data=test_ds,
                    callbacks=[earlystopping_cb, modelcheckpoint_cb, tensorboard_cb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

KeyboardInterrupt: 

In [82]:
np.argmax(model.predict(test_ds), axis=1)

array([1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0])

In [301]:
def residual_block(x, filters, i, kernel_size=3, activation="relu", pool_size=3, strides=3):
    s = keras.layers.Conv1D(filters, 1, padding="same")(x)
    tanh_out = keras.layers.Conv1D(filters, 
                      kernel_size, 
                      dilation_rate = kernel_size**i, 
                      padding='causal', 
                      activation='tanh'
                      )(x)
    sigm_out = keras.layers.Conv1D(filters, 
                      kernel_size, 
                      dilation_rate = kernel_size**i, 
                      padding='causal', 
                      activation='sigmoid'
                      )(x)
    x = keras.layers.Conv1D(filters, 3, padding="same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)
    return keras.layers.MaxPool1D(pool_size=pool_size, strides=strides)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape=input_shape, name="input")
    
    
    x = residual_block(inputs, 4, 1, 7, 'relu')
    x = residual_block(x, 8, 2, 5, 'relu')
    x = residual_block(x, 16, 3, 3, 'relu')
    x = residual_block(x, 32, 4, 3, 'relu')
    x = residual_block(x, 32, 5, 3, 'relu')
    x = residual_block(x, 64, 6, 3, 'relu')


    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(64, activation="relu")(x)
    x = keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(16, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)


model = build_model((16000*7, 1), 2)

model.summary()

model.compile(optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])


model_save_filename = "model.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=25, restore_best_weights=True)
modelcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor='val_accuracy', save_best_only=True)
tensorboard_cb = keras.callbacks.TensorBoard(get_run_logdir())


Model: "model_76"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 112000, 1)]  0                                            
__________________________________________________________________________________________________
conv1d_1758 (Conv1D)            (None, 112000, 4)    16          input[0][0]                      
__________________________________________________________________________________________________
conv1d_1755 (Conv1D)            (None, 112000, 4)    8           input[0][0]                      
__________________________________________________________________________________________________
add_446 (Add)                   (None, 112000, 4)    0           conv1d_1758[0][0]                
                                                                 conv1d_1755[0][0]         

In [302]:
history = model.fit(train_ds, 
                    epochs=250, 
                    validation_data=test_ds,
                    callbacks=[earlystopping_cb, modelcheckpoint_cb, tensorboard_cb])

Epoch 1/250
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250


In [161]:
np.argmax(model.predict(test_ds), axis=1)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0])

In [None]:
lab = []
for audios, labels in test_ds.take(1):

In [260]:
def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension,
    # we need to squeeze the dimensions and then expand them again
    # after FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT
    # which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

In [279]:
audio = path_to_audio('../../data/data/recordings/cleaned_set/cleaned_train_set/english/english102.wav')