In [2]:
import os
import shutil
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.keras import models, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import *
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa
from tensorflow.keras import backend as K
import random
import tensorflow_hub as hub
import math

In [3]:
seed = 69
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)
sample_rate = 16000

train_data_dir = 'data/combined_normalized_16khz_TF'
val_data_dir = 'data/TIL_ans'
classes = np.array(os.listdir(train_data_dir))
num_classes = len(classes)

classes_index = {label: index for index, label in enumerate(classes)}
print('Classes:', classes_index)

x_train = tf.io.gfile.glob(train_data_dir + '/*/*.wav')
x_val = tf.io.gfile.glob(val_data_dir + '/*/*.wav')
random.shuffle(x_train)
random.shuffle(x_val)
train_labels = [classes_index[f.split(os.path.sep)[-2]] for f in x_train]
val_labels = [classes_index[f.split(os.path.sep)[-2]] for f in x_val]
num_samples = len(x_train) + len(x_val)
print('Number of train examples:', num_samples)

Classes: {'angry': 0, 'fear': 1, 'happy': 2, 'neutral': 3, 'sad': 4}
Number of train examples: 16501


In [49]:
wav = tf.io.read_file('data/TIL_train_normalized/angry/a2e7652234.wav')
wav, sr = tf.audio.decode_wav(wav, desired_channels=1)
wav = tf.squeeze(wav, axis=-1)
sr = tf.cast(sr, dtype=tf.int64)
wav = tfio.audio.resample(wav, rate_in=sr, rate_out=sample_rate)
position = tfio.audio.trim(wav, axis=0, epsilon=0.1)  # epsilon = noise gate (how loud between 0 to 1)
#wav = wav[position[0]:position[1]]
if len(wav) > 3*sample_rate:
      to_cut = len(wav) - 3*sample_rate
      wav = wav[tf.convert_to_tensor(math.floor(to_cut/2)):tf.convert_to_tensor(-(math.ceil(to_cut/2)))]
else:
      zero_padding = tf.zeros(([3*sample_rate] - tf.shape(wav)), dtype=tf.float32)  # pad to 3 seconds
      wav = tf.concat([wav, zero_padding], 0)

int

In [7]:
@tf.function(jit_compile=False)
def preprocess(filepath):
      wav = tf.io.read_file(filepath)
      wav, sr = tf.audio.decode_wav(wav, desired_channels=1)
      wav = tf.squeeze(wav, axis=-1)
      if no_norm: wav *= 32768.0
      sr = tf.cast(sr, dtype=tf.int64)
      # wav = tfio.audio.resample(wav, rate_in=sr, rate_out=sample_rate)
      position = tfio.audio.trim(wav, axis=0, epsilon=0.1)  # epsilon = noise gate (how loud between 0 to 1)
      wav = wav[position[0]:position[1]]
      if len(wav) > 3*sample_rate:
            to_cut = len(wav) - 3*sample_rate
            wav = wav[:-to_cut]
      else:
            zero_padding = tf.zeros(([3*sample_rate] - tf.shape(wav)), dtype=tf.float32)  # pad to 3 seconds
            wav = tf.concat([wav, zero_padding], 0)
      return wav, tf.one_hot(tf.argmax(tf.strings.split(filepath, os.path.sep)[-2] == classes), len(classes))


@tf.function(jit_compile=False)
def preprocess_val(filepath):
      wav = tf.io.read_file(filepath)
      wav, sr = tf.audio.decode_wav(wav, desired_channels=1)
      wav = tf.squeeze(wav, axis=-1)
      if no_norm: wav *= 32768.0
      sr = tf.cast(sr, dtype=tf.int64)
      # wav = tfio.audio.resample(wav, rate_in=sr, rate_out=sample_rate)
      position = tfio.audio.trim(wav, axis=0, epsilon=0.1)  # epsilon = noise gate (how loud between 0 to 1)
      wav = wav[position[0]:position[1]]
      if len(wav) > 3*sample_rate:
            to_cut = len(wav) - 3*sample_rate
            wav = wav[:-to_cut]
      else:
            zero_padding = tf.zeros(([3*sample_rate] - tf.shape(wav)), dtype=tf.float32)  # pad to 3 seconds
            wav = tf.concat([wav, zero_padding], 0)
      return wav, tf.one_hot(tf.argmax(tf.strings.split(filepath, os.path.sep)[-2] == classes), len(classes))


no_norm = False  # set to False if model expects between -1 and 1, else set to True and let the model do preprocessing
train_ds = tf.data.Dataset.from_tensor_slices(x_train).map(preprocess, num_parallel_calls=-1)
val_ds = tf.data.Dataset.from_tensor_slices(x_val).map(preprocess_val, num_parallel_calls=-1)

batch_size = 32
train_ds = train_ds.batch(batch_size).cache().prefetch(-1)
val_ds = val_ds.batch(batch_size).cache().prefetch(-1)

In [5]:
# Training
# NO preprocessing needed! Already scaled
xInput = Input((3*sample_rate,))
m = hub.KerasLayer('https://tfhub.dev/google/trillsson1/1')
x = m(xInput)['embedding']
# x = Dropout(0.1)(x)
x = Dense(128, activation='swish')(x)
# x = Dropout(0.1)(x)
# x = Dense(128, activation='swish')(x)
# x = Dropout(0.1)(x)
xOutput = Dense(len(classes))(x)  # no activation as loss using logit=True
model = tf.keras.models.Model(xInput, xOutput)

In [6]:
epoch = 30
# opt_lr_schedule = tf.keras.optimizers.schedules.CosineDecay(8e-4, epoch * 988)
# opt = tf.keras.optimizers.Adam(learning_rate=5e-4)
opt = tfa.optimizers.AdamW(learning_rate=1e-4, weight_decay=5e-5)
opt = tfa.optimizers.Lookahead(opt)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1)
metrics = ['accuracy', tfa.metrics.F1Score(num_classes=len(classes), average='weighted', threshold=0.5)]
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', min_delta=0, patience=6, verbose=1, mode='max', restore_best_weights=True),
    # tf.keras.callbacks.ModelCheckpoint('./best_model',monitor='val_accuracy',save_best_only=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_f1_score', mode='max', factor=0.1, patience=3, verbose=1)
]
model.compile(optimizer=opt, loss=loss, metrics=metrics)
plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, to_file='model.png')
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 48000)]           0         
                                                                 
 keras_layer (KerasLayer)    {'embedding': (None, 102  5488704   
                             4)}                                 
                                                                 
 dense (Dense)               (None, 128)               131200    
                                                                 
 dense_1 (Dense)             (None, 5)                 645       
                                                                 
Total params: 5,620,549
Trainable params: 131,845
Non-trainable params: 5,488,704
_________________________________________________________________


In [8]:
history = model.fit(train_ds, epochs=epoch, validation_data=val_ds, batch_size=batch_size, callbacks=callbacks, use_multiprocessing=True, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
 11/497 [..............................] - ETA: 1:56 - loss: 1.0345 - accuracy: 0.6960 - f1_score: 0.4806

KeyboardInterrupt: 

In [62]:
model.evaluate(val_ds, verbose=1)



[1.5058115720748901, 0.4399999976158142, 0.4106185734272003]

In [59]:
model.save('models/trillsson5')

INFO:tensorflow:Assets written to: models/trillsson5\assets


INFO:tensorflow:Assets written to: models/trillsson5\assets
