# Step 1. Prepare the VocalSound Data

**Step 1.1** Mount your Google Drive to this Colab, as the VocalSound dataset is quite large (2.5 GB). You will be asked to complete a few security checks in this step.

In [None]:
import os

from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted Drive that holds the archive, the audio and the json manifests.
data_dir = '/content/drive/MyDrive/vocalsound_baseline'
if os.path.isdir(data_dir):
    print('data path already exists')
else:
    # makedirs also creates any missing parent folder, and exist_ok avoids a
    # crash if the folder appears between the check above and this call.
    os.makedirs(data_dir, exist_ok=True)

Mounted at /content/drive
data path already exists


In [None]:
!nvidia-smi

Thu Jun  1 12:20:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

**Step 1.2** Download and unzip the VocalSound (16kHz version) dataset. The unzip process takes up to 20 minutes, so please be patient.

In [None]:
# Fetch the 16 kHz VocalSound archive only when it is not on Drive yet, and
# extract it only when the audio folder is missing, so re-running this cell is
# cheap and does not try to overwrite an existing extraction.
import subprocess

vs_root = '/content/drive/MyDrive/vocalsound_baseline'
vs_zip = vs_root + '/vs_release.zip'

if not os.path.exists(vs_zip):
    print('Downloading and uncompressing the VocalSound dataset, it takes up to 20 minutes, please be patient.')
    # check=True raises if wget fails instead of silently leaving a broken archive.
    subprocess.run(
        ['wget', 'https://www.dropbox.com/s/c5ace70qh1vbyzb/vs_release_16k.zip?dl=1', '-O', vs_zip],
        check=True,
    )

if not os.path.isdir(vs_root + '/audio_16k'):
    unzip_result = subprocess.run(['unzip', '-q', vs_zip, '-d', vs_root + '/'])
    # unzip exits with 1 for warnings (extraction still completed); anything
    # above that is a real failure that must not go unnoticed.
    if unzip_result.returncode > 1:
        raise RuntimeError('unzip failed with exit status %d' % unzip_result.returncode)

256

In [None]:
# Count the entries in the audio folder to see how many files are present after the unzip.
len(os.listdir('/content/drive/MyDrive/vocalsound_baseline/audio_16k'))

21024

**Step 1.3** Automatically correct the path in the data json file.

In [None]:
import json

def get_immediate_files(a_dir):
    """Return the names of the regular files located directly inside `a_dir`.

    Sub-directories are left out and nothing is searched recursively; names
    come back in the order `os.listdir` reports them.
    """
    file_names = []
    for entry in os.listdir(a_dir):
        if os.path.isfile(os.path.join(a_dir, entry)):
            file_names.append(entry)
    return file_names

def change_path(json_file_path, target_path):
    """Point the "wav" field of every entry in a data json at `target_path`/audio_16k.

    Only the file name (the text after the last '/') of each original path is
    kept. The json file is rewritten in place as `{"data": [...]}` with an
    indent of 1.
    """
    with open(json_file_path, 'r') as src:
        entries = json.load(src)['data']

    # change the path in the json file
    audio_prefix = target_path + '/audio_16k/'
    for entry in entries:
        entry["wav"] = audio_prefix + entry["wav"].split('/')[-1]

    with open(json_file_path, 'w') as dst:
        json.dump({'data': entries}, dst, indent=1)

# Rewrite the wav paths of every json manifest: first the train/validation/test
# files directly under datafiles/, then the subtest sets.
for manifest_subdir in ('/datafiles/', '/datafiles/subtest/'):
    manifest_dir = data_dir + manifest_subdir
    for json_f in get_immediate_files(manifest_dir):
        if not json_f.endswith('.json'):
            continue
        print('now processing ' + manifest_dir + json_f)
        change_path(manifest_dir + json_f, data_dir)


now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/tr.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/te.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/val.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/all.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/subtest/te_age3.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/subtest/te_male.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/subtest/te_age2.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/subtest/te_age1.json
now processing /content/drive/MyDrive/vocalsound_baseline/datafiles/subtest/te_female.json


# Step 2. TensorFlow Dataset

In [None]:
!pip install tensorflow_addons
!pip install transformers==4.27.4
# !pip install audiomentations
!pip install kapre
!pip install pillow==9.5.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.20.0 typeguard-2.13.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.27.4
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.27.4)
  Downloading huggingface_h

In [None]:
import os
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import ViTImageProcessor, TFViTModel
from kapre.augmentation import SpecAugment
import librosa
import numpy as np
import time
import pandas as pd
import json


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
def prepare_dataset(audio_files, labels=None, batch_size=32, shuffle=False, max_length=5,repeat=True,
                    sample_rate=16000, n_mels=128):
    """Build a batched `tf.data.Dataset` of min-max normalised log-mel spectrograms.

    Args:
        audio_files: sequence of audio file paths.
        labels: optional sequence of integer class ids aligned with
            `audio_files`. When given, elements are `(spectrogram, label)`
            pairs; when `None`, elements are 1-tuples `(spectrogram,)`.
        batch_size: number of clips per batch.
        shuffle: shuffle the clips with a buffer covering the whole dataset.
        max_length: clip length in seconds; shorter clips are zero-padded at
            the end and longer ones are truncated.
        repeat: repeat the batched dataset indefinitely.
        sample_rate: rate (Hz) the audio is resampled to when loaded.
        n_mels: number of mel bands of the spectrogram.

    Returns:
        The batched (and optionally shuffled / repeated) dataset.
    """
    length = max_length * sample_rate

    def load_spectrogram(file_path):
        # Load the signal and force it to exactly `length` samples
        signal, sr = librosa.load(file_path, sr=sample_rate)
        if len(signal) < length:
            signal = np.pad(signal, (0, length - len(signal)), 'constant')
        else:
            signal = signal[:length]

        # Compute the log-mel spectrogram
        spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
        spectrogram = librosa.power_to_db(S=spectrogram, ref=np.max)

        # Min-max normalise. A constant spectrogram (e.g. a silent clip) has a
        # zero range; dividing by it would fill the sample with NaN, so such
        # clips are mapped to all zeros instead.
        spectrogram_min, spectrogram_max = spectrogram.min(), spectrogram.max()
        value_range = spectrogram_max - spectrogram_min
        if value_range > 0:
            spectrogram = (spectrogram - spectrogram_min) / value_range
        else:
            spectrogram = np.zeros_like(spectrogram)

        return spectrogram.astype('float32')

    def load_audio(file_path, label):
        return load_spectrogram(file_path), np.array(label, np.int64)

    # `is not None` instead of truthiness: a numpy array of labels has no
    # unambiguous truth value.
    has_labels = labels is not None
    if has_labels:
        dataset = tf.data.Dataset.from_tensor_slices((audio_files, labels))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(audio_files)

    # Shuffle the (cheap) file paths before decoding, so the dataset-sized
    # shuffle buffer never has to hold every decoded spectrogram.
    if shuffle:
        dataset = dataset.shuffle(len(dataset))

    if has_labels:
        dataset = dataset.map(lambda x,y: tf.numpy_function(load_audio, [x,y], [tf.float32,tf.int64]), num_parallel_calls=tf.data.AUTOTUNE)
    else:
        # The list-valued Tout keeps each element a 1-tuple, which is what
        # consumers indexing the batch with [0] rely on.
        dataset = dataset.map(lambda x: tf.numpy_function(load_spectrogram, [x], [tf.float32]), num_parallel_calls=tf.data.AUTOTUNE)

    dataset = dataset.batch(batch_size=batch_size)
    if repeat:
        dataset = dataset.repeat()

    return dataset

In [None]:
# Locations of the VocalSound files on the mounted Google Drive.
_VS_ROOT = "/content/drive/MyDrive/vocalsound_baseline"
DIR = f"{_VS_ROOT}/audio_16k"
label_file = f"{_VS_ROOT}/class_labels_indices_vs.csv"
train_json = f"{_VS_ROOT}/datafiles/tr.json"
test_json = f"{_VS_ROOT}/datafiles/te.json"
val_json = f"{_VS_ROOT}/datafiles/val.json"

In [None]:
def split_ssl_data(data, label_percent=0.2, num_classes=6):
    """Split a labeled file list into a small labeled part and an unlabeled pool.

    For each class id in `range(num_classes)` the first
    `int(label_percent * class_count)` files of that class (in input order)
    keep their label; the remaining files of the class go to the unlabeled
    pool. Results are grouped class by class.

    Args:
        data: `(files, labels)` pair of equally long sequences.
        label_percent: fraction of every class that stays labeled.
        num_classes: number of class ids to scan.

    Returns:
        `((labeled_files, labeled_labels), unlabeled_files)` as plain lists.
        With `num_classes == 0` all three lists are empty.
    """
    files, labels = data

    np_files = np.array(files)
    np_labels = np.array(labels)

    labeled_files, labeled_labels, unlabeled_files = [], [], []
    for class_idx in range(num_classes):
        mask = np_labels == class_idx
        # count_nonzero counts in C instead of iterating the mask in Python
        sz = int(label_percent * np.count_nonzero(mask))
        class_files = np_files[mask]
        labeled_files.append(class_files[:sz])
        labeled_labels.append(np_labels[mask][:sz])
        unlabeled_files.append(class_files[sz:])

    if not labeled_files:
        # no class to scan: nothing is labeled and nothing is unlabeled
        return ([], []), []

    # concatenate once at the end rather than growing the arrays class by class
    return (
        (np.concatenate(labeled_files).tolist(), np.concatenate(labeled_labels).tolist()),
        np.concatenate(unlabeled_files).tolist(),
    )



In [None]:
# Map each class identifier (the `mid` column of the label table) to its row position.
label_df = pd.read_csv(label_file)
label2id = {mid: row_pos for row_pos, mid in enumerate(label_df["mid"])}

In [None]:
def parse_json(json_file):
    """Read a data json and return its wav paths and integer labels.

    Every entry of the file's "data" list contributes its "wav" value and the
    `label2id` id of its "labels" value.

    Returns:
        `(files, labels)`, two lists in the order of the entries in the file.
    """
    with open(json_file, mode="r") as fin:
        records = json.load(fin)["data"]

    files, labels = [], []
    for record in records:
        files.append(record["wav"])
        labels.append(label2id[record["labels"]])

    return files, labels

In [None]:
# Parse the train, test and validation manifests (in that order) into
# parallel (files, labels) lists.
(train_files, train_labels), (test_files, test_labels), (val_files, val_labels) = (
    parse_json(split_json) for split_json in (train_json, test_json, val_json)
)

In [None]:
# Keep the first 5% of each class's training files as the labeled set; all
# remaining training files form the unlabeled pool (their labels are dropped).
labeled, unlabeled = split_ssl_data((train_files, train_labels), label_percent=0.05)

In [None]:
# Training streams: both are shuffled and, with prepare_dataset's default
# repeat=True, never run out of batches.
labeled_files, labeled_targets = labeled
labeled_ds = prepare_dataset(
    audio_files=labeled_files,
    labels=labeled_targets,
    batch_size=64,
    shuffle=True,
)
unlabeled_ds = prepare_dataset(
    audio_files=unlabeled,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Evaluation splits: a single pass each (repeat=False). They are not shuffled:
# the metrics accumulated by test_step do not depend on sample order, and a
# dataset-sized shuffle buffer would only have to be filled before the first
# batch is served.
test_ds = prepare_dataset(audio_files=test_files, labels=test_labels, batch_size=32, shuffle=False, repeat=False)
val_ds = prepare_dataset(audio_files=val_files, labels=val_labels, batch_size=32, shuffle=False, repeat=False)

# Step 3. Train and evaluate

In [None]:
class AudioFixMatch(tf.keras.Model):
    """ViT encoder plus a dense softmax head, evaluated on two SpecAugment views.

    `call` takes channel 0 of the input, builds a strongly and a weakly
    augmented copy, encodes both with the same ViT and returns the head's
    class probabilities as `(weak_probs, strong_probs)`.
    """

    def __init__(self, encoder_name='google/vit-base-patch16-224', num_classes=6, **kwargs):
        super().__init__(**kwargs)
        self.vit = TFViTModel.from_pretrained(encoder_name)
        self.num_classes = num_classes

        head_layers = [
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(self.num_classes, activation="softmax"),
        ]
        self.cls_head = tf.keras.Sequential(head_layers)

        # The two augmenters differ only in freq_mask_param / time_mask_param (8 vs 2).
        shared_mask_cfg = dict(
            n_freq_masks=2,
            n_time_masks=2,
            mask_value=0.0,
            data_format="channels_first",
        )
        self.strong_augment = SpecAugment(freq_mask_param=8, time_mask_param=8, **shared_mask_cfg)
        self.weak_augment = SpecAugment(freq_mask_param=2, time_mask_param=2, **shared_mask_cfg)

    def call(self, inputs, training):
        # Channel 0 only, kept as a one-channel tensor; both views start from it.
        single_channel = tf.expand_dims(inputs[:, 0, :, :], axis=1)

        strong_view = self.strong_augment(single_channel, training=training)
        weak_view = self.weak_augment(single_channel, training=training)

        # The encoder is fed three identical channels, hence the repeat on axis 1.
        strong_embedding = self.vit(
            pixel_values=tf.repeat(strong_view, 3, axis=1), training=training
        ).pooler_output
        weak_embedding = self.vit(
            pixel_values=tf.repeat(weak_view, 3, axis=1), training=training
        ).pooler_output

        return self.cls_head(weak_embedding), self.cls_head(strong_embedding)

In [None]:
# Two optimizers: train_step applies `optim` to the classification head and
# `optim2` (much smaller learning rate, no weight decay) to the ViT encoder.
optim = tfa.optimizers.AdamW(learning_rate=0.005, weight_decay=0.001)
optim2 = tfa.optimizers.AdamW(learning_rate=0.00001, weight_decay=0.0)

# Loss on integer targets versus predicted class probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Evaluation metrics, shared by the validation and test loops.
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_prec_metric = tf.keras.metrics.Precision(name="precision")
val_recall_metric = tf.keras.metrics.Recall(name="recall")
f1_metric_micro = tfa.metrics.F1Score(
    num_classes=6,
    threshold=0.5,
    average='micro',
    name='f1_micro',
)
f1_metric_macro = tfa.metrics.F1Score(
    num_classes=6,
    threshold=0.5,
    average='macro',
    name='f1_macro',
)

In [None]:
# Build the model (its constructor loads the pretrained ViT encoder) and the
# image processor of the same checkpoint, which the training and evaluation
# loops use to prepare spectrogram batches.
model = AudioFixMatch()
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

Some layers from the model checkpoint at google/vit-base-patch16-224 were not used when initializing TFViTModel: ['classifier']
- This IS expected if you are initializing TFViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit/pooler/dense/kernel:0', 'vit/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# --- loss weights and constants ---
thresh = 0.9             # fixed confidence threshold; train_step in this notebook uses the adaptive thresholds below instead
unsup_weight = 1.0       # weight of the pseudo-label loss on the unlabeled batch
fairness_weight = 0.01   # weight of the fairness term
eps = 1e-7
num_classes = 6          # assigned once (the original cell set it twice)

# --- running statistics updated by train_step (exponential moving averages) ---
tau = tf.Variable(tf.constant(1/num_classes))                             # mean top probability on unlabeled data
p_thresh = tf.Variable(tau * tf.ones((num_classes,),dtype=tf.float32))    # mean probability per class
hist_t = tf.Variable(eps * tf.ones((num_classes,),dtype=tf.float32))      # histogram of predicted classes
eps_t = tf.Variable(eps * tf.ones((num_classes,),dtype=tf.float32))       # added to the batch histogram before dividing by it

ema_decay = tf.constant(0.999)

In [None]:
@tf.function
def sumnorm(x):
  """Scale `x` by the sum of all its elements."""
  total = tf.reduce_sum(x)
  return x / total

@tf.function
def maxnorm(x):
  """Scale `x` by its largest element."""
  peak = tf.reduce_max(x)
  return x / peak

@tf.function
def train_step(x,y,xu,ema_decay):
  """Run one optimisation step on a labeled batch `x`/`y` and an unlabeled batch `xu`.

  `x` and `xu` are indexed with "pixel_values"; `y` holds the integer targets
  of the labeled batch. Uses the module-level `model`, `loss_fn`, optimizers
  and running statistics (`tau`, `p_thresh`, `hist_t`, `eps_t`), and updates
  `tau`, `p_thresh` and `hist_t` in place.

  Returns the total loss: supervised + `unsup_weight` * pseudo-label loss +
  `fairness_weight` * fairness term.
  """

  with tf.GradientTape() as tape:
      # The model returns (weak-view, strong-view) predictions; only the weak
      # view is used for the labeled batch.
      wl,_ = model(x["pixel_values"],training=True)
      wu,su = model(xu["pixel_values"],training=True)

      # Supervised loss on the labeled batch.
      ls = loss_fn(y, wl)

      # EMA updates: tau tracks the mean top probability of the weak unlabeled
      # predictions, p_thresh the mean probability per class.
      local_thresh_update = tf.reduce_mean(tf.reduce_max(wu,axis=1))
      tau.assign(ema_decay * tau + (1-ema_decay) * local_thresh_update)
      p_thresh.assign(ema_decay * p_thresh + (1-ema_decay) * tf.reduce_mean(wu,axis=0))

      # EMA of how many unlabeled samples were predicted as each class.
      hist_t.assign(ema_decay * hist_t + (1-ema_decay) * tf.reduce_sum(tf.one_hot(tf.argmax(wu,axis=1),6),axis=0))
      # Per-class threshold: tau scaled by p_thresh relative to its largest entry.
      sat = tau * maxnorm(p_thresh)
      # Keep the unlabeled samples whose top weak probability reaches the
      # threshold of their predicted class.
      mask = tf.reduce_max(wu,axis=1)>=tf.gather(sat,tf.argmax(wu,axis=1))
      
      su = su[mask]
      wu = wu[mask]
      
      # Pseudo-label loss: the weak view's argmax is the target for the strong view.
      lu = loss_fn(tf.argmax(wu,axis=1),su)
      
      # Fairness term, computed only when at least one sample passed the mask:
      # negative cross-entropy between the normalised running statistics
      # (p_thresh / hist_t) and the normalised batch statistics of the kept
      # strong predictions (mean probability / predicted-class counts).
      lf = 0.0
      if tf.shape(su)[0] != 0:
        p_avg = tf.reduce_mean(su,axis=0)
        h_avg = tf.reduce_sum(tf.one_hot(tf.argmax(su,axis=1),6),axis=0)

        lf = -tf.keras.metrics.categorical_crossentropy(sumnorm(p_thresh/hist_t),sumnorm(p_avg/(eps_t+h_avg)))

      loss = ls + unsup_weight * lu + fairness_weight * lf

  # One backward pass, two optimizers: `optim` updates the classification
  # head, `optim2` updates the ViT encoder.
  grads = tape.gradient(loss, [model.cls_head.trainable_weights, model.vit.trainable_weights])
  optim.apply_gradients(zip(grads[0], model.cls_head.trainable_weights))
  optim2.apply_gradients(zip(grads[1], model.vit.trainable_weights))

  return loss

@tf.function
def test_step(x, y):
    """Feed one evaluation batch to the shared metric objects.

    Only the model's weak-view output (with `training=False`) is scored.
    Accuracy receives the integer targets `y`; precision, recall and both F1
    metrics receive their one-hot encoding over 6 classes.
    """
    probs, _ = model(x["pixel_values"], training=False)
    val_acc_metric.update_state(y, probs)

    one_hot_targets = tf.one_hot(y, 6)
    for metric in (val_prec_metric, val_recall_metric, f1_metric_micro, f1_metric_macro):
        metric.update_state(one_hot_targets, probs)

In [None]:
EPOCHS=4
max_val_acc = 0.0
best_weights = None
steps_per_epoch = 200


def _to_vit_inputs(spectrograms):
  """Turn a batch of spectrograms into ViT processor inputs.

  A trailing axis is added and repeated three times before the batch is passed
  to `processor.preprocess` with the mean/std used throughout this notebook.
  """
  return processor.preprocess(
    tf.repeat(spectrograms[:,:,:,tf.newaxis],3,-1),
    image_mean=(-3.05,-3.05,-3.05),
    image_std=(2.33,2.33,2.33),
    return_tensors="tf",
  )


# Both training datasets are built with repeat=True, so a single iterator per
# dataset serves every epoch. Re-creating the iterators each epoch would
# restart the pipelines and refill their dataset-sized shuffle buffers.
labeled_iter = iter(labeled_ds)
u_iter = iter(unlabeled_ds)

for epoch in range(EPOCHS):
  print(f"Epoch {epoch}")
  start_time = time.time()

  for step in range(steps_per_epoch):
    x, y = next(labeled_iter)
    xu = next(u_iter)  # unlabeled elements are 1-tuples: (spectrograms,)

    labeled_inputs = _to_vit_inputs(x)
    unlabeled_inputs = _to_vit_inputs(xu[0])

    loss = train_step(labeled_inputs,y, unlabeled_inputs, ema_decay)
    if step % 50 == 0:
      print(
        "Training loss (for one batch) at step %d: %.4f"
        % (step, float(loss))
      )

  # Validation pass over the whole validation set.
  for x_batch_val, y_batch_val in val_ds:
      test_step(_to_vit_inputs(x_batch_val), y_batch_val)

  acc = float(val_acc_metric.result())
  prec = float(val_prec_metric.result())
  recall = float(val_recall_metric.result())
  micro = float(f1_metric_micro.result())
  macro = float(f1_metric_macro.result())

  # The metric objects are shared with the test evaluation, so they are
  # cleared after every read.
  val_acc_metric.reset_states()
  val_prec_metric.reset_states()
  val_recall_metric.reset_states()
  f1_metric_micro.reset_states()
  f1_metric_macro.reset_states()

  # Remember the weights of the best epoch so far (by validation accuracy).
  if acc > max_val_acc:
    max_val_acc = acc
    best_weights = model.get_weights()
  print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
  print("Time taken: %.2fs" % (time.time() - start_time))

# best_weights is still None when no epoch beat the initial accuracy (or
# EPOCHS is 0); set_weights(None) would raise in that case.
if best_weights is not None:
  print("Restoring best weights relative to validation accuracy...")
  model.set_weights(best_weights)
else:
  print("No epoch improved the validation accuracy; keeping the current weights.")

Epoch 0
Training loss (for one batch) at step 0: 3.1872
Training loss (for one batch) at step 50: 2.6048
Training loss (for one batch) at step 100: 2.1122
Training loss (for one batch) at step 150: 2.0656
Validation acc: 0.658 precision: 0.773 recall: 0.518 f1_micro: 0.620 f1_macro: 0.601
Time taken: 1092.80s
Epoch 1
Training loss (for one batch) at step 0: 1.8467
Training loss (for one batch) at step 50: 1.4694
Training loss (for one batch) at step 100: 1.2895
Training loss (for one batch) at step 150: 1.1644
Validation acc: 0.771 precision: 0.806 recall: 0.737 f1_micro: 0.770 f1_macro: 0.761
Time taken: 1057.56s
Epoch 2
Training loss (for one batch) at step 0: 0.7457
Training loss (for one batch) at step 50: 1.1916
Training loss (for one batch) at step 100: 0.9153
Training loss (for one batch) at step 150: 1.0118
Validation acc: 0.802 precision: 0.824 recall: 0.785 f1_micro: 0.804 f1_macro: 0.803
Time taken: 1043.53s
Epoch 3
Training loss (for one batch) at step 0: 0.6746
Training lo

In [None]:
# Score the model on the test split with the same metric objects used for validation.
for test_specs, test_targets in test_ds:
    test_inputs = processor.preprocess(
        tf.repeat(test_specs[:,:,:,tf.newaxis],3,-1),
        image_mean=(-3.05,-3.05,-3.05),
        image_std=(2.33,2.33,2.33),
        return_tensors="tf",
    )
    test_step(test_inputs, test_targets)

# Read every metric first, then clear them all.
eval_metrics = (val_acc_metric, val_prec_metric, val_recall_metric, f1_metric_micro, f1_metric_macro)
acc, prec, recall, micro, macro = [float(metric.result()) for metric in eval_metrics]

for metric in eval_metrics:
    metric.reset_states()
print(f"Test acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")

Test acc: 0.800 precision: 0.821 recall: 0.787 f1_micro: 0.804 f1_macro: 0.802


In [None]:
from huggingface_hub import notebook_login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the ViT encoder (model.vit) to the Hugging Face Hub repository named
# below; the classification head is not part of this upload.
model.vit.push_to_hub("andrei-saceleanu/vit-base-freematch")

tf_model.h5:   0%|          | 0.00/346M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Save the classification-head weights on their own.
# NOTE(review): the path is relative to the current working directory, not the
# mounted Drive folder -- confirm the checkpoint ends up somewhere persistent.
model.cls_head.save_weights("checkpoints/audio_freematch")