In [1]:
import os
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import ViTImageProcessor, TFViTModel
import librosa
import numpy as np
import time
from audiomentations import Compose, TimeStretch, PitchShift, AddGaussianNoise

2023-05-15 17:10:10.082937: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-15 17:10:10.593541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64::/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64:
2023-05-15 17:10:10.593590: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object 

In [2]:
label2id = {
    "ANG": 0,
    "NEU": 1,
    "HAP": 2,
    "FEA": 3,
    "DIS": 4,
    "SAD": 5
}

In [4]:
def prepare_audio_ds():

    data = []
    labels = []
    length = 3 * 16000
    for file in os.listdir("audio_data"):
        file_path = os.path.join("audio_data",file)
        audio, _ = librosa.load(file_path,sr=16000)
        if len(audio) < length:
            audio = np.pad(audio,(0,length-len(audio)),'constant')
        else:
            audio = audio[:length]

        label = label2id[file_path.split("_")[3]]
        melspec = librosa.feature.melspectrogram(y=audio, sr=16000)

        data.append(melspec)
        labels.append(label)

    data_ds = tf.data.Dataset.from_tensor_slices(data)
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((data_ds, label_ds))

In [5]:
def preprocess(x, label):

    inputs = processor.preprocess(x,do_rescale=False,do_resize=False,image_mean=0,image_std=0.5,return_tensors="tf") 
    return inputs["pixel_values"], label

def preprocess2(x):

    inputs = processor.preprocess(x,do_rescale=False,do_resize=False,image_mean=0,image_std=0.5,return_tensors="tf") 
    return inputs["pixel_values"]
    

In [12]:
def prepare_ssl_ds(num_classes=6, train_percent=0.8, label_percent = 0.5, batch_sizel=16, batch_sizeu=16):

    data = {i:[] for i in range(num_classes)}
    length = 3 * 16000
    for file in os.listdir("audio_data"):
        file_path = os.path.join("audio_data",file)
        audio, _ = librosa.load(file_path,sr=16000)
        if len(audio) < length:
            audio = np.pad(audio,(0,length-len(audio)),'constant')
        else:
            audio = audio[:length]

        label = label2id[file_path.split("_")[3]]
        melspec = librosa.feature.melspectrogram(y=audio, sr=16000,n_fft=512,hop_length=430,n_mels=112)[:,:,np.newaxis]

        data[label].append(melspec)

    labeled_split, unlabeled_split, test_split = ([],[]), [], ([],[])
    for label in range(num_classes):
        label_data = data[label]
        train_size = int(train_percent*len(label_data))
        test_size = len(label_data) - train_size
        labeled_size = int(label_percent * train_size)

        test_split[0].extend(label_data[train_size:])
        test_split[1].extend([label for _ in range(test_size)])
        
        labeled_split[0].extend(label_data[:labeled_size])
        labeled_split[1].extend([label for _ in range(labeled_size)])
    
        unlabeled_split.extend(label_data[labeled_size:train_size])

    labeled = tf.data.Dataset.from_tensor_slices(labeled_split)
    labeled = labeled.shuffle(len(labeled)).batch(batch_sizel)

    unlabeled = tf.data.Dataset.from_tensor_slices(unlabeled_split)
    unlabeled = unlabeled.shuffle(len(unlabeled)).batch(batch_sizeu)

    test = tf.data.Dataset.from_tensor_slices(test_split)
    test = test.shuffle(len(test)).batch(16)
    
    return labeled, unlabeled, test

In [3]:
def prepare_dataset(audio_files, batch_size, shuffle=True):
    # Define augmentations to apply to signal
    # augmentations = Compose([
    #     TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5),
    #     PitchShift(min_semitones=-3, max_semitones=3, p=0.5),
    #     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5)
    # ])
    length = 3 * 16000
    
    def load_audio(file_path):
        # Load signal file
        signal, sr = librosa.load(file_path, sr=16000)

        if len(signal) < length:
            signal = np.pad(signal,(0,length-len(signal)),'constant')
        else:
            signal = signal[:length]
        
        label = label2id[str(file_path).split("_")[3]]
        # Apply augmentations
        # signal = augmentations(samples=signal, sample_rate=sr)
        
        # Compute spectrogram
        spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=112, hop_length=430)
        # spectrogram = librosa.power_to_db(S=spectrogram, ref=np.max)
        
        # Normalize spectrogram
        spectrogram_min, spectrogram_max = spectrogram.min(), spectrogram.max()
        spectrogram = (spectrogram - spectrogram_min) / (spectrogram_max - spectrogram_min + 1e-3)
        
        
        return spectrogram.astype('float32'), label

    # Create dataset from list of file paths
    dataset = tf.data.Dataset.from_tensor_slices(audio_files)
    
    # Map function to load and preprocess signal
    dataset = dataset.map(lambda x: tf.numpy_function(load_audio, [x], [tf.float32,tf.int64]), num_parallel_calls=tf.data.AUTOTUNE)
    
    # Shuffle and batch dataset
    if shuffle:
        dataset = dataset.shuffle(len(audio_files))
    dataset = dataset.batch(batch_size=batch_size)
    
    return dataset

In [4]:
files = [os.path.join("audio_data",elem) for elem in sorted(os.listdir("audio_data"))]
train_ids = list(map(str,range(1001,1082)))
test_ids = list(map(str,range(1082,1092)))
train_files = [file for file in files if os.path.basename(file).split("_")[0] in train_ids]
test_files = [file for file in files if os.path.basename(file).split("_")[0] in test_ids]

In [5]:
train_ds = prepare_dataset(train_files,16)
test_ds = prepare_dataset(test_files, 16)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-05-15 17:10:21.483286: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-15 17:10:21.483616: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-15 17:10:21.487300: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-15 17:10:21.487601: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-15 17:10:21.487891: I tensorflow/compiler/xla/stream_executo

In [5]:
dset = prepare_dataset(files,16)
total = len(dset)
train_ds = dset.take(int(0.8 * total))
test_ds = dset.skip(int(0.8 * total))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-05-11 17:11:21.053411: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 17:11:21.053934: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 17:11:21.059362: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 17:11:21.059787: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 17:11:21.060196: I tensorflow/compiler/xla/stream_executo

In [6]:
batch = next(train_ds.as_numpy_iterator())

2023-05-15 17:10:47.808328: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2032 of 6622
2023-05-15 17:10:57.810086: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4186 of 6622
2023-05-15 17:11:07.773262: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6339 of 6622
2023-05-15 17:11:08.983321: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


In [8]:
features = batch[0]

In [9]:
np.isnan(features).any()

False

In [10]:
from kapre.augmentation import SpecAugment

In [26]:
class AudioClassifier(tf.keras.Model):
    def __init__(self, encoder_name='google/vit-base-patch16-224', num_classes=6, **kwargs):
        super(AudioClassifier, self).__init__(**kwargs)
        self.vit = TFViTModel.from_pretrained(encoder_name)
        self.num_classes = num_classes

        self.spec_augment = SpecAugment(freq_mask_param=5,
                           time_mask_param=10,
                           n_freq_masks=2,
                           n_time_masks=3,
                           mask_value=-100,
                           data_format="channels_first")  
        self.cls_head = tf.keras.Sequential([
        tf.keras.layers.Dense(256,activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64,activation="relu"),
        tf.keras.layers.Dense(self.num_classes, activation="softmax")
        ])

    def call(self, inputs, training):

        x = self.spec_augment(inputs[:,0,:,:][:,tf.newaxis,:,:],training=training)

        embeds= self.vit(pixel_values=tf.repeat(x,3,axis=1),training=training).pooler_output

        return self.cls_head(embeds)

In [7]:
optim = tfa.optimizers.AdamW(weight_decay=0.001,learning_rate=0.005)
optim2 = tfa.optimizers.AdamW(weight_decay=0.0,learning_rate=0.00001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_prec_metric = tf.keras.metrics.Precision(name="precision")
val_recall_metric = tf.keras.metrics.Recall(name="recall")
f1_metric_micro = tfa.metrics.F1Score(num_classes=6, threshold=0.5, average='micro', name='f1_micro')
f1_metric_macro = tfa.metrics.F1Score(num_classes=6, threshold=0.5, average='macro', name='f1_macro')

In [27]:
model = AudioClassifier()
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

Some layers from the model checkpoint at google/vit-base-patch16-224 were not used when initializing TFViTModel: ['classifier']
- This IS expected if you are initializing TFViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit/pooler/dense/kernel:0', 'vit/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
inps = processor.preprocess(tf.repeat(features[:,:,:,tf.newaxis],3,-1),do_normalize=False,return_tensors="tf")

In [30]:
o1 = model(inps["pixel_values"],training=True)
o2 = model(inps["pixel_values"],training=True)

In [31]:
o1 - o2

<tf.Tensor: shape=(16, 6), dtype=float32, numpy=
array([[ 0.06682296, -0.06553048, -0.01734985,  0.00831209, -0.03359868,
         0.04134396],
       [-0.05041097, -0.00746413, -0.00197205,  0.02194054,  0.05022062,
        -0.01231408],
       [-0.00724378, -0.01822868, -0.01349299, -0.04212488,  0.05358164,
         0.02750872],
       [ 0.01803322,  0.04941311, -0.08708128, -0.05414873,  0.07414068,
        -0.00035696],
       [ 0.02522423, -0.04440199, -0.01314503,  0.05140947, -0.01230535,
        -0.00678134],
       [-0.03673504, -0.11002575,  0.04511422,  0.0872122 ,  0.02903804,
        -0.01460362],
       [-0.02736025,  0.1061403 ,  0.0222266 , -0.0624399 ,  0.02246654,
        -0.06103339],
       [ 0.04626766, -0.04188779, -0.04815687,  0.03282814, -0.01040184,
         0.02135073],
       [ 0.0296785 ,  0.00038029,  0.03637715, -0.02392404, -0.01839469,
        -0.02411717],
       [-0.01626869, -0.02296731,  0.01202375, -0.01119214, -0.02451365,
         0.06291807],
 

In [9]:
@tf.function
def train_step(x,y):

  with tf.GradientTape() as tape:
      pred = model(x["pixel_values"],training=True)
      loss = loss_fn(y, pred)
      
  grads = tape.gradient(loss, [model.cls_head.trainable_weights,model.vit.trainable_weights])
  optim.apply_gradients(zip(grads[0], model.cls_head.trainable_weights))
  optim2.apply_gradients(zip(grads[1], model.vit.trainable_weights))

  return loss

@tf.function
def test_step(x, y):
    wpred = model(x["pixel_values"], training=False)
    val_acc_metric.update_state(y, wpred)
    true_hot = tf.one_hot(y, 6)
    val_prec_metric.update_state(true_hot, wpred)
    val_recall_metric.update_state(true_hot, wpred)
    f1_metric_micro.update_state(true_hot, wpred)
    f1_metric_macro.update_state(true_hot, wpred)

In [10]:
EPOCHS=9
max_val_acc = 0.0
best_weights = None

for epoch in range(EPOCHS):
  print(f"Epoch {epoch}")
  start_time = time.time()
  step = 0
  for x,y in train_ds:
    
    inputs = processor.preprocess(tf.repeat(x[:,:,:,tf.newaxis],3,-1),do_normalize=False,return_tensors="tf")

    loss = train_step(inputs,y)
    if step % 50 == 0:
      print(
        "Training loss (for one batch) at step %d: %.4f"
        % (step, float(loss))
      )
    step += 1
  for x_batch_val, y_batch_val in test_ds:
      inputs = processor.preprocess(tf.repeat(x_batch_val[:,:,:,tf.newaxis],3,-1),do_normalize=False,return_tensors="tf")
    
      test_step(inputs, y_batch_val)

  acc = float(val_acc_metric.result())
  prec = float(val_prec_metric.result())
  recall = float(val_recall_metric.result())
  micro = float(f1_metric_micro.result())
  macro = float(f1_metric_macro.result())

  val_acc_metric.reset_states()
  val_prec_metric.reset_states()
  val_recall_metric.reset_states()
  f1_metric_micro.reset_states()
  f1_metric_macro.reset_states()

  if acc > max_val_acc:
    max_val_acc = acc
    best_weights = model.get_weights()
  print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
  print("Time taken: %.2fs" % (time.time() - start_time))
print("Restoring best weights relative to validation accuracy...")
model.set_weights(best_weights)

Epoch 0


2023-05-11 17:11:54.785848: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1963 of 7442
2023-05-11 17:12:04.815977: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4144 of 7442
2023-05-11 17:12:14.790384: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6294 of 7442
2023-05-11 17:12:19.857649: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Training loss (for one batch) at step 0: 1.7653
Training loss (for one batch) at step 50: 1.3695
Training loss (for one batch) at step 100: 1.3718
Training loss (for one batch) at step 150: 1.3242
Training loss (for one batch) at step 200: 1.4777
Training loss (for one batch) at step 250: 1.3461
Training loss (for one batch) at step 300: 1.4766
Training loss (for one batch) at step 350: 1.3661


2023-05-11 17:15:41.861491: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2174 of 7442
2023-05-11 17:15:51.847917: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4343 of 7442
2023-05-11 17:16:01.884261: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6526 of 7442
2023-05-11 17:16:06.034361: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Validation acc: 0.360 precision: 0.577 recall: 0.174 f1_micro: 0.267 f1_macro: 0.184
Time taken: 290.44s
Epoch 1


2023-05-11 17:16:45.258089: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2202 of 7442
2023-05-11 17:16:55.225772: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4384 of 7442
2023-05-11 17:17:05.229239: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6532 of 7442
2023-05-11 17:17:09.383491: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Training loss (for one batch) at step 0: 1.3800
Training loss (for one batch) at step 50: 1.4529
Training loss (for one batch) at step 100: 1.3782
Training loss (for one batch) at step 150: 1.9285
Training loss (for one batch) at step 200: 1.3847
Training loss (for one batch) at step 250: 1.2879
Training loss (for one batch) at step 300: 1.3653
Training loss (for one batch) at step 350: 1.2527


2023-05-11 17:20:24.646447: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2165 of 7442
2023-05-11 17:20:34.637575: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4315 of 7442
2023-05-11 17:20:44.631319: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6481 of 7442
2023-05-11 17:20:48.908389: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Validation acc: 0.507 precision: 0.747 recall: 0.119 f1_micro: 0.205 f1_macro: 0.180
Time taken: 280.81s
Epoch 2


2023-05-11 17:21:26.048620: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2183 of 7442
2023-05-11 17:21:36.015739: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4347 of 7442
2023-05-11 17:21:46.082450: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6515 of 7442
2023-05-11 17:21:50.325943: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Training loss (for one batch) at step 0: 1.3442
Training loss (for one batch) at step 50: 1.6097
Training loss (for one batch) at step 100: 1.4325
Training loss (for one batch) at step 150: 1.4540
Training loss (for one batch) at step 200: 1.0866
Training loss (for one batch) at step 250: 1.0605
Training loss (for one batch) at step 300: 1.4197
Training loss (for one batch) at step 350: 0.9794


2023-05-11 17:25:06.188873: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2205 of 7442
2023-05-11 17:25:16.158195: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4365 of 7442
2023-05-11 17:25:26.157822: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6551 of 7442
2023-05-11 17:25:30.162716: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Validation acc: 0.521 precision: 0.649 recall: 0.251 f1_micro: 0.362 f1_macro: 0.308
Time taken: 281.30s
Epoch 3


2023-05-11 17:26:07.379222: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2172 of 7442
2023-05-11 17:26:17.341274: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4348 of 7442
2023-05-11 17:26:27.374549: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6574 of 7442
2023-05-11 17:26:31.310151: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Training loss (for one batch) at step 0: 1.4035
Training loss (for one batch) at step 50: 1.1945
Training loss (for one batch) at step 100: 1.4275
Training loss (for one batch) at step 150: 1.4016
Training loss (for one batch) at step 200: 1.3548
Training loss (for one batch) at step 250: 1.3579
Training loss (for one batch) at step 300: 0.9279
Training loss (for one batch) at step 350: 1.2307


2023-05-11 17:29:48.682806: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2220 of 7442
2023-05-11 17:29:58.704502: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 4378 of 7442
2023-05-11 17:30:08.701215: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 6549 of 7442
2023-05-11 17:30:12.716554: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Validation acc: 0.562 precision: 0.656 recall: 0.376 f1_micro: 0.478 f1_macro: 0.453
Time taken: 282.54s
Epoch 4


In [None]:
for x_batch_val, y_batch_val in test_ds:
    test_step(x_batch_val, y_batch_val)
acc = float(val_acc_metric.result())
prec = float(val_prec_metric.result())
recall = float(val_recall_metric.result())
micro = float(f1_metric_micro.result())
macro = float(f1_metric_macro.result())

val_acc_metric.reset_states()
val_prec_metric.reset_states()
val_recall_metric.reset_states()
f1_metric_micro.reset_states()
f1_metric_macro.reset_states()
print(f"Test acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")

# Diverse

In [4]:
files = [os.path.join("audio_data",elem) for elem in sorted(os.listdir("audio_data"))]

In [None]:
files

In [6]:
ds = prepare_dataset(files,8,shuffle=False)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-05-11 07:14:03.326045: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 07:14:03.326634: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 07:14:03.332943: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 07:14:03.333422: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-11 07:14:03.333886: I tensorflow/compiler/xla/stream_executo

In [7]:
it = ds.as_numpy_iterator()

In [8]:
batch_features, batch_labels = next(it)

In [9]:
batch_features.shape

(8, 112, 112)

In [None]:
ds = get_ds()

In [13]:
labeled, unlabeled, test = prepare_ssl_ds()

In [14]:
next(iter(labeled))

2023-05-06 12:03:33.117409: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [2974]
	 [[{{node Placeholder/_1}}]]
2023-05-06 12:03:33.117704: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [2974]
	 [[{{node Placeholder/_1}}]]


(<tf.Tensor: shape=(16, 112, 112, 1), dtype=float32, numpy=
 array([[[[3.10835254e-04],
          [1.13181675e-04],
          [5.34751918e-04],
          ...,
          [0.00000000e+00],
          [0.00000000e+00],
          [0.00000000e+00]],
 
         [[4.97518806e-04],
          [2.69889692e-03],
          [9.57946468e-04],
          ...,
          [0.00000000e+00],
          [0.00000000e+00],
          [0.00000000e+00]],
 
         [[5.71993703e-04],
          [9.02745128e-03],
          [1.54509407e-03],
          ...,
          [0.00000000e+00],
          [0.00000000e+00],
          [0.00000000e+00]],
 
         ...,
 
         [[4.78189293e-07],
          [9.04746571e-07],
          [1.06656319e-06],
          ...,
          [0.00000000e+00],
          [0.00000000e+00],
          [0.00000000e+00]],
 
         [[4.55601793e-07],
          [7.85294219e-07],
          [4.92827098e-07],
          ...,
          [0.00000000e+00],
          [0.00000000e+00],
          [0.00000000e+00

In [4]:
test_file = "/home/andrei/SSL/audio_data/1001_DFA_FEA_XX.wav"
length = 3 * 16000
audio, _ = librosa.load(test_file,sr=16000)

if len(audio) < length:
    audio = np.pad(audio,(0,length-len(audio)),'constant')
else:
    audio = audio[:length]

audio.shape

(48000,)

In [11]:
melspec = librosa.feature.melspectrogram(y=audio, sr=16000,n_fft=512,hop_length=430,n_mels=112)
melspec.shape

(112, 112)

In [10]:
from transformers import ViTImageProcessor, TFViTModel

In [11]:
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

In [15]:
model = TFViTModel.from_pretrained('google/vit-base-patch16-224')

2023-05-11 07:15:20.750028: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8100

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
Some layers from the model checkpoint at google/vit-base-patch16-224 were not used when initializing TFViTModel: ['classifier']
- This IS expected if you are initializing TFViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit/pooler/dense/kernel:0', 'vit/pooler/dense/bias:0']
You should proba

In [17]:
x = batch_features / np.amax(batch_features)

In [21]:
tf.repeat(x[:,tf.newaxis,:,:],3,1)

<tf.Tensor: shape=(8, 3, 112, 112), dtype=float32, numpy=
array([[[[2.11548522e-05, 4.25573198e-05, 6.97008072e-05, ...,
          2.18051340e-04, 4.84560114e-05, 2.42616693e-06],
         [8.11546415e-05, 1.22299476e-04, 8.83477333e-05, ...,
          9.82173660e-05, 1.94894656e-05, 1.19180015e-06],
         [5.43840542e-05, 9.49605455e-05, 1.23547434e-04, ...,
          1.00878467e-04, 2.19050926e-05, 2.60523916e-06],
         ...,
         [1.56879754e-08, 3.34665593e-08, 3.80255685e-08, ...,
          1.68133312e-08, 1.01359516e-08, 2.29534414e-09],
         [1.45310821e-08, 2.77839387e-08, 2.88826367e-08, ...,
          2.44096725e-08, 1.90357561e-08, 3.39401862e-09],
         [1.06349267e-08, 2.25696297e-08, 3.17923856e-08, ...,
          1.74895103e-08, 9.36511402e-09, 1.57541924e-09]],

        [[2.11548522e-05, 4.25573198e-05, 6.97008072e-05, ...,
          2.18051340e-04, 4.84560114e-05, 2.42616693e-06],
         [8.11546415e-05, 1.22299476e-04, 8.83477333e-05, ...,
         

In [35]:
x = batch_features *255
x = x.astype("uint8")

In [36]:
x.shape

(8, 112, 112)

In [12]:
inputs = processor.preprocess(tf.repeat(batch_features[:,:,:,tf.newaxis],3,-1),do_normalize=False,return_tensors="tf")

In [13]:
inputs

{'pixel_values': <tf.Tensor: shape=(8, 3, 224, 224), dtype=float32, numpy=
array([[[[0.11764706, 0.14901961, 0.20784314, ..., 0.        ,
          0.        , 0.        ],
         [0.1254902 , 0.15686275, 0.21176471, ..., 0.        ,
          0.        , 0.        ],
         [0.14117648, 0.16862746, 0.21960784, ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.00392157, ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.00392157, ..., 0.        ,
          0.        , 0.        ]],

        [[0.11764706, 0.14901961, 0.20784314, ..., 0.        ,
          0.        , 0.        ],
         [0.1254902 , 0.15686275, 0.21176471, ..., 0.        ,
          0.        , 0.        ],
         [0.14117648, 0.16862746, 0.21960784, ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0. 

In [16]:
model(**inputs)

TFBaseModelOutputWithPooling(last_hidden_state=<tf.Tensor: shape=(8, 197, 768), dtype=float32, numpy=
array([[[-0.05842694,  0.35170335, -0.663353  , ...,  1.6655248 ,
          0.11887266,  0.3121931 ],
        [ 0.14366789, -0.80312777, -0.2063374 , ...,  0.92824143,
          0.7842201 , -0.58480823],
        [-0.05086734,  0.1883568 , -0.7229579 , ...,  1.1643814 ,
          0.15209699,  0.26526305],
        ...,
        [ 0.8489661 , -0.7806218 , -2.209633  , ...,  0.9562143 ,
          0.27626705, -0.09625694],
        [ 0.2790149 , -0.3611182 , -1.5517721 , ...,  0.2760882 ,
          0.2714017 , -0.37976813],
        [ 0.09540801, -0.02532787, -0.26065674, ...,  0.40586743,
          0.58491814, -0.2816257 ]],

       [[ 0.21484426,  0.7438918 , -0.40159073, ...,  1.7606384 ,
          0.3682174 ,  0.35421148],
        [-0.5401938 ,  0.18732081,  0.34144688, ...,  0.45145005,
          0.8920721 , -1.0399929 ],
        [ 0.08160028,  0.96327573, -0.14635451, ...,  0.8372804 ,
 

In [107]:
norm_mel = melspec / np.max(melspec)

In [108]:
norm_mel = norm_mel[np.newaxis,:,:,np.newaxis]
norm_mel.shape

(1, 224, 224, 1)

In [18]:
batch = next(iter(labeled))

In [20]:
batch[0]

<tf.Tensor: shape=(16, 112, 112, 1), dtype=float32, numpy=
array([[[[1.3448650e-03],
         [1.6590778e-04],
         [1.8373945e-03],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        [[2.0199262e-03],
         [1.2314869e-03],
         [2.0786505e-03],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        [[2.3793723e-03],
         [1.2008757e-03],
         [1.6865426e-03],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        ...,

        [[9.0859508e-08],
         [1.1403018e-07],
         [8.6782372e-08],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        [[2.3369414e-07],
         [3.0757215e-07],
         [7.6529949e-07],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        [[6.9941869e-08],
         [3.6381815e-07],
         [6.000

In [35]:
b = batch[0]/tf.reduce_max(batch[0])
b = tf.repeat(b,3,axis=-1)
inputs = processor.preprocess(b,image_mean=0,image_std=0.5,return_tensors="tf")

tf.reduce_max(inputs["pixel_values"])

<tf.Tensor: shape=(), dtype=float32, numpy=1.5137255>

In [34]:
model(**inputs)

TFBaseModelOutputWithPooling(last_hidden_state=<tf.Tensor: shape=(16, 197, 768), dtype=float32, numpy=
array([[[ 0.02334747,  1.0715562 , -1.200588  , ..., -0.457747  ,
          0.21726893, -0.01203843],
        [ 0.4471927 ,  1.4031292 , -0.12195988, ...,  0.14339238,
          0.99487036,  0.18174091],
        [ 0.2698606 , -0.04117971, -0.03693089, ..., -0.63055205,
          1.6403565 ,  1.3969635 ],
        ...,
        [ 0.4800619 ,  0.7794227 , -0.08498895, ..., -0.74754244,
          0.84426796,  0.52911854],
        [ 0.42537072,  0.7792351 , -0.05819314, ..., -0.95499545,
          0.8533628 ,  0.4238273 ],
        [ 0.40322465,  0.9207795 , -0.2419951 , ...,  0.14645788,
          0.60421413, -0.28083032]],

       [[ 0.3101172 ,  0.31281754, -0.26379558, ..., -0.49732864,
          0.07916331,  0.03241296],
        [ 0.51191   ,  0.83087844,  0.6871592 , ..., -0.41650552,
          0.30407572,  0.12435514],
        [ 1.4944315 ,  0.5677866 ,  0.0409994 , ..., -0.6730571 ,
