In [None]:
# utils
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import librosa
import IPython.display as ipd


In [None]:
# Load one example clip (a blues track), play it inline, and plot it two ways
audioFile = 'data/genres_original/blues/blues.00000.wav'
waveform, sampleRate = librosa.load(audioFile)
print('Class : Blues\n')
player = ipd.Audio(waveform, rate=sampleRate)
ipd.display(player)

# One figure holding two side-by-side panels
fig, (wave_ax, spec_ax) = plt.subplots(1, 2, figsize=(15, 4))

# Left panel: raw sample values against their index (time domain)
wave_ax.plot(waveform)
wave_ax.set_title('Waveform', fontsize=16)
wave_ax.set_xlabel('Sample Index', fontsize=12)
wave_ax.set_ylabel('Amplitude', fontsize=12)

# Right panel: matplotlib's built-in spectrogram of the same samples
spec_ax.specgram(waveform, Fs=sampleRate)
spec_ax.set_title('Spectrogram', fontsize=16)
spec_ax.set_xlabel('Time (s)', fontsize=12)
spec_ax.set_ylabel('Frequency (Hz)', fontsize=12)

plt.show()

In [None]:
# Report the size and duration of the loaded clip.
# `waveform` is a 1D numpy array containing the audio signal.
print(f'Waveform shape : {waveform.shape} -> 1D numpy array')
# Number of samples = sampling rate (Hz) x duration (s),
# so the duration is the sample count divided by the sampling rate.
duration_seconds = round(waveform.shape[0] / sampleRate, 2)
print(f'Audio duration : {duration_seconds} seconds')

It is important to ensure that all audio examples in the dataset have the same sampling rate when working on any audio task. The sampling rate of the data should also match the sampling rate of the data the model was pre-trained on; if it does not, the audio must be resampled first.

The sampling rate determines the time interval between successive audio samples, which impacts the temporal resolution of the audio data. For example, a 5-second sound at a sampling rate of 16,000 Hz will be represented as a series of 80,000 values (number of samples = sampling rate in Hz × duration in seconds).

Hertz (Hz): the number of cycles (oscillations) per second.

The amplitude of a sound describes the sound pressure level at any given instant and is measured in decibels(dB). The bit depth of the sample determines with how much precision this amplitude value can be described.

Amplitude change is represented by y-axis as plotted by time. "This is also known as the time domain representation of sound."

In [None]:
import librosa.display

# Time-domain view of the clip, drawn by librosa on an explicit 12-inch-wide axes
fig, ax = plt.subplots()
fig.set_figwidth(12)
librosa.display.waveshow(waveform, sr=sampleRate, ax=ax)
ax.set_ylabel('Amplitude')
plt.show()

### The frequency spectrum
#### discrete Fourier transform or DFT

Another way to visualize audio data is to plot the frequency spectrum of an audio signal, also known as the "frequency domain representation". The spectrum is computed using the discrete Fourier transform or DFT. It describes the individual frequencies that make up the signal and how strong they are.

Here, we are going to plot the frequency spectrum of the audio clip using NumPy's rfft() function. While it is possible to plot the spectrum of the entire sound, it's more useful to look at a small region instead. Here we will take the DFT over the first 4096 samples, which is roughly the length of the first note being played:

In [None]:
# Frequency-domain view of the opening of the clip

# Analyse only the first 4096 samples rather than the whole recording
dft_input = waveform[:4096]

# Taper the segment with a Hann window, then take the real-input FFT
window = np.hanning(dft_input.shape[0])
windowed_input = window * dft_input
dft = np.fft.rfft(windowed_input)

# Magnitude spectrum, expressed in dB relative to its own peak
amplitude = np.abs(dft)
amplitude_db = librosa.amplitude_to_db(amplitude, ref=np.max)

# Frequency (Hz) associated with each rFFT bin
frequency = librosa.fft_frequencies(sr=sampleRate, n_fft=len(dft_input))

# Plot level against frequency on a logarithmic frequency axis
fig, ax = plt.subplots()
fig.set_figwidth(12)
ax.plot(frequency, amplitude_db)
ax.set_xlabel('Frequency (Hz)')
ax.set_ylabel('Amplitude (dB)')
ax.set_xscale('log')

## Spectrogram (discrete Fourier transform or DFT)
The waveform plots the amplitude of the audio signal over time, while the spectrum visualizes the amplitudes of the individual frequencies at a fixed point in time. What if we want to see how the frequencies in an audio signal change? The problem is that the spectrum only shows a frozen snapshot of the frequencies at a given instant. The solution is to take multiple DFTs, each covering only a small slice of time, and stack the resulting spectra together into a spectrogram. The algorithm that performs this computation is the STFT, or Short-Time Fourier Transform.


In this plot, the x-axis represents time as in the waveform visualization, but now the y-axis represents frequency in Hz. The intensity of the color gives the amplitude or power of the frequency component at each point in time, measured in decibels (dB). The spectrogram is created by taking short segments of the audio signal, typically lasting a few milliseconds, and calculating the discrete Fourier transform of each segment to obtain its frequency spectrum. The resulting spectra are then stacked together on the time axis to create the spectrogram. Each vertical slice in this image corresponds to a single frequency spectrum, seen from the top. By default, librosa.stft() splits the audio signal into segments of 2048 samples, which gives a good trade-off between frequency resolution and time resolution.

Since the spectrogram and the waveform are different views of the same data, it's possible to turn the spectrogram back into the original waveform using the inverse STFT. However, the inverse STFT needs the phase information as well as the amplitudes, and a spectrogram like the one plotted here keeps only the amplitudes. When the phase is not available, we can use a phase reconstruction algorithm such as the classic Griffin-Lim algorithm, or a neural network called a vocoder, to reconstruct a waveform from the spectrogram.

In [None]:
# STFT parameters are named once so the transform and the plot cannot drift apart.
N_FFT = 2048
HOP_LENGTH = 250
WIN_LENGTH = 1000

# Complex STFT of the clip: rows are frequency bins, columns are time frames
D = librosa.stft(waveform, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH, window='hann')
print(D.shape)

# Magnitudes in dB relative to the loudest bin
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

fig, ax = plt.subplots()
fig.set_figwidth(12)
# specshow derives the axis ticks from sr / hop_length / n_fft. They must match
# the values used for the STFT above; otherwise it falls back to its own
# defaults and the time axis is scaled incorrectly.
img = librosa.display.specshow(
    S_db,
    sr=sampleRate,
    hop_length=HOP_LENGTH,
    n_fft=N_FFT,
    x_axis='time',
    y_axis='hz',
    ax=ax,
)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
plt.show()

### Mel spectrogram
A mel spectrogram is a variation of the spectrogram that is commonly used in speech processing and machine learning tasks. It is similar to a spectrogram in that it shows the frequency content of an audio signal over time, but on a different frequency axis.

In the example below, n_mels stands for the number of mel bands to generate. The mel bands define a set of frequency ranges that divide the spectrum into perceptually meaningful components, using a set of filters whose shape and spacing are chosen to mimic the way the human ear responds to different frequencies. Common values for n_mels are 40 or 80. fmax indicates the highest frequency (in Hz) we care about.

In [None]:
# Mel-scaled power spectrogram of the clip: 140 mel bands, capped at 9500 Hz
S = librosa.feature.melspectrogram(y=waveform, sr=sampleRate, n_mels=140, fmax=9500)
print(S.shape)

# Power values converted to dB relative to the strongest bin
S_dB = librosa.power_to_db(S, ref=np.max)

fig, ax = plt.subplots()
fig.set_figwidth(12)
mel_img = librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=sampleRate, fmax=9500, ax=ax)
fig.colorbar(mel_img, ax=ax)

# HUBERT

In [None]:
## Env variables
# Dataset location, model checkpoint, and Weights & Biases run metadata,
# published through the process environment so later cells can read them
# with os.getenv().
os.environ.update({
    'DATASET': 'data/genres_original',
    'MODEL': 'ntu-spml/distilhubert',
    "WANDB_NAME": "hubert-10-genres",
    "WANDB_PROJECT": "Fine-tuning HuBERT",
    "WANDB_NOTES": "Fine-tuning HuBERT on gtzan",
})

## Load Dataset

In [None]:
from datasets import load_dataset

# Load the audio dataset from the folder named by the DATASET env variable
dataset = load_dataset(os.getenv('DATASET'))

# Both splits use the same seed, shuffle, and stratify on the class label
split_options = dict(seed=42, shuffle=True, stratify_by_column="label")

# First split: 60% train / 40% held out
train_data = dataset['train'].train_test_split(test_size=.4, **split_options)
# Second split: halve the held-out part into validation and test
test_val_data = train_data['test'].train_test_split(test_size=.5, **split_options)

# Store the three final partitions back on the DatasetDict
dataset["train"] = train_data["train"]
dataset["test"] = test_val_data["test"]
dataset["validation"] = test_val_data["train"]


In [None]:
# Print a banner followed by the DatasetDict summary (splits, columns, row counts)
banner = "#" * 25
print(banner, "DATASET", banner)
print('\n')
print(dataset)

In [None]:
from datasets import Audio
from transformers import AutoFeatureExtractor

# Load the feature extractor that ships with the checkpoint and keep ITS
# sampling rate. Overriding `sampling_rate` here would only relabel the
# extractor; it would not resample anything, and the model would be fed audio
# at a rate it was not pre-trained on.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    os.getenv('MODEL'),
    do_normalize=True,
    return_attention_mask=True,
)

# Rate expected by the checkpoint (the model card lists 16 kHz; confirm against
# the value printed below).
sampling_rate = feature_extractor.sampling_rate
print(f'DistilHuBERT Sampling Rate: {sampling_rate} Hz')

# Resample the audio instead: casting the column makes `datasets` decode every
# clip at the model's rate whenever the 'audio' column is accessed.
dataset = dataset.cast_column('audio', Audio(sampling_rate=sampling_rate))

In [None]:
# Longest clip length, in seconds, kept per example
max_duration = 30.0


def preprocess_function(examples):
    """Run the feature extractor over one batch of decoded audio.

    Args:
        examples: batch from ``dataset.map(batched=True)``; ``examples['audio']``
            is a list of dicts that each carry the decoded samples under 'array'.

    Returns:
        The feature extractor's output for the batch, truncated to
        ``max_duration`` seconds and including attention masks.
    """
    # Truncation limit expressed in samples
    sample_limit = int(sampling_rate * max_duration)
    raw_clips = [clip['array'] for clip in examples['audio']]
    return feature_extractor(
        raw_clips,
        sampling_rate=sampling_rate,
        max_length=sample_limit,
        truncation=True,
        return_attention_mask=True,
    )


# Encode every split in batches of 100, dropping the raw 'audio' column afterwards
dataset_encoded = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=['audio'],
)

## Training

In [None]:
import evaluate
import numpy as np

# Accuracy scorer from the `evaluate` library
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference class ids).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    # Predicted class = index of the highest score along axis 1
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForAudioClassification, Trainer, TrainingArguments

# No earlier cell visible in this notebook defines `model`, so on a fresh kernel
# the Trainer call below would raise NameError. Build the classifier here from
# the same checkpoint the feature extractor was loaded from.
# The 'label' column is a ClassLabel (the stratified split relies on that), so
# its `names` give the class vocabulary.
label_names = dataset_encoded['train'].features['label'].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForAudioClassification.from_pretrained(
    os.getenv('MODEL'),
    num_labels=len(label_names),
    label2id=label2id,
    id2label=id2label,
)

training_args = TrainingArguments(
    output_dir=os.getenv('WANDB_NAME'),
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    learning_rate=5e-5,
    seed=42,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=100,  # cap the total training steps to fit the limited resources
    num_train_epochs=2,  # NOTE: ignored while max_steps is positive (max_steps takes precedence)
    warmup_ratio=0.1,
    fp16=False,  # set to True when training on CUDA
    save_total_limit=2,
    report_to='wandb',
    run_name=os.getenv('WANDB_NAME'),
)

# The feature extractor is passed as `tokenizer` so the Trainer can use it when
# collating batches and saves it alongside the checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

### Inference

In [None]:
# Score the trained model on the held-out test split and display the metrics
test_metrics = trainer.evaluate(eval_dataset=dataset_encoded['test'])
test_metrics