## Fine-tune `wav2vec2-base` on GTZAN

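This notebook uses the GTZAN music genre classification dataset (Kaggle page: https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification). The code below loads it from the Hugging Face Hub as `marsyas/gtzan`.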


In [None]:
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import librosa
import librosa.display

#from IPython.display import Audio

!pip install datasets
from datasets import load_dataset, Audio, DatasetDict

#!pip install git+https://github.com/huggingface/transformers
!pip install -U accelerate
!pip install -U transformers
!pip install evaluate

In [None]:
import logging

# Fetch the `train` split of the GTZAN genre dataset from the Hugging Face Hub.
gtzan = load_dataset(path="marsyas/gtzan", split="train")

# Display the dataset summary (feature names and row count).
gtzan

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['file', 'audio', 'genre'],
    num_rows: 999
})

In [None]:
# Stratified 60/20/20 split by genre: first hold out 40% of the clips, then
# halve that hold-out into the `test` and `eval` sets.
stratified = dict(seed=42, shuffle=True, stratify_by_column='genre')

train_test = gtzan.train_test_split(test_size=0.4, **stratified)
train_set = train_test['train']

test_eval = train_test['test'].train_test_split(test_size=0.5, **stratified)
test_set, eval_set = test_eval['train'], test_eval['test']

# Bundle the three splits into a single DatasetDict for convenience
splits = DatasetDict({'train': train_set, 'test': test_set, 'eval': eval_set})

In [None]:
# Print per-genre example counts for every split to confirm that the stratified
# split kept the classes balanced. Each heading names the split it reports.
for heading, split_name in (("Train set", "train"), ("Eval set", "eval"), ("Test set", "test")):
    genre_counts = pd.Series(splits[split_name]["genre"]).value_counts().sort_index()
    print(f"{heading}: \n", genre_counts, "\n")

Train set: 
 0    60
1    60
2    60
3    60
4    60
5    59
6    60
7    60
8    60
9    60
Name: count, dtype: int64 

Test set: 
 0    20
1    20
2    20
3    20
4    20
5    20
6    20
7    20
8    20
9    20
Name: count, dtype: int64
Test set: 
 0    20
1    20
2    20
3    20
4    20
5    20
6    20
7    20
8    20
9    20
Name: count, dtype: int64


In [None]:
# Expected class order:
# GENRES = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

# Genre names in class-id order, read from the dataset's `genre` feature.
labels = splits['train'].features['genre'].names

# Two lookup tables between genre names and their ids; ids are stored as strings.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Spot-check one entry of the mapping.
id2label[str(7)]

'pop'

### Preprocess

In [None]:
from transformers import AutoFeatureExtractor

# Checkpoint that provides both the feature extractor and, later, the model weights.
model_id = "facebook/wav2vec2-base"

# The checkpoint's default preprocessing settings are used as-is. Options that
# were considered but left at their defaults:
#   do_normalize=True            -> scale the amplitude of the audio signal consistently
#   return_attention_mask=True   -> let the model ignore padding
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)



In [None]:
# Sampling rate (Hz) the feature extractor is configured for. This is the rate
# the model expects, so the dataset audio has to be resampled to it.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [None]:
# Resample the audio column of every split to the feature extractor's rate (16 kHz) for wav2vec2.
# NOTE: this rebinds `gtzan` from the original Dataset to the resampled DatasetDict of splits.
gtzan = splits.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [None]:
# Maximum clip length in seconds; preprocessing truncates audio to this duration.
# Assumption (not verified, EDA was skipped): every clip is close to this long.
max_duration = 30.0

In [None]:
def preprocess_function(examples):
    """Encode a batch of audio examples with the feature extractor.

    Args:
        examples: A batch whose ``"audio"`` entry is a sequence of decoded
            audio records, each exposing the waveform under ``"array"``.

    Returns:
        The feature extractor's output for the batch, with every waveform
        truncated to at most ``max_duration`` seconds of samples.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]
    # return_attention_mask is intentionally not requested here.
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
    )

In [None]:
# apply preprocessing function over entire dataset using `map` function.
gtzan_encoded = gtzan.map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True, # process multiple elements of dataset at once
    batch_size=100,
    num_proc=1,
)
gtzan_encoded

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values'],
        num_rows: 599
    })
    test: Dataset({
        features: ['genre', 'input_values'],
        num_rows: 200
    })
    eval: Dataset({
        features: ['genre', 'input_values'],
        num_rows: 200
    })
})

In [None]:
# Rename the target column `genre` -> `label`; later cells read it as gtzan_encoded[...]['label'].
# NOTE(review): re-running this cell on its own is expected to fail, since `genre` no longer exists after the first run.
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

### Evaluate

In [None]:
import evaluate

# Accuracy metric shared by every call to compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions in ``eval_pred``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference class ids).

    Returns:
        The result of the accuracy metric for these predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)  # highest-scoring class per row
    return metric.compute(predictions=predicted_ids, references=references)


### Train

In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Number of target classes = number of entries in the id -> genre mapping.
num_labels = len(id2label)
num_labels

10

In [None]:
# Load the pretrained checkpoint with an audio-classification head sized for
# our genres, and attach both label mappings to the model config.
classification_head_config = dict(
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForAudioClassification.from_pretrained(model_id, **classification_head_config)

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import inspect

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

# The per-epoch evaluation option is named `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones. This notebook
# installs an unpinned `-U transformers`, so pick whichever keyword the
# installed version accepts instead of hard-coding the deprecated one.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",  # output directory
    save_strategy="epoch",            # checkpoint once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    metric_for_best_model="accuracy",
    fp16=True,
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch, matching save_strategy
)

In [None]:
import inspect

# Newer transformers releases take the feature extractor as `processing_class=`;
# older ones only know the (now deprecated) `tokenizer=` keyword. Use whichever
# name the installed Trainer declares so the cell runs on either.
_processor_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Train on the `train` split and evaluate on the `eval` split; the `test`
# split is not passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["eval"],
    compute_metrics=compute_metrics,
    **{_processor_key: feature_extractor},
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.9351,2.080762,0.24
2,1.8659,1.698552,0.505
3,1.2764,1.204097,0.7
4,0.9892,1.050729,0.7
5,0.7011,1.131818,0.68
6,0.8332,0.845435,0.76
7,0.5487,0.75185,0.81
8,0.4133,0.8369,0.76
9,0.2057,0.675268,0.8
10,0.2395,0.718954,0.79


TrainOutput(global_step=750, training_loss=1.019743766784668, metrics={'train_runtime': 2354.2082, 'train_samples_per_second': 2.544, 'train_steps_per_second': 0.319, 'total_flos': 1.6314657538752e+18, 'train_loss': 1.019743766784668, 'epoch': 10.0})

In [None]:
# Score the current model on the Trainer's eval_dataset (the `eval` split).
# NOTE(review): with load_best_model_at_end=True these metrics presumably come
# from the best checkpoint rather than the final epoch — confirm.
results = trainer.evaluate()
results

{'eval_loss': 0.7518499493598938,
 'eval_accuracy': 0.81,
 'eval_runtime': 44.5769,
 'eval_samples_per_second': 4.487,
 'eval_steps_per_second': 0.561,
 'epoch': 10.0}

In [None]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/erdos24')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Saving model & feature extractor
directory = 'fine_tuned_wav2vec-base'
trainer.save_model(directory)
feature_extractor.save_pretrained(directory)

In [None]:
import torch
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

# Reload the fine-tuned checkpoint from Drive, move it to the GPU and put it
# in evaluation mode.
model = (
    AutoModelForAudioClassification
    .from_pretrained("/content/drive/MyDrive/erdos24/fine_tuned_wav2vec-base")
    .to('cuda')
    .eval()
)

# Preprocessed waveforms of the held-out test split, consumed by the inference loop.
inputs = gtzan_encoded['test']['input_values']

In [None]:
# Initialize a list to store the logits
all_logits = []

for input in inputs:
    # convert input to tensor and add a batch dimension
    input_tensor = torch.tensor(input).unsqueeze(0).to('cuda')

    # perform inference and store logits
    with torch.no_grad():
        logits = model(input_tensor).logits
        all_logits.append(logits.cpu())  # move logits to CPU

    # free up memory
    del input_tensor
    torch.cuda.empty_cache()

# concatenate logits
all_logits = torch.cat(all_logits)

In [None]:
# Get the predicted class and labels
predicted_class_ids = torch.argmax(all_logits, dim=1)
predicted_labels = [model.config.id2label[class_id.item()] for class_id in predicted_class_ids]
predicted_labels

In [None]:
# Test set evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# actual labels
labels = gtzan_encoded['test']['label']

# convert label
actual_labels = [model.config.id2label[label_id] for label_id in labels]

# calculate the confusion matrix, accuracy, f1
conf_matrix = confusion_matrix(actual_labels, predicted_labels)
accuracy = accuracy_score(actual_labels, predicted_labels)
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

conf_matrix, accuracy, f1

(array([[17,  0,  2,  0,  0,  0,  0,  0,  1,  0],
        [ 0, 20,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  0, 13,  0,  0,  2,  0,  0,  0,  4],
        [ 0,  0,  0, 14,  1,  0,  0,  2,  0,  3],
        [ 0,  0,  0,  0, 20,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0, 19,  0,  1,  0,  0],
        [ 0,  0,  1,  1,  0,  0, 17,  0,  0,  1],
        [ 0,  0,  2,  1,  0,  0,  0, 16,  0,  1],
        [ 1,  0,  0,  1,  3,  0,  0,  0, 14,  1],
        [ 2,  0,  4,  3,  0,  0,  1,  2,  1,  7]]),
 0.785,
 0.7815616892253605)