In [None]:
!pip install transformers[torch] datasets[audio] evaluate


In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training arguments later set
# push_to_hub=True and the final cell calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Load the "train" split of GTZAN; a held-out test split is carved out of it
# in the next cell.
gtzan = load_dataset("marsyas/gtzan", split="train")
gtzan


Downloading builder script:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['file', 'audio', 'genre'],
    num_rows: 999
})

In [None]:
# Shuffle with a fixed seed and hold out 10% of the examples as a test split.
# NOTE(review): this rebinds `gtzan` from a Dataset to a DatasetDict, so
# re-running this cell without first re-running the load cell will presumably
# fail — confirm before relying on out-of-order execution.
gtzan = gtzan.train_test_split(seed=42, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [None]:
from transformers import AutoFeatureExtractor

# Checkpoint that supplies both the feature extractor here and the model
# weights further down.
model_id = "openai/whisper-tiny"

# Load the checkpoint's feature extractor, passing do_normalize=True through.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

In [None]:
# Sampling rate reported by the feature extractor; the audio column is cast to
# this rate in the next cell.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [None]:
from datasets import Audio

# Cast the "audio" column so its sampling rate matches the feature extractor's.
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [None]:
# Longest clip length, in seconds, passed on to the feature extractor.
max_duration = 30.0


def preprocess_function(examples):
    """Convert a batch of audio examples into feature-extractor inputs.

    Args:
        examples: Batch whose "audio" entry is a sequence of dicts, each
            holding the raw waveform under the "array" key.

    Returns:
        The feature extractor's output for the batch. The extractor is called
        with truncation enabled and ``max_length`` set to the number of
        samples in ``max_duration`` seconds at its own sampling rate.
    """
    rate = feature_extractor.sampling_rate
    sample_cap = int(rate * max_duration)
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=rate,
        max_length=sample_cap,
        truncation=True,
    )

In [None]:
# Apply the preprocessing to every split in batches of 100 on a single
# process, dropping the "audio" and "file" columns from the result.
gtzan_encoded = gtzan.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio", "file"],
)
gtzan_encoded

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_features'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_features'],
        num_rows: 100
    })
})

In [None]:
# Rename the target column from "genre" to "label".
# NOTE(review): presumably needed because the Trainer expects a "label"
# column — confirm. The cell rebinds `gtzan_encoded`, so re-running it on its
# own will likely fail because "genre" no longer exists.
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

In [None]:
# Bound `int2str` method of the "genre" feature, used to turn a class id into
# its genre name.
id2label_fn = gtzan["train"].features["genre"].int2str
# Sanity check: display the genre name of the first training example.
id2label_fn(gtzan["train"][0]["genre"])

'pop'

In [None]:
# Label maps handed to the model config. Class ids are kept as integers: the
# config documents id2label as Dict[int, str] and label2id as Dict[str, int],
# and building them from str(i) leaves label2id mapping names to strings such
# as "7" instead of the integer class index.
id2label = {
    i: id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {name: i for i, name in id2label.items()}

# Spot-check one entry of the mapping.
id2label[7]


'pop'

In [None]:
from transformers import AutoModelForAudioClassification

# One output class per entry in the label map.
num_labels = len(id2label)

# Load the pretrained checkpoint for audio classification, sized for
# `num_labels` classes and carrying both label maps.
model = AutoModelForAudioClassification.from_pretrained(
    model_id, num_labels=num_labels, id2label=id2label, label2id=label2id
)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-tiny and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.weight', 'model.projector.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Run-level settings, reused in the output directory name and the arguments below.
model_name = model_id.rsplit("/", 1)[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-gtzan-bs-{batch_size}",
    # Optimisation schedule and batch sizing.
    num_train_epochs=num_train_epochs,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=True,
    # Evaluate and checkpoint every epoch; the best checkpoint by "accuracy"
    # is loaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging and publishing.
    logging_steps=5,
    push_to_hub=True,
)

In [None]:
import evaluate
import numpy as np

# Accuracy metric object shared by every call to compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score model outputs against the reference labels with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax along axis 1) and ``label_ids`` (reference class ids).

    Returns:
        Whatever ``metric.compute`` returns for the predicted and reference ids.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=reference_ids)

In [None]:
from transformers import Trainer

# Wire the model, training arguments, data splits and metric function into a
# Trainer. The feature extractor is passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

trainer.train()

/content/whisper-tiny-finetuned-gtzan-bs-8 is already a clone of https://huggingface.co/arpan-das-astrophysics/whisper-tiny-finetuned-gtzan-bs-8. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4157,1.189636,0.67
2,0.7796,0.725877,0.75
3,0.3832,0.521388,0.83
4,0.3417,0.418241,0.86
5,0.2349,0.94437,0.73
6,0.0056,0.437724,0.91
7,0.1083,0.518987,0.9
8,0.0022,0.564216,0.89
9,0.1358,0.512466,0.91
10,0.0016,0.517112,0.91


TrainOutput(global_step=1130, training_loss=0.40022131267819655, metrics={'train_runtime': 1573.0471, 'train_samples_per_second': 5.715, 'train_steps_per_second': 0.718, 'total_flos': 1.001149788384e+17, 'train_loss': 0.40022131267819655, 'epoch': 10.0})

In [None]:
# Metadata forwarded to `push_to_hub` together with the upload.
# NOTE(review): "model_name" omits the "-bs-{batch_size}" suffix that the
# training output directory uses — confirm the shorter name is intended.
kwargs = dict(
    dataset_tags="marsyas/gtzan",
    dataset="GTZAN",
    model_name=f"{model_name}-finetuned-gtzan",
    finetuned_from=model_id,
    tasks="audio-classification",
)

trainer.push_to_hub(**kwargs)

Upload file runs/Aug15_07-28-49_2a1a6b13fc48/events.out.tfevents.1692084532.2a1a6b13fc48.522.2:   0%|         …

To https://huggingface.co/arpan-das-astrophysics/whisper-tiny-finetuned-gtzan-bs-8
   3fd9043..3207c24  main -> main

   3fd9043..3207c24  main -> main

To https://huggingface.co/arpan-das-astrophysics/whisper-tiny-finetuned-gtzan-bs-8
   3207c24..3655b69  main -> main

   3207c24..3655b69  main -> main



'https://huggingface.co/arpan-das-astrophysics/whisper-tiny-finetuned-gtzan-bs-8/commit/3207c2479518d7f80a95ed322afd728b82a3d946'