## Importing libraries

In [1]:
!pip install torch torchmetrics torchaudio datasets transformers scikit-learn matplotlib wandb torchcodec

from datasets import load_dataset
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import librosa
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from google.colab import drive
import os
from google.colab import userdata
import torch
import wandb
import torchmetrics


Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting torchcodec
  Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: torchcodec, lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchcodec-0.8.1 torchmetrics-1.8.2


## Setting the metadata paths and reading the W&B API key

In [7]:
# Root directory that holds the metadata CSV files.
DATA_DIR = '/content/'

# Metadata CSV per split; note that TEST_PATH points at the validation file.
TRAIN_PATH, TEST_PATH = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ('train.csv', 'val.csv')
)

# W&B API key, read from the Colab secret named 'WANDB'.
wandb_kay = userdata.get('WANDB')

## Log in to wandb

In [4]:
# Authenticate with the key read earlier, then open a run in the target project.
wandb.login(key=wandb_kay)

wandb.init(project="audio_test01")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mboklahbohdan[0m ([33mdetect_kaggle[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Mounting Google Drive

In [8]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


## Copying the data from Google Drive into the working directory

In [9]:
# Copy the contents of the Drive folder (note the trailing slash) into the current
# working directory; `-a` is rsync's archive mode (recursive, preserves attributes).
!rsync -a "/content/drive/MyDrive/audio_cls/" .

## Loading the dataset from the CSV metadata

In [19]:
# Map each split name to its metadata CSV and load both into one DatasetDict.
csv_by_split = {"train": TRAIN_PATH, "val": TEST_PATH}
dataset = load_dataset("csv", data_files=csv_by_split)
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'target'],
        num_rows: 480
    })
    val: Dataset({
        features: ['path', 'target'],
        num_rows: 105
    })
})

## Fixing the file paths for Google Colab

In [20]:
DATA_DIR_TRAIN = "/content/train"
DATA_DIR_TEST  = "/content/test"

# Windows-style directory prefixes stored in the `path` column of the CSV files.
WINDOWS_TRAIN_PREFIX = 'D:\\audio_cls_coursework\\data\\train\\'
WINDOWS_TEST_PREFIX = 'D:\\audio_cls_coursework\\data\\test\\'


def _relocate_path(path, old_prefix, new_root):
    """Return `path` re-rooted from `old_prefix` to `new_root`.

    Everything up to and including the first occurrence of `old_prefix` is
    dropped, the remainder is appended to `new_root`, and any backslashes left
    in the remainder (nested sub-folders) are turned into forward slashes.

    Raises:
        ValueError: if `old_prefix` does not occur in `path`.
    """
    _, found, relative = path.partition(old_prefix)
    if not found:
        raise ValueError(
            f"Path {path!r} does not contain the expected prefix {old_prefix!r}"
        )
    return f"{new_root}/{relative.replace(chr(92), '/')}"


def fix_path_train(example):
    """Rewrite `example["path"]` of a training row to point under DATA_DIR_TRAIN.

    The example is modified in place and returned, as `Dataset.map` expects.
    """
    example["path"] = _relocate_path(example["path"], WINDOWS_TRAIN_PREFIX, DATA_DIR_TRAIN)
    return example


def fix_path_test(example):
    """Rewrite `example["path"]` of a validation row to point under DATA_DIR_TEST.

    The example is modified in place and returned, as `Dataset.map` expects.
    """
    example["path"] = _relocate_path(example["path"], WINDOWS_TEST_PREFIX, DATA_DIR_TEST)
    return example

def fix_label(example):
    """Move the value stored under "target" to the "labels" key.

    The example is modified in place ("target" is removed) and returned.
    """
    target_value = example.pop("target")
    example["labels"] = target_value
    return example

# Re-root the audio paths of each split onto the Colab filesystem.
train_df = dataset['train'].map(fix_path_train)
valid_df = dataset['val'].map(fix_path_test)

# Rename the label column in both splits (train first, then validation).
train_df, valid_df = (split.map(fix_label) for split in (train_df, valid_df))


## Initializing the model

In [34]:
# Pretrained checkpoint and the number of target classes.
MODEL_NAME = "facebook/hubert-base-ls960"
NUM_LABELS = 4

# Feature extractor loaded from the same checkpoint as the model.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# Options for the classification model built on top of the checkpoint.
classifier_options = {"num_labels": NUM_LABELS, "ignore_mismatched_sizes": True}
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME, **classifier_options)

preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Loading and preprocessing the audio samples

In [35]:
import numpy as np

SAMPLE_RATE = 16000            # Hz; audio is resampled to this rate when loaded
TARGET_LEN = SAMPLE_RATE * 10  # fixed clip length in samples (10 seconds)


def preprocess(batch, target_len=TARGET_LEN, sampling_rate=SAMPLE_RATE):
    """Load one audio file, force it to a fixed length and extract model inputs.

    Despite the parameter name, this is applied with a non-batched `map`, so
    `batch` is a single example.

    Args:
        batch: example with a "path" key (audio file location) and a "labels" key.
        target_len: number of samples every clip is padded or truncated to.
        sampling_rate: rate the audio is resampled to and that is reported to
            the feature extractor.

    Returns:
        dict with "input_values" (feature-extractor output with the leading
        dimension squeezed away) and "labels" (long tensor).
    """
    waveform, _ = librosa.load(batch["path"], sr=sampling_rate)

    # Zero-pad short clips at the end; cut long clips down to target_len.
    n_missing = target_len - len(waveform)
    if n_missing > 0:
        waveform = np.pad(waveform, (0, n_missing))
    else:
        waveform = waveform[:target_len]

    inputs = feature_extractor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt"
    )

    input_values = inputs["input_values"].squeeze(0)
    labels = torch.tensor(batch["labels"], dtype=torch.long)

    return {"input_values": input_values, "labels": labels}

train_df = train_df.map(preprocess)
valid_df = valid_df.map(preprocess)

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

## Implementing the loss computation

In [14]:
# Registry of supported loss constructors, keyed by short name.
KEY2LOSSES = {'ce': torch.nn.CrossEntropyLoss}

def compute_loss(
    model,
    inputs,
    loss_name="ce",
    return_outputs=False,
    loss_kwargs=None,
    multilabel=False
):
    """Run the model on a batch and compute the selected loss.

    Args:
        model: callable that accepts the batch entries (minus "labels") as
            keyword arguments and returns an object with a `.logits` attribute.
        inputs: mapping that holds the model inputs plus a "labels" entry.
            It is NOT modified, so the caller can reuse the batch.
        loss_name: key into KEY2LOSSES.
        return_outputs: when true, return `(loss, outputs)` instead of `loss`.
        loss_kwargs: optional dict of keyword arguments for the loss
            constructor; `None` means "use the constructor defaults".
        multilabel: when true together with `loss_name == "focal"`, labels are
            cast to float.

    Returns:
        The loss tensor, or `(loss, outputs)` when `return_outputs` is true.

    Raises:
        KeyError: if `loss_name` is not registered in KEY2LOSSES.
        TypeError: if `loss_kwargs` is neither `None` nor a dict.
    """
    # Validate the configuration before paying for a forward pass.
    if loss_name not in KEY2LOSSES:
        raise KeyError(
            f"Unknown loss_name {loss_name!r}; available: {sorted(KEY2LOSSES)}"
        )
    if loss_kwargs is None:
        loss_kwargs = {}
    elif not isinstance(loss_kwargs, dict):
        raise TypeError("`loss_kwargs` must be a dict or None.")
    loss_func = KEY2LOSSES[loss_name](**loss_kwargs)

    # Split off the labels without mutating the caller's batch.
    labels = inputs["labels"]
    model_inputs = {key: value for key, value in inputs.items() if key != "labels"}

    outputs = model(**model_inputs)
    logits = outputs.logits

    if loss_name == "focal" and multilabel:
        labels = labels.float()
    elif loss_name == "ce":
        labels = labels.long()

    loss = loss_func(logits, labels)
    return (loss, outputs) if return_outputs else loss



## Implementing the metrics computation

In [15]:
def compute_metrics_hf(eval_pred):
    """Compute classification metrics from evaluation predictions.

    Args:
        eval_pred: pair that unpacks into `(logits, labels)`. If the first
            element is a tuple, its first item is taken as the logits.

    Returns:
        dict with "accuracy", "f1_macro", "precision_macro", "recall_macro"
        and "rocauc" (multiclass AUROC computed from softmax probabilities).
    """
    logits, labels = eval_pred
    # Some models return several prediction arrays; assume logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels).long()
    probs = torch.nn.functional.softmax(logits, dim=-1)
    preds = probs.argmax(dim=-1)
    labels_np = labels.cpu().numpy()
    preds_np = preds.cpu().numpy()

    # zero_division=0 gives the same value sklearn falls back to by default,
    # but without an UndefinedMetricWarning when a class is never predicted.
    macro = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels_np, preds_np),
        "f1_macro": f1_score(labels_np, preds_np, **macro),
        "precision_macro": precision_score(labels_np, preds_np, **macro),
        "recall_macro": recall_score(labels_np, preds_np, **macro),
        "rocauc": torchmetrics.functional.auroc(
            probs, labels, task="multiclass", num_classes=probs.shape[-1]
        ).item(),
    }

## Implement a data collator

In [16]:
def collate_fn(features):
    """Collate a list of examples into one batch of tensors.

    Args:
        features: sequence of examples, each with "input_values" and "labels".

    Returns:
        dict with "input_values" (float32, examples stacked along a new first
        dimension) and "labels" (long tensor with one entry per example).
    """
    waveforms = []
    for item in features:
        waveforms.append(torch.tensor(item["input_values"], dtype=torch.float32))
    stacked = torch.stack(waveforms)

    targets = []
    for item in features:
        targets.append(item["labels"])

    return {
        "input_values": stacked,
        "labels": torch.tensor(targets, dtype=torch.long),
    }

## Initializing the training config

In [40]:

train_batch_size = 16
val_batch_size = 16
EPOCHS = 200  # upper bound only; early stopping is expected to end the run sooner


# Stop once the monitored metric has not improved for 5 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.0
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    num_train_epochs=EPOCHS,
    # Evaluate, checkpoint and log on the same 10-step cadence so the best
    # checkpoint can be restored at the end of training.
    eval_strategy="steps",
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    report_to="wandb",
    fp16=False,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=valid_df,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is the replacement keyword for the same object.
    processing_class=feature_extractor,
    compute_metrics=compute_metrics_hf,
    data_collator=collate_fn,
    callbacks=[early_stopping]
)

  trainer = Trainer(


## Train a model

In [41]:
# Run fine-tuning; the early-stopping callback may end it before the configured number of epochs.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,Rocauc
10,1.3065,1.173978,0.447619,0.267907,0.482955,0.32818,0.895936
20,1.2396,1.088028,0.485714,0.361927,0.485385,0.407618,0.899448
30,1.0739,0.916077,0.704762,0.687934,0.766271,0.705476,0.941651
40,0.9984,0.914695,0.666667,0.690337,0.734549,0.679025,0.865151
50,0.9458,0.741806,0.828571,0.824434,0.848173,0.826461,0.953077
60,0.8232,0.889568,0.609524,0.558951,0.727663,0.579998,0.91778
70,0.8738,0.639237,0.828571,0.819757,0.862112,0.828407,0.985143
80,0.6934,0.495866,0.904762,0.904044,0.906062,0.908362,0.988287
90,0.7182,0.427435,0.904762,0.904467,0.923311,0.891707,0.982047
100,0.5327,0.476719,0.857143,0.873577,0.920073,0.860837,0.977988


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=180, training_loss=0.7004324091805352, metrics={'train_runtime': 1480.8665, 'train_samples_per_second': 64.827, 'train_steps_per_second': 4.052, 'total_flos': 2.614660411392e+17, 'train_loss': 0.7004324091805352, 'epoch': 6.0})

In [42]:
# Save the model and the feature extractor side by side on local disk.
best_model_dir = "./results/best_model"
trainer.save_model(best_model_dir)
feature_extractor.save_pretrained(best_model_dir)

# Log the saved directory to W&B as a model artifact.
artifact = wandb.Artifact("best_model", type="model")
artifact.add_dir(best_model_dir)
wandb.log_artifact(artifact)

[34m[1mwandb[0m: Adding directory to artifact (results/best_model)... Done. 12.5s


<Artifact best_model>