## Importing libraries

In [1]:
!pip install torch torchmetrics torchaudio datasets transformers scikit-learn matplotlib wandb torchcodec

from datasets import load_dataset
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import librosa
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from google.colab import drive
import os
from google.colab import userdata
import torch
import wandb
import torchmetrics


Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting torchcodec
  Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: torchcodec, lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchcodec-0.8.1 torchmetrics-1.8.2


## Setting the metadata paths and reading the wandb API key

In [3]:
# Root directory that holds the per-split metadata CSV files.
BASE_DIR = '/content/'

# One metadata CSV per split, each located directly under BASE_DIR.
TRAIN_PATH, VALID_PATH, TEST_PATH = (
    os.path.join(BASE_DIR, csv_name)
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)

# Value of the 'WANDB' entry fetched through google.colab's `userdata`;
# it is passed to wandb.login() as the API key.
wandb_kay = userdata.get('WANDB')

## Log in to wandb

In [4]:
# Authenticate against Weights & Biases with the key read from the Colab secrets.
wandb.login(key=wandb_kay)

# Start a run in the "Hubert" project. As the last expression of the cell,
# the returned run object is also what the notebook displays.
wandb.init(
    project="Hubert",
)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mboklahbohdan[0m ([33mdetect_kaggle[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Mounting Google Drive

In [5]:
# Mount Google Drive at /content/drive so its files are reachable from the Colab filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


## Copying the audio data from Google Drive into the working directory

In [6]:
# Copy the contents of the Drive folder `audio_cls` into the current working
# directory (archive mode). The trailing slash on the source copies the
# folder's contents rather than the folder itself.
!rsync -a "/content/drive/MyDrive/audio_cls/" .

## Loading the dataset splits from the CSV metadata

In [7]:
# Map each split name to the CSV file that lists its samples.
split_csv_files = {
    "train": TRAIN_PATH,
    "valid": VALID_PATH,
    "test": TEST_PATH,
}

# Build a DatasetDict with one split per CSV file.
dataset = load_dataset("csv", data_files=split_csv_files)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['path', 'target'],
        num_rows: 480
    })
    valid: Dataset({
        features: ['path', 'target'],
        num_rows: 96
    })
    test: Dataset({
        features: ['path', 'target'],
        num_rows: 105
    })
})

## Fixing the file paths for Google Colab and renaming the label column

In [8]:
# Directories that contain the audio files of each split.
DATA_DIR_TRAIN = "/content/train"
DATA_DIR_VALID = "/content/valid"
DATA_DIR_TEST  = "/content/test"


def _prefix_path(example, data_dir):
    """Prepend ``data_dir`` to the value stored under ``example["path"]``.

    Args:
        example: mapping with a ``"path"`` entry; it is modified in place.
        data_dir: directory string placed in front of the existing path,
            separated by a single ``/``.

    Returns:
        The same ``example`` object, with ``"path"`` rewritten.
    """
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is only valid syntax on Python 3.12+ and is a SyntaxError before that.
    example["path"] = f"{data_dir}/{example['path']}"
    return example


def fix_path_train(example):
    """Point a training example's ``path`` into ``DATA_DIR_TRAIN``."""
    return _prefix_path(example, DATA_DIR_TRAIN)


def fix_path_valid(example):
    """Point a validation example's ``path`` into ``DATA_DIR_VALID``."""
    return _prefix_path(example, DATA_DIR_VALID)


def fix_path_test(example):
    """Point a test example's ``path`` into ``DATA_DIR_TEST``."""
    return _prefix_path(example, DATA_DIR_TEST)

def fix_label(example):
    """Rename the ``"target"`` entry of ``example`` to ``"labels"``.

    The entry is removed under its old key and re-inserted under the new one;
    the same (mutated) ``example`` object is returned. Raises ``KeyError`` if
    ``"target"`` is missing.
    """
    target_value = example.pop("target")
    example["labels"] = target_value
    return example

# For every split: first prefix the audio path with the split's directory,
# then rename the "target" column to "labels".
train_df = dataset['train'].map(fix_path_train).map(fix_label)
valid_df = dataset['valid'].map(fix_path_valid).map(fix_label)
test_df  = dataset['test'].map(fix_path_test).map(fix_label)


Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

## Initializing the model

In [9]:
# Checkpoint name used for both the feature extractor and the model weights.
MODEL_NAME = "facebook/hubert-base-ls960"
# Number of target classes; passed as `num_labels` to the classification model.
NUM_LABELS = 4

# Feature extractor matching the checkpoint; it is applied to raw waveforms
# in the preprocessing step.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# Audio-classification model built on the pretrained checkpoint. The recorded
# output of this cell reports the `classifier` and `projector` weights as newly
# initialized, so the model has to be trained before it is used for predictions.
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Loading and preprocessing the audio samples

In [11]:
import numpy as np

# Sampling rate (Hz) used both when loading audio and when calling the feature
# extractor. Kept in one place so the two can never drift apart.
SAMPLE_RATE = 16000
# Every clip is padded or cropped to exactly this many seconds.
CLIP_SECONDS = 8
# Fixed waveform length in samples (same value as the former `16000 * 8`).
TARGET_LEN = SAMPLE_RATE * CLIP_SECONDS


def preprocess(batch):
    """Load one audio file and convert it into fixed-length model inputs.

    Despite the parameter name, ``batch`` is treated as a single example:
    ``batch["path"]`` is passed to ``librosa.load`` as one file path and
    ``batch["labels"]`` is converted to a single tensor.

    Steps:
      1. Load the file at ``batch["path"]``, resampled to ``SAMPLE_RATE``.
      2. Pad at the end (``np.pad`` default mode) or crop so the waveform has
         exactly ``TARGET_LEN`` samples.
      3. Run the module-level ``feature_extractor`` on the waveform.

    Returns:
        dict with ``"input_values"`` (extractor output with the leading
        dimension squeezed away) and ``"labels"`` (``torch.long`` tensor).
    """
    # The sampling rate returned by librosa equals SAMPLE_RATE, so it is discarded.
    waveform, _ = librosa.load(batch["path"], sr=SAMPLE_RATE)

    n_samples = len(waveform)
    if n_samples < TARGET_LEN:
        waveform = np.pad(waveform, (0, TARGET_LEN - n_samples))
    else:
        waveform = waveform[:TARGET_LEN]

    inputs = feature_extractor(
        waveform,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt"
    )

    # Drop the leading dimension that the extractor adds for a single waveform.
    input_values = inputs["input_values"].squeeze(0)
    labels = torch.tensor(batch["labels"], dtype=torch.long)

    return {"input_values": input_values, "labels": labels}

# Run the preprocessing over every split, keeping the existing variable names.
train_df, valid_df, test_df = [
    split.map(preprocess) for split in (train_df, valid_df, test_df)
]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

## Implementing the metric computation

In [13]:
def compute_metrics_hf(eval_pred):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` holds per-class scores
            along the last axis; ``labels`` holds the integer class ids. If
            ``logits`` is a tuple, its first element is used.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1_macro"``,
        ``"precision_macro"`` and ``"recall_macro"``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Tuple-valued predictions: take the first entry as the logits.
        logits = logits[0]

    # Softmax is monotonic, so argmax over the raw logits selects the same
    # class as argmax over probabilities; no need to compute the softmax.
    preds_np = torch.as_tensor(logits).argmax(dim=-1).cpu().numpy()
    labels_np = torch.as_tensor(labels).long().cpu().numpy()

    # zero_division=0 yields the same scores as sklearn's default ("warn")
    # but without an undefined-metric warning when a class is never
    # predicted or never present in an evaluation batch.
    macro_kwargs = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels_np, preds_np),
        "f1_macro": f1_score(labels_np, preds_np, **macro_kwargs),
        "precision_macro": precision_score(labels_np, preds_np, **macro_kwargs),
        "recall_macro": recall_score(labels_np, preds_np, **macro_kwargs),
    }

## Implement a data collator

In [14]:
def collate_fn(features):
    """Collate a list of examples into one batch of tensors.

    Args:
        features: sequence of mappings, each with ``"input_values"`` (values
            convertible to a float tensor, all of equal shape) and
            ``"labels"`` (an integer class id).

    Returns:
        dict with ``"input_values"`` (float32 tensor stacked along a new first
        dimension) and ``"labels"`` (int64 tensor, one entry per example).
    """
    waveforms = []
    for item in features:
        waveforms.append(torch.tensor(item["input_values"], dtype=torch.float32))
    stacked_inputs = torch.stack(waveforms)

    targets = [item["labels"] for item in features]
    stacked_labels = torch.tensor(targets, dtype=torch.long)

    return {"input_values": stacked_inputs, "labels": stacked_labels}

## Initializing the training configuration

In [19]:

train_batch_size = 20
val_batch_size = 12
EPOCHS = 200
# Evaluation, checkpointing and logging share one interval so that
# `load_best_model_at_end` always finds a checkpoint for every evaluation.
EVAL_EVERY_N_STEPS = 10


# Stop training once the monitored metric has not improved for
# `early_stopping_patience` consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.0
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    num_train_epochs=EPOCHS,
    eval_strategy="steps",
    eval_steps=EVAL_EVERY_N_STEPS,
    save_strategy="steps",
    save_steps=EVAL_EVERY_N_STEPS,
    # A full checkpoint is written every few steps; without a limit they pile
    # up on disk. The best checkpoint is still retained for
    # `load_best_model_at_end`.
    save_total_limit=2,
    logging_strategy="steps",
    logging_steps=EVAL_EVERY_N_STEPS,
    report_to="wandb",
    fp16=False,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    # Refers to the "accuracy" key returned by the compute_metrics function.
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# `processing_class` replaces the deprecated `tokenizer` argument; the recorded
# output of this cell shows a warning pointing at this Trainer(...) call.
# The feature extractor is passed so the Trainer has it available alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=valid_df,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics_hf,
    data_collator=collate_fn,
    callbacks=[early_stopping]
)

  trainer = Trainer(


## Train a model

In [20]:
# Run the training loop with the configured arguments and callbacks. As the
# last expression of the cell, the returned TrainOutput is displayed below.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
10,1.3002,1.146388,0.489583,0.360101,0.483004,0.392449
20,1.1783,0.979206,0.65625,0.64899,0.818269,0.636905
30,0.9851,0.77628,0.791667,0.774001,0.831127,0.776129
40,0.9128,0.642471,0.854167,0.858951,0.917693,0.841518
50,0.7024,0.547238,0.885417,0.889789,0.886521,0.899583
60,0.6214,0.400161,0.927083,0.935776,0.951538,0.925683
70,0.6145,0.404178,0.916667,0.918048,0.922103,0.916681
80,0.4813,0.319016,0.9375,0.947155,0.9625,0.939732
90,0.4285,0.26768,0.947917,0.946623,0.943627,0.951812
100,0.4351,0.198872,0.96875,0.96896,0.970089,0.968093


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=150, training_loss=0.6514357678095499, metrics={'train_runtime': 1691.6798, 'train_samples_per_second': 56.748, 'train_steps_per_second': 2.837, 'total_flos': 2.17888367616e+17, 'train_loss': 0.6514357678095499, 'epoch': 6.25})

In [21]:
# Directory that receives both the model files and the feature-extractor config.
best_model_dir = "./results/best_model"
trainer.save_model(best_model_dir)
feature_extractor.save_pretrained(best_model_dir)

# Log the saved directory to wandb as an artifact of type "model".
artifact = wandb.Artifact("best_model", type="model")
artifact.add_dir(best_model_dir)
wandb.log_artifact(artifact)


[34m[1mwandb[0m: Adding directory to artifact (results/best_model)... Done. 2.2s


<Artifact best_model>

In [22]:
# Evaluate the trained model on the held-out test split.
predictions_output = trainer.predict(test_df)

# Header text is Ukrainian for "Metrics on the test set:".
print("Метрики на тесті:")
for metric_name, metric_value in predictions_output.metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# Keep the raw logits and true labels, plus class probabilities and the
# predicted class ids derived from the logits, as module-level names.
logits = predictions_output.predictions
labels = predictions_output.label_ids
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
preds = probs.argmax(dim=-1).numpy()

Метрики на тесті:
test_loss: 0.2929
test_accuracy: 0.9143
test_f1_macro: 0.9125
test_precision_macro: 0.9271
test_recall_macro: 0.9018
test_runtime: 30.8825
test_samples_per_second: 3.4000
test_steps_per_second: 0.2910
