In [None]:
!pip install torch pandas numpy librosa huggingface_hub transformers evaluate soundfile accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency res

In [None]:
from datasets import load_dataset, Dataset
import torch, gc, os, json
import pandas as pd
import numpy as np
from transformers import (
    Wav2Vec2FeatureExtractor,
    AutoModelForAudioClassification,
    TrainerCallback,
    TrainingArguments,
    Trainer,
)
import evaluate
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve
from scipy.optimize import brentq
from scipy.interpolate import interp1d

In [None]:
# Fetch the VoxCeleb1 test data from the Hugging Face Hub (downloaded on first use).
voxceleb1 = load_dataset("Codec-SUPERB/Voxceleb1_test_original")

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Running on:", device)

README.md:   0%|          | 0.00/480 [00:00<?, ?B/s]

(…)-00000-of-00003-d0eea6966e0eba2f.parquet:   0%|          | 0.00/441M [00:00<?, ?B/s]

(…)-00001-of-00003-1ab27c4d03da1754.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00002-of-00003-4d6815b5778a4368.parquet:   0%|          | 0.00/435M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4874 [00:00<?, ? examples/s]

Running on: cuda:0


In [None]:
# Materialise the test split into three parallel lists in one pass over the
# dataset: raw waveform, sampling rate and utterance id of every clip.
waveform, sr, id_list = [], [], []
for dp in voxceleb1["test"]:
    waveform.append(dp["audio"]["array"])
    sr.append(dp["audio"]["sampling_rate"])
    id_list.append(dp["id"])

# preprocess_function hands the audio to the feature extractor with
# sampling_rate=16000 and measures clip length in samples, so a clip recorded at
# any other rate would be processed incorrectly without any error. Fail fast.
unexpected_rates = sorted(set(sr) - {16000})
if unexpected_rates:
    raise ValueError(
        f"Expected 16 kHz audio for every clip, found sampling rates: {unexpected_rates}"
    )

In [None]:
def preprocess_id(id_):
    """Return the part of ``id_`` that precedes the first ``"+"``.

    The result is used as the class label for the clip (presumably the speaker
    id -- confirm against the dataset card).

    Args:
        id_: utterance identifier string.

    Returns:
        The prefix before the first ``"+"``. If ``id_`` contains no ``"+"`` the
        whole string is returned unchanged (the previous ``find``-based slice
        dropped the last character in that case, because ``find`` returns -1).
    """
    prefix, _, _ = id_.partition("+")
    return prefix

# Reduce every utterance id to its label prefix, then pair it with the raw
# waveform in a two-column table and wrap that table as a Hugging Face Dataset.
cleaned_id = list(map(preprocess_id, id_list))
df = pd.DataFrame(
    {
        "audio": waveform,
        "id": cleaned_id,
    }
)
hf_dataset_from_df = Dataset.from_pandas(df)

In [None]:
# Clip length, in samples, that preprocess_function pads or truncates to.
# This is only the initial value: the sweep loops reassign this global per run.
target_length = 16_000

In [None]:
# Load the preprocessing configuration published with the
# "facebook/wav2vec2-base" checkpoint; preprocess_function uses it to turn raw
# waveforms into model inputs.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [None]:
def preprocess_function(example):
    """Force one clip to ``target_length`` samples and run the feature extractor.

    Reads the module-level ``target_length`` at call time, so reassigning that
    global changes the clip length used by subsequent calls.

    Args:
        example: mapping with an ``"audio"`` entry holding the raw waveform.

    Returns:
        dict with a single ``"input_values"`` entry: the extractor output with
        its leading (batch) dimension squeezed away.
    """
    samples = example["audio"]
    missing = target_length - len(samples)

    if missing > 0:
        # Too short: right-pad with zeros up to the target length.
        samples = np.pad(samples, (0, missing), "constant")
    elif missing < 0:
        # Too long: keep only the leading target_length samples.
        samples = samples[:target_length]

    encoded = feature_extractor(
        samples,
        sampling_rate=16000,
        return_tensors="pt",
        padding="longest",
    )
    return {"input_values": encoded.input_values.squeeze(0)}

In [None]:
# Label vocabulary: the sorted unique ids, numbered 0..num_labels-1, plus the
# forward (label -> index) and reverse (index -> label) lookup tables.
label_list = sorted(set(cleaned_id))
num_labels = len(label_list)
id2label = dict(enumerate(label_list))
label2id = {name: index for index, name in id2label.items()}

def convert_labels_to_ids(example):
    """Attach the integer class index for ``example["id"]`` as ``"label_ids"``.

    The example is modified in place and also returned; a label that is missing
    from ``label2id`` raises ``KeyError``.
    """
    class_index = label2id[example["id"]]
    example["label_ids"] = class_index
    return example

In [None]:
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy, macro precision/recall/F1 and the mean one-vs-rest EER.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores indexed as
            ``[sample, class]``) and ``label_ids`` (integer class per sample).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``eer``. ``eer`` is the unweighted mean over the classes that occur in
        ``label_ids``; it is NaN when no class has a defined ROC curve.
    """
    # These are the raw model outputs (logits), used directly as ranking scores.
    scores = eval_pred.predictions
    predictions = np.argmax(scores, axis=1)
    labels = eval_pred.label_ids

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_score(labels, predictions, average="macro", zero_division=0)
    recall = recall_score(labels, predictions, average="macro", zero_division=0)
    f1 = f1_score(labels, predictions, average="macro", zero_division=0)

    # Multi-class EER (one-vs-rest)
    eer_vals = []
    for cls in np.unique(labels):
        bin_labels = (labels == cls).astype(int)
        if bin_labels.min() == bin_labels.max():
            # Every sample belongs to `cls`: there are no negatives, the ROC
            # curve is undefined and the root finder below would raise.
            continue
        fpr, tpr, _ = roc_curve(bin_labels, scores[:, cls])
        if len(fpr) > 1:
            # Build the interpolator once; brentq evaluates the objective many
            # times and previously rebuilt it on every evaluation.
            roc = interp1d(fpr, tpr)
            # EER is the operating point where FPR == 1 - TPR.
            eer = brentq(lambda x: 1.0 - x - roc(x), 0.0, 1.0)
            eer_vals.append(eer)
        else:
            eer_vals.append(0.0)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        # Plain float so the value serialises cleanly in the JSON log.
        "eer": float(np.mean(eer_vals)) if eer_vals else float("nan"),
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def nll_loss(logits, labels):
    """Cross-entropy between raw ``logits`` and integer class ``labels``.

    ``labels`` is cast to int64 first, so any integer dtype is accepted.
    """
    targets = labels.long()
    return F.cross_entropy(input=logits, target=targets)

class CustomTrainer(Trainer):  # keep extra arg for safety
    """Trainer that computes its loss with ``nll_loss`` on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run ``model`` on ``inputs`` without the labels and score its logits.

        ``"labels"`` is popped from ``inputs`` (mutating the dict) so the model
        is called without them. ``num_items_in_batch`` is accepted but unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        model_out = model(**inputs)
        loss_value = nll_loss(model_out.logits, targets)
        if return_outputs:
            return loss_value, model_out
        return loss_value

def minimal_data_collator(features):
    """Batch per-example features into tensors placed on the global ``device``.

    Args:
        features: sequence of mappings, each with ``"input_values"`` and
            ``"label_ids"``. All ``"input_values"`` must share one shape,
            because they are stacked along a new leading dimension.

    Returns:
        dict with ``"input_values"`` (the stacked inputs) and ``"labels"``.
    """
    waveforms = [torch.as_tensor(item["input_values"]) for item in features]
    targets = [item["label_ids"] for item in features]
    return {
        "input_values": torch.stack(waveforms).to(device),
        "labels": torch.tensor(targets).to(device),
    }

In [None]:
# Clip lengths, in samples, swept by the training loops (ascending order).
target_lengths = [
    16_000,
    32_000,
    48_000,
    63_360,
    64_000,
    80_000,
    96_000,
    103_680,
    112_000,
    128_000,
    132_400,
    160_000,
    240_000,
    480_000,
    800_000,
    1_104_640,
]

In [None]:
class JsonLoggerCallback(TrainerCallback):
    """Append one JSON line per evaluation to ``filepath``.

    Each line holds the current ``state.epoch`` under ``"epoch"``, followed by
    the evaluation metrics. If the metrics carry their own ``"epoch"`` key, that
    value wins.
    """

    def __init__(self, filepath):
        self.filepath = filepath

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # Nothing to log when the hook is invoked without metrics.
        if metrics is not None:
            entry = {"epoch": state.epoch, **metrics}
            with open(self.filepath, "a") as sink:
                sink.write(json.dumps(entry) + "\n")

In [None]:
# With per-device batch size 32 the 480000-sample run aborts with CUDA
# out-of-memory on the ~40 GB GPU used for the recorded run, which stops the
# whole cell. Lengths from this threshold upwards are trained by the follow-up
# loop (small batches, bf16, gradient checkpointing), so skip them here to let
# the notebook run top to bottom.
max_full_batch_length = 480000

for tl in target_lengths:
    if tl >= max_full_batch_length:
        continue

    print(f"\n Starting run for target_length = {tl}")
    target_length = tl  # <-- updates the global used by preprocess_function

    # Re-encode / label-encode the dataset at the current clip length
    proc_ds = hf_dataset_from_df.map(
        preprocess_function, remove_columns=["audio"], desc=f"Padding to {tl}"
    ).map(convert_labels_to_ids, remove_columns=["id"])
    # Fixed seed: every length is evaluated on the same 80/20 split.
    split_ds = proc_ds.train_test_split(test_size=0.2, seed=42)
    split_ds.set_format("torch")   # leave on CPU; collator moves to GPU

    # Fresh model each time, so runs do not share fine-tuned weights
    model = AutoModelForAudioClassification.from_pretrained(
        "facebook/wav2vec2-base",
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    ).to(device)

    # HF args -- no checkpoint saving. Effective train batch = 32 * 4 = 128.
    args = TrainingArguments(
        output_dir=f"./tmp_{tl}",           # temp dir
        eval_strategy="epoch",
        save_strategy="no",                # nothing saved
        learning_rate=3e-5,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=32,
        num_train_epochs=20,
        warmup_ratio=0.1,
        logging_strategy="epoch",
        metric_for_best_model="eer",
        greater_is_better=False,
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        report_to="none",
        push_to_hub=False,
    )

    json_path = f"eval_{tl}.jsonl"
    if os.path.exists(json_path):     # start fresh: the callback appends
        os.remove(json_path)

    trainer = CustomTrainer(
        model=model,
        args=args,
        train_dataset=split_ds["train"],
        eval_dataset=split_ds["test"],
        compute_metrics=compute_metrics,
        data_collator=minimal_data_collator,
        callbacks=[JsonLoggerCallback(json_path)],
    )

    trainer.train()

    # Tidy GPU / RAM before the next run. Collect garbage first, so tensors kept
    # alive only by reference cycles are freed before the CUDA cache is released;
    # in the opposite order their memory stays reserved by the allocator.
    del model, trainer, split_ds, proc_ds
    gc.collect()
    torch.cuda.empty_cache()

print(f"\n All lengths below {max_full_batch_length} completed!  Check the `eval_<length>.jsonl` files.")


 Starting run for target_length = 16000


Padding to 16000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6197,3.652077,0.085128,0.005908,0.030986,0.008811,0.474773
2,3.5353,3.507611,0.118974,0.006429,0.047442,0.011173,0.270712
3,3.3157,3.234349,0.201026,0.09815,0.101061,0.062447,0.222422
4,3.0836,3.04397,0.270769,0.088451,0.15285,0.097644,0.184088
5,2.9155,2.891683,0.304615,0.107831,0.180421,0.119891,0.16514
6,2.7629,2.745325,0.327179,0.133703,0.204214,0.140774,0.158228
7,2.6304,2.63901,0.354872,0.16051,0.228036,0.158965,0.154876
8,2.5084,2.575936,0.387692,0.230648,0.262887,0.199338,0.154956
9,2.4058,2.486103,0.41641,0.251907,0.289728,0.233661,0.15069
10,2.3036,2.413656,0.443077,0.269867,0.314836,0.258283,0.151972



 Starting run for target_length = 32000


Padding to 32000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6179,3.646003,0.089231,0.015583,0.033406,0.011209,0.415058
2,3.5311,3.460649,0.168205,0.021499,0.073353,0.029103,0.233345
3,3.2251,3.052583,0.353846,0.137563,0.207362,0.142171,0.153819
4,2.8743,2.779036,0.447179,0.300212,0.297776,0.236681,0.122072
5,2.6044,2.500363,0.530256,0.311413,0.36883,0.30091,0.097164
6,2.3582,2.305407,0.573333,0.388563,0.413629,0.359314,0.090623
7,2.1569,2.108551,0.636923,0.445611,0.479533,0.430015,0.071187
8,1.9731,1.954893,0.702564,0.56872,0.568163,0.517467,0.066675
9,1.8139,1.81373,0.723077,0.579828,0.595234,0.555102,0.058657
10,1.6605,1.695503,0.730256,0.628778,0.608401,0.576604,0.055084



 Starting run for target_length = 48000


Padding to 48000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6162,3.641789,0.087179,0.009106,0.031744,0.010668,0.386644
2,3.5132,3.402206,0.211282,0.059413,0.100064,0.051822,0.196261
3,3.1458,2.941633,0.42359,0.222482,0.259974,0.195251,0.118127
4,2.749,2.608725,0.545641,0.343664,0.375745,0.318556,0.090205
5,2.4503,2.311013,0.642051,0.417427,0.46913,0.413267,0.06815
6,2.164,2.054217,0.708718,0.540687,0.572084,0.523496,0.049363
7,1.9084,1.81094,0.780513,0.640855,0.665864,0.62759,0.040671
8,1.6746,1.604537,0.821538,0.685653,0.718974,0.68842,0.032113
9,1.4904,1.435237,0.850256,0.722538,0.755016,0.721653,0.029105
10,1.3244,1.296751,0.859487,0.783696,0.775765,0.753859,0.025123



 Starting run for target_length = 63360


Padding to 63360:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6152,3.640035,0.084103,0.010564,0.029796,0.009834,0.373214
2,3.5004,3.368121,0.260513,0.137314,0.134421,0.086047,0.175979
3,3.0829,2.844554,0.484103,0.267034,0.329094,0.260823,0.076592
4,2.6518,2.473274,0.64,0.474289,0.472105,0.424733,0.054148
5,2.323,2.150564,0.716923,0.537202,0.556416,0.502332,0.037252
6,2.0236,1.879001,0.801026,0.66618,0.68125,0.647206,0.024116
7,1.7578,1.629181,0.844103,0.728477,0.746504,0.716673,0.018799
8,1.5234,1.414068,0.887179,0.78713,0.806204,0.781902,0.013682
9,1.3274,1.252818,0.902564,0.832194,0.837293,0.818398,0.017479
10,1.1633,1.09938,0.935385,0.93543,0.898744,0.88871,0.011243



 Starting run for target_length = 64000


Padding to 64000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6151,3.638748,0.087179,0.010351,0.031327,0.010779,0.367656
2,3.496,3.36488,0.253333,0.125666,0.129636,0.07569,0.183494
3,3.0834,2.844255,0.500513,0.264858,0.335785,0.26454,0.079405
4,2.6532,2.485329,0.619487,0.381478,0.446107,0.378949,0.058045
5,2.3281,2.161625,0.728205,0.581336,0.581401,0.536353,0.033137
6,2.0224,1.850978,0.831795,0.69257,0.723927,0.688804,0.020546
7,1.751,1.624139,0.843077,0.727563,0.745381,0.712405,0.017833
8,1.5366,1.428079,0.88,0.798966,0.798177,0.774109,0.016429
9,1.3467,1.271196,0.909744,0.896137,0.846994,0.839233,0.018101
10,1.1865,1.124871,0.930256,0.905882,0.892018,0.888097,0.01336



 Starting run for target_length = 80000


Padding to 80000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.615,3.639718,0.092308,0.035471,0.034043,0.013456,0.368839
2,3.4957,3.352193,0.285128,0.173954,0.149005,0.101878,0.16145
3,3.0693,2.830564,0.501538,0.272455,0.342769,0.274611,0.070867
4,2.6204,2.408215,0.686154,0.482969,0.523665,0.475471,0.035865
5,2.2487,2.085759,0.755897,0.620223,0.617286,0.577054,0.030773
6,1.9513,1.795969,0.832821,0.732814,0.737895,0.711585,0.019177
7,1.689,1.548038,0.887179,0.804726,0.810658,0.79066,0.008895
8,1.4666,1.345212,0.924103,0.887003,0.86672,0.852449,0.003862
9,1.2769,1.17709,0.939487,0.92129,0.893911,0.887118,0.004311
10,1.1146,1.03796,0.962051,0.959772,0.939223,0.944978,0.004734



 Starting run for target_length = 96000


Padding to 96000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6146,3.638688,0.094359,0.037157,0.035063,0.01402,0.360819
2,3.4917,3.348127,0.286154,0.139724,0.152452,0.105572,0.161919
3,3.0604,2.822367,0.523077,0.330734,0.361506,0.291794,0.061139
4,2.6161,2.407708,0.70359,0.491358,0.543321,0.490961,0.03946
5,2.252,2.067536,0.769231,0.642503,0.64415,0.602168,0.021052
6,1.9363,1.781086,0.857436,0.738692,0.773412,0.744049,0.016249
7,1.6712,1.537045,0.871795,0.770124,0.788969,0.763269,0.013257
8,1.4428,1.329894,0.913846,0.872009,0.853651,0.839355,0.012923
9,1.2474,1.161613,0.935385,0.937481,0.896697,0.894758,0.012366
10,1.086,1.016819,0.954872,0.952199,0.928422,0.932983,0.012103



 Starting run for target_length = 103680


Padding to 103680:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6142,3.638162,0.09641,0.011683,0.035964,0.013485,0.36462
2,3.4889,3.335325,0.314872,0.145114,0.169023,0.119161,0.163803
3,3.0542,2.824181,0.524103,0.306822,0.365284,0.297829,0.07526
4,2.6039,2.392124,0.702564,0.528361,0.541371,0.487276,0.039146
5,2.2317,2.045266,0.775385,0.669162,0.647916,0.615087,0.019688
6,1.9178,1.752837,0.859487,0.744032,0.773225,0.741077,0.011717
7,1.6575,1.527541,0.899487,0.812075,0.826184,0.804178,0.00911
8,1.4426,1.320824,0.924103,0.882055,0.860128,0.84691,0.00628
9,1.2508,1.152479,0.945641,0.945469,0.909496,0.910293,0.006198
10,1.0877,1.00434,0.973333,0.977843,0.957361,0.964483,0.003639



 Starting run for target_length = 112000


Padding to 112000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.614,3.638369,0.100513,0.009784,0.03805,0.013001,0.362256
2,3.4917,3.340605,0.299487,0.142563,0.159376,0.111367,0.152286
3,3.0622,2.836296,0.514872,0.322198,0.353973,0.289258,0.081529
4,2.6052,2.398132,0.705641,0.534955,0.562687,0.514356,0.039152
5,2.2305,2.050162,0.797949,0.664263,0.676551,0.643134,0.023378
6,1.9329,1.779993,0.859487,0.76405,0.769692,0.745281,0.015481
7,1.6762,1.536445,0.895385,0.81004,0.815055,0.795793,0.013779
8,1.4426,1.33056,0.918974,0.86963,0.857131,0.846789,0.010701
9,1.248,1.150451,0.948718,0.927762,0.909092,0.904483,0.007909
10,1.0835,1.002036,0.971282,0.967949,0.953223,0.957805,0.007152



 Starting run for target_length = 128000


Padding to 128000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.614,3.639291,0.114872,0.009865,0.046136,0.014283,0.368194
2,3.4946,3.357116,0.291282,0.186051,0.155223,0.107774,0.170367
3,3.0637,2.802845,0.54359,0.317126,0.376708,0.304734,0.063517
4,2.5865,2.382162,0.704615,0.540399,0.550542,0.505888,0.032574
5,2.2212,2.048642,0.777436,0.686191,0.650748,0.625217,0.026148
6,1.9207,1.752254,0.872821,0.754788,0.78632,0.757004,0.011725
7,1.6546,1.521945,0.900513,0.836548,0.826305,0.806217,0.0131
8,1.4329,1.309935,0.925128,0.912235,0.870604,0.860394,0.006048
9,1.2394,1.138512,0.947692,0.926306,0.915946,0.912959,0.004365
10,1.0733,0.994299,0.968205,0.964445,0.953622,0.955775,0.003867



 Starting run for target_length = 132400


Padding to 132400:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6139,3.639435,0.111795,0.008215,0.044111,0.013498,0.372442
2,3.4957,3.359248,0.25641,0.088146,0.129064,0.077006,0.172259
3,3.0781,2.857113,0.52,0.279717,0.354716,0.29095,0.072293
4,2.6262,2.412827,0.677949,0.47837,0.520112,0.457022,0.038208
5,2.2531,2.083358,0.771282,0.655766,0.638287,0.59352,0.025908
6,1.9498,1.79263,0.862564,0.745203,0.773587,0.746323,0.01612
7,1.6808,1.539694,0.902564,0.811337,0.826944,0.802543,0.011601
8,1.4413,1.328891,0.925128,0.85877,0.861643,0.843417,0.01062
9,1.2474,1.15266,0.940513,0.89348,0.895544,0.880519,0.006486
10,1.0855,1.011473,0.96,0.95679,0.934271,0.934347,0.007007



 Starting run for target_length = 160000


Padding to 160000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6139,3.641005,0.115897,0.010859,0.046678,0.013539,0.364441
2,3.5009,3.366131,0.250256,0.13166,0.132221,0.086611,0.170472
3,3.0826,2.830274,0.526154,0.296696,0.363597,0.299555,0.06567
4,2.6142,2.413047,0.700513,0.512514,0.545905,0.49212,0.034909
5,2.2469,2.077893,0.764103,0.6321,0.624576,0.583471,0.026736
6,1.9506,1.794026,0.855385,0.77089,0.766142,0.744124,0.018854
7,1.674,1.541302,0.900513,0.848221,0.830828,0.819072,0.010557
8,1.4513,1.330878,0.929231,0.891686,0.874281,0.866763,0.00691
9,1.262,1.166944,0.94359,0.945651,0.897449,0.898265,0.008349
10,1.1009,1.016429,0.964103,0.941666,0.934365,0.934777,0.003062



 Starting run for target_length = 240000


Padding to 240000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.6115,3.63033,0.098462,0.009112,0.037188,0.01292,0.392219
2,3.4999,3.426035,0.184615,0.041316,0.081245,0.042808,0.230501
3,3.2046,3.003147,0.397949,0.230201,0.243993,0.203967,0.129988
4,2.7814,2.578969,0.595897,0.390199,0.418519,0.359944,0.05182
5,2.4108,2.236853,0.709744,0.505869,0.548482,0.493757,0.038049
6,2.0917,1.965623,0.752821,0.576435,0.606343,0.556129,0.031645
7,1.8267,1.709,0.826667,0.701,0.711828,0.675009,0.022538
8,1.5962,1.489526,0.867692,0.777987,0.788728,0.767036,0.015279
9,1.3843,1.289586,0.913846,0.856721,0.852341,0.836807,0.01115
10,1.2133,1.145183,0.932308,0.86398,0.882126,0.86417,0.008798



 Starting run for target_length = 480000


Padding to 480000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.86 GiB. GPU 0 has a total capacity of 39.56 GiB of which 166.88 MiB is free. Process 5321 has 39.38 GiB memory in use. Of the allocated memory 24.45 GiB is allocated by PyTorch, and 14.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Clip lengths that ran out of GPU memory with the batch-size-32 configuration.
# They are trained here with a memory-saving setup instead.
target_lengths_continued = [480000,  800000, 1104640]

for tl in target_lengths_continued:
    print(f"\n Starting run for target_length = {tl}")
    target_length = tl  # <-- updates the global used by preprocess_function

    # Re-encode / label-encode the dataset at the current clip length
    proc_ds = hf_dataset_from_df.map(
        preprocess_function, remove_columns=["audio"], desc=f"Padding to {tl}"
    ).map(convert_labels_to_ids, remove_columns=["id"])
    # Fixed seed: every length is evaluated on the same 80/20 split.
    split_ds = proc_ds.train_test_split(test_size=0.2, seed=42)
    split_ds.set_format("torch")   # leave on CPU; collator moves to GPU

    # Fresh model each time, so runs do not share fine-tuned weights
    model = AutoModelForAudioClassification.from_pretrained(
        "facebook/wav2vec2-base",
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    ).to(device)
    model.gradient_checkpointing_enable()

    # HF args -- no checkpoint saving. Memory-saving setup: tiny per-device
    # batches with more accumulation (effective train batch 2 * 64 = 128, the
    # same as 32 * 4), bf16 and gradient checkpointing.
    args = TrainingArguments(
        output_dir=f"./tmp_{tl}",           # temp dir
        eval_strategy="epoch",
        save_strategy="no",                # nothing saved
        learning_rate=3e-5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=64,
        per_device_eval_batch_size=1,
        num_train_epochs=20,
        warmup_ratio=0.1,
        bf16=True,
        gradient_checkpointing=True,
        logging_strategy="epoch",
        metric_for_best_model="eer",
        greater_is_better=False,
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        report_to="none",
        push_to_hub=False,
    )

    json_path = f"eval_{tl}.jsonl"
    if os.path.exists(json_path):     # start fresh: the callback appends
        os.remove(json_path)

    trainer = CustomTrainer(
        model=model,
        args=args,
        train_dataset=split_ds["train"],
        eval_dataset=split_ds["test"],
        compute_metrics=compute_metrics,
        data_collator=minimal_data_collator,
        callbacks=[JsonLoggerCallback(json_path)],
    )

    trainer.train()

    # Tidy GPU / RAM before the next run. Collect garbage first, so tensors kept
    # alive only by reference cycles are freed before the CUDA cache is released;
    # in the opposite order their memory stays reserved by the allocator.
    del model, trainer, split_ds, proc_ds
    gc.collect()
    torch.cuda.empty_cache()

print("\n All lengths completed!  Check the `eval_<length>.jsonl` files.")


 Starting run for target_length = 480000


Padding to 480000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.5991,3.61532,0.088205,0.01245,0.032734,0.010931,0.387684
2,3.4499,3.321485,0.335385,0.143289,0.185802,0.128159,0.165858
3,3.1141,2.94872,0.507692,0.263525,0.331869,0.259742,0.093625
4,2.759,2.590778,0.605128,0.40355,0.429528,0.365858,0.063653
5,2.4266,2.272861,0.698462,0.501381,0.53392,0.481234,0.05218
6,2.1241,2.00233,0.753846,0.635835,0.616341,0.581464,0.030062
7,1.8619,1.739829,0.85641,0.835763,0.767156,0.753054,0.01609
8,1.6366,1.540531,0.908718,0.891286,0.854108,0.856999,0.013247
9,1.4416,1.361126,0.933333,0.905055,0.900903,0.898295,0.011038
10,1.2507,1.183695,0.94359,0.921507,0.920737,0.91657,0.009215



 Starting run for target_length = 800000


Padding to 800000:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.5926,3.604883,0.10359,0.036422,0.045969,0.020852,0.352288
2,3.4437,3.346323,0.174359,0.02651,0.087517,0.032958,0.214257
3,3.1386,2.989065,0.414359,0.252837,0.273158,0.212396,0.112608
4,2.7976,2.653159,0.585641,0.375441,0.427123,0.369016,0.073933
5,2.491,2.374904,0.645128,0.393501,0.480026,0.411452,0.064316
6,2.2199,2.085892,0.706667,0.497852,0.540775,0.485722,0.042374
7,1.9732,1.888019,0.74359,0.545989,0.58968,0.540344,0.038506
8,1.7685,1.682809,0.773333,0.584888,0.619622,0.575821,0.031446
9,1.5744,1.5057,0.825641,0.751973,0.705136,0.684671,0.026238
10,1.3877,1.327628,0.876923,0.788876,0.783162,0.757008,0.013091



 Starting run for target_length = 1104640


Padding to 1104640:   0%|          | 0/4874 [00:00<?, ? examples/s]

Map:   0%|          | 0/4874 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Eer
1,3.592,3.603732,0.093333,0.021569,0.040687,0.01337,0.351102
2,3.4341,3.324852,0.181538,0.075714,0.09368,0.044248,0.194237
3,3.1548,3.024584,0.355897,0.21028,0.228378,0.177616,0.113984
4,2.8634,2.739486,0.529231,0.365187,0.380769,0.327622,0.086848
5,2.5629,2.43988,0.603077,0.400984,0.453104,0.385423,0.064482
6,2.2763,2.156895,0.668718,0.446938,0.499582,0.438178,0.055362
7,2.0145,1.931453,0.731282,0.548256,0.578841,0.528504,0.04598
8,1.7942,1.720076,0.755897,0.592053,0.603912,0.560578,0.025067
9,1.6069,1.548928,0.832821,0.740777,0.71362,0.686813,0.026389
10,1.4385,1.370541,0.887179,0.861451,0.801565,0.797836,0.01576



 All lengths completed!  Check the `eval_<length>.jsonl` files.
