In [23]:
!pip install jiwer torchcodec



In [2]:
import os, sys, math, random, json
from pathlib import Path
import numpy as np
import torch

from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from tokenizers.pre_tokenizers import Whitespace
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer
import soundfile as sf
import torchaudio

In [3]:
print("torch:", torch.__version__)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

torch: 2.8.0+cu126
device: cuda


In [24]:
train_dataset = load_dataset("librispeech_asr", "clean", split="train.100")
val_dataset = load_dataset("librispeech_asr", "clean", split="validation")

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

In [25]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
train_dataset.shape

(28539, 6)

In [27]:
val_dataset.shape

(2703, 6)

In [28]:
dataset = train_dataset.train_test_split(test_size=0.4, seed=42)["train"]

In [29]:
from datasets import Audio

train_dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
val_dataset = val_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [30]:
def preprocess(batch):
    speech = batch["audio"]["array"]   # waveform as numpy array
    batch["input_values"] = processor(speech, sampling_rate=16000).input_values[0]
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

train_dataset = train_dataset.map(preprocess)
val_dataset = val_dataset.map(preprocess)

In [31]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [32]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [13]:
from jiwer import wer

In [41]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Copy labels to avoid modifying the Trainer's object
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions & references
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER with jiwer
    wer_score = wer(label_str, pred_str)

    return {"wer": wer_score}

In [16]:
model = Wav2Vec2ForCTC.from_pretrained(
    "microsoft/wavlm-large",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
).to(device)

You are using a model of type wavlm to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model.freeze_feature_extractor()



In [45]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wavlm-ctc-ex-1",
    group_by_length=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=3,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=100,
    learning_rate=3e-4,
    weight_decay=0.005,
    warmup_steps=100,
    lr_scheduler_type="linear",
    adam_beta1= 0.9,
    adam_beta2= 0.999,
    adam_epsilon= 1e-08,
    max_grad_norm= 1.0,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none"
)

In [34]:
from torch.optim import Adam

optimizer = Adam(
    model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.999),
    eps=1e-8
)

In [52]:
import torch.nn as nn

# Example: increase dropout to 0.1
model.dropout = nn.Dropout(p=0.1)

In [53]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.feature_extractor,
    optimizers=(optimizer, None)
)

  trainer = Trainer(


In [54]:
# from google.colab import drive
# drive.mount('/content/drive')

In [55]:
trainer.train(resume_from_checkpoint="/content/wavlm-ctc-ex-1/checkpoint-2400")



Step,Training Loss,Validation Loss,Wer
2500,0.6325,0.365358,0.350447
2600,0.6098,0.347094,0.336256
2700,0.5974,0.336605,0.325962
2800,0.5755,0.321358,0.310375
2900,0.5714,0.30949,0.307305
3000,0.5553,0.302762,0.296864
3100,0.5333,0.296,0.282398
3200,0.5378,0.287477,0.272012
3300,0.5216,0.278511,0.266792
3400,0.5035,0.272775,0.263281




TrainOutput(global_step=6422, training_loss=0.28220631971570126, metrics={'train_runtime': 23648.9723, 'train_samples_per_second': 2.172, 'train_steps_per_second': 0.272, 'total_flos': 1.9831169342257893e+19, 'train_loss': 0.28220631971570126, 'epoch': 3.0})

In [56]:
from google.colab import drive
drive.mount('/content/drive')

# copy file/folder to Drive
!cp -r /content/wavlm-ctc-ex-1 /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Save model + processor
model.save_pretrained("/content/wavlm-ctc-ex-1")
processor.save_pretrained("/content/wavlm-ctc-ex-1")

[]

In [58]:
!cp -r /content/wavlm-ctc-ex-1 /content/drive/MyDrive/

# FaceBook Model Eval

In [59]:
librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

In [60]:
facebookModel = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to("cuda")
facebookProcessor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [70]:
from datasets import Audio

librispeech_eval = librispeech_eval.cast_column("audio", Audio(sampling_rate=16000))

In [71]:
def map_to_pred(batch):
    # extract all audio arrays
    audio_arrays = [x["array"] for x in batch["audio"]]

    # process batch
    inputs = processor(audio_arrays, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to(model.device)).logits

    # decode predictions
    predicted_ids = torch.argmax(logits, dim=-1)
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    batch["transcription"] = transcriptions
    return batch

# run evaluation
result = librispeech_eval.map(map_to_pred, batched=True, batch_size=8, remove_columns=["audio"])

Map:   0%|          | 0/2620 [00:00<?, ? examples/s]

In [72]:
from jiwer import wer, cer

refs = result["text"]    # <- this might be a Series
hyps = result["transcription"]

# Convert to plain Python lists
refs = list(refs)
hyps = list(hyps)

print("WER:", wer(refs, hyps))
print("CER:", cer(refs, hyps))

WER: 0.4332014607425441
CER: 0.1350619827371861
