In [1]:
import os
# Hugging Face environment settings, assigned before the datasets/transformers
# imports that follow so they are already in place when those libraries load.
os.environ.update({
    'HF_HOME': 'huggingface',
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_ENABLE_HF_TRANSFER': 'True',
})
from datasets import Audio, load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ConformerForCTC, TrainingArguments, Trainer
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import numpy as np
import evaluate
import pandas as pd

In [2]:
# Processor for the checkpoint fine-tuned in this notebook; later cells use both
# its feature extractor and its tokenizer.
processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-conformer-rel-pos-large-960h-ft",
)

In [2]:
# Load the labelled examples from Train.csv with the "csv" dataset builder.
ds = load_dataset(
    "csv",
    data_files='Train.csv',
)

Downloading and preparing dataset csv/default to C:/Users/alien/Documents/PyCharm-Projects/TIL-2023/ASR/huggingface/datasets/csv/default-612014e5b5727df5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/alien/Documents/PyCharm-Projects/TIL-2023/ASR/huggingface/datasets/csv/default-612014e5b5727df5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Cast the "path" column to an Audio feature with a 16 kHz sampling rate.
ds = ds.cast_column(
    "path",
    Audio(sampling_rate=16_000),
)

def prepare_dataset(batch):
    """Turn one dataset row into processor outputs plus an ``input_length`` field.

    Reads the audio dict from ``batch["path"]`` and the transcript from
    ``batch["annotation"]``, passes both to the notebook-level ``processor``, and
    stores the length of the first ``input_values`` entry under ``input_length``.

    Returns:
        The processor's output object (not the incoming row) with
        ``input_length`` added.
    """
    clip = batch["path"]
    waveform, rate = clip["array"], clip["sampling_rate"]
    encoded = processor(waveform, sampling_rate=rate, text=batch["annotation"])
    encoded["input_length"] = len(encoded["input_values"][0])
    return encoded

# Apply prepare_dataset to every example, using 4 worker processes.
encoded_ds = ds.map(
    prepare_dataset,
    num_proc=4,
)

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC training examples into a single padded batch.

    Audio inputs and label ids have different lengths, so they are padded in two
    separate ``processor.pad`` calls and then merged into one batch.

    Attributes are documented inline below.
    """

    # Processor whose ``pad`` method pads both inputs and labels. The annotation
    # is quoted so it is not evaluated when the class is created (the previous
    # annotation named ``AutoProcessor``, which this notebook never imports).
    processor: "Wav2Vec2Processor"
    # Padding strategy forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return a batch containing inputs and ``labels``.

        Args:
            features: One dict per example. Each must provide ``"input_values"``
                (only its first element is used) and ``"labels"``.

        Returns:
            The padded input batch returned by ``processor.pad`` with a
            ``"labels"`` entry added; label positions that are padding are set
            to -100.
        """
        # Split inputs and labels: they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": example["input_values"][0]} for example in features]
        label_features = [{"input_ids": example["labels"]} for example in features]

        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
        labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

        # Replace label padding with -100 so those positions are ignored by the loss.
        is_padding = labels_batch["attention_mask"].ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(is_padding, -100)

        return batch

# Collator instance passed to the trainer; uses the "longest" padding strategy.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding="longest",
)

In [3]:
# Load the "wer" metric once at module level; compute_metrics below calls wer.compute on each evaluation.
wer = evaluate.load("wer")
def compute_metrics(pred):
    """Score a prediction object and return ``{"wer": ...}``.

    Takes the argmax over the last axis of ``pred.predictions`` to get predicted
    token ids, decodes predictions and labels with the notebook-level
    ``processor``, and passes both string lists to ``wer.compute``.

    Note: ``pred.label_ids`` is modified in place — every -100 entry is replaced
    by the tokenizer's pad token id before decoding.
    """
    label_ids = pred.label_ids
    # -100 marks ignored label positions; swap in the pad id so they can be decoded.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    predicted_ids = np.argmax(pred.predictions, axis=-1)

    hypotheses = processor.batch_decode(predicted_ids)
    # Labels are decoded without grouping repeated tokens.
    references = processor.batch_decode(label_ids, group_tokens=False)

    score = wer.compute(predictions=hypotheses, references=references)
    return {"wer": score}

In [3]:
# CTC model loaded from the same checkpoint name as the processor; the pad token
# id is taken from the processor's tokenizer.
model = Wav2Vec2ConformerForCTC.from_pretrained(
    "facebook/wav2vec2-conformer-rel-pos-large-960h-ft",
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
)

In [6]:
# Freeze the model's feature encoder before fine-tuning.
# NOTE(review): presumably this stops the encoder's parameters from being updated during training — confirm against the installed transformers version.
model.freeze_feature_encoder()

In [None]:
training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="model",
    overwrite_output_dir=True,
    save_strategy='epoch',
    # NOTE(review): save_steps/eval_steps are step counts but both strategies are
    # 'epoch' — confirm whether these two values have any effect.
    save_steps=1,
    save_total_limit=3,
    # --- batching ---
    per_device_train_batch_size=6,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # --- optimisation schedule ---
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_steps=500,
    num_train_epochs=5,
    # --- memory / precision ---
    gradient_checkpointing=True,
    fp16=True,
    # --- evaluation, logging and best-model selection ---
    evaluation_strategy="epoch",
    eval_steps=1,
    logging_steps=100,
    load_best_model_at_end=True,
    # The best checkpoint is chosen by the "wer" value from compute_metrics; lower is better.
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [None]:
class CTCTrainer(Trainer):
    """Trainer subclass with a custom ``training_step``.

    The backward pass is routed through the AMP grad scaler, apex, DeepSpeed, or
    the notebook-level ``accelerator``, depending on which flag is set on the
    trainer.
    """

    def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Args:
            model (:obj:`torch.nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model. They are passed through
                ``self._prepare_inputs`` and then to ``self.compute_loss``.

        Return:
            :obj:`torch.Tensor`: The detached training loss on this batch, already
            divided by ``gradient_accumulation_steps`` when that is greater than 1.
        """
        # NOTE(review): `use_amp`, `use_apex` and `scaler` are read from the base
        # Trainer; these attributes are not present in every transformers release —
        # confirm against the installed version.
        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            # Referenced via the already-imported `torch`; the bare name `autocast`
            # was never imported in this notebook.
            with torch.cuda.amp.autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            # apex is optional, so it is imported only when this branch runs.
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            # `accelerator` is a module-level object looked up at call time; the
            # cell that creates it must have run before training starts.
            accelerator.backward(loss)

        return loss.detach()

In [None]:
from accelerate import Accelerator

# Module-level accelerator; CTCTrainer.training_step calls accelerator.backward
# when none of its AMP/apex/DeepSpeed branches apply.
# NOTE(review): mixed_precision is 'fp8' here while TrainingArguments sets
# fp16=True — confirm the two settings are meant to differ.
accelerator = Accelerator(
    dynamo_backend='inductor',
    mixed_precision='fp8',
)

In [None]:
# Train.csv is loaded as a single "train" split and nothing earlier creates a
# "test" split, so indexing encoded_ds["test"] directly raises KeyError. Use an
# existing "test" split when there is one; otherwise hold out part of "train".
EVAL_FRACTION = 0.1  # share of the training examples held out for evaluation
SPLIT_SEED = 42      # fixed seed so the held-out set is the same on every run

if "test" in encoded_ds:
    train_split, eval_split = encoded_ds["train"], encoded_ds["test"]
else:
    held_out = encoded_ds["train"].train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)
    train_split, eval_split = held_out["train"], held_out["test"]

trainer = CTCTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    # Only the feature extractor is handed to the Trainer here, not the full processor.
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# NOTE(review): trainer.optimizer and trainer.lr_scheduler may not be created
# yet at this point — confirm what accelerator.prepare receives for them.
trainer.model_wrapped, trainer.optimizer, trainer.lr_scheduler = accelerator.prepare(trainer.model_wrapped, trainer.optimizer, trainer.lr_scheduler)
trainer.train()
accelerator.wait_for_everyone()

In [None]:
# Unwrap whatever accelerator.prepare wrapped around the model, then save.
trainer.model_wrapped = accelerator.unwrap_model(trainer.model_wrapped)
trainer.save_model('wav2vec2-conformer.pt')
# The Trainer was constructed with only processor.feature_extractor, so the line
# above does not write the tokenizer files. Save the full processor into the same
# directory so it can be loaded for inference as a complete checkpoint.
processor.save_pretrained('wav2vec2-conformer.pt')

In [None]:
# Inference: build a speech-recognition pipeline from the saved
# 'wav2vec2-conformer.pt' directory.
from transformers import pipeline

transcriber = pipeline(
    task="automatic-speech-recognition",
    model="wav2vec2-conformer.pt",
)
def predict(audio_file):
    """Transcribe ``audio_file`` with the notebook-level ``transcriber``.

    Returns the ``'text'`` field of the transcriber's result, upper-cased.
    """
    result = transcriber(audio_file)
    transcript = result['text']
    return transcript.upper()

In [7]:
# Transcribe every file listed in the "path" column of Test_Advanced.csv and
# write the table, with the new "annotation" column, back to the same file.
test_ds = pd.read_csv('Test_Advanced.csv')
test_ds = test_ds.assign(annotation=test_ds['path'].map(predict))
test_ds.to_csv('Test_Advanced.csv', index=False)