In [1]:
import os
os.environ['HF_HOME'] = 'huggingface'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = 'True'
import math
from datasets import Audio, Dataset, DatasetDict, load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ConformerForCTC, TrainingArguments, Trainer
import torch
import torchaudio
from torch.utils.data.dataloader import DataLoader
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import numpy as np
import evaluate
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub id of the pretrained model; used below to load the processor.
model_name= 'facebook/wav2vec2-conformer-rel-pos-large-960h-ft'
# Local checkpoint directory that the model weights are loaded from in the model-loading cell.
checkpoint_name= 'checkpoints/checkpoint-750/'

In [3]:
# Load the processor for `model_name`; the data collator and the metric function below reuse it.
processor = Wav2Vec2Processor.from_pretrained(model_name)

In [4]:
# Load the clips under `audio_augmented_folder` with the `audiofolder` builder.
ds = load_dataset('audiofolder', data_dir='audio_augmented_folder', split='train')  # specify split to return a Dataset object instead of a DatasetDict

Resolving data files: 100%|██████████| 15000/15000 [00:00<00:00, 26192.67it/s] 
Found cached dataset audiofolder (/home/cheongalc/Documents/til2023/ASR/huggingface/datasets/audiofolder/default-682e93e6f8976099/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


In [5]:
# Hold out 20% of the clips for evaluation. The fixed seed keeps the split, and therefore the
# reported validation WER, reproducible across kernel restarts.
# NOTE(review): the folder name suggests augmented copies of the same utterance; a purely random
# split may place variants of one clip on both sides — confirm whether a grouped split is needed.
ds = ds.train_test_split(test_size=0.2, seed=42)

In [6]:
# Peek at one decoded training example. Index the row before the column:
# `ds['train']['audio'][0]` would decode every audio file in the split just to show the first one.
ds['train'][0]['audio']

{'path': '/home/cheongalc/Documents/til2023/ASR/audio_augmented_folder/audio_augmented/train_00227_1.1.wav',
 'array': array([ 0.00036621,  0.00027466,  0.00027466, ..., -0.01113892,
        -0.05032349, -0.04394531]),
 'sampling_rate': 16000}

In [6]:
def prepare_dataset(batch):
    """Turn a batch of raw examples into model inputs and CTC label ids.

    Written for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping of column name to a list of values. Reads ``batch["audio"]``
            (a list of dicts with an ``"array"`` entry) and ``batch["annotation"]``
            (a list of transcripts).

    Returns:
        The same ``batch`` with four columns added:
        ``input_values`` (one length-1 list per clip, holding the processed waveform),
        ``input_length`` and ``length`` (number of samples per clip), and
        ``labels`` (token ids of the transcript).
    """
    # The processor is imported and loaded inside the function rather than taken from the
    # notebook globals — presumably so the function is self-contained for `num_proc` workers.
    model_name = 'facebook/wav2vec2-conformer-rel-pos-large-960h-ft'
    from transformers import Wav2Vec2Processor
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    # Each clip is processed on its own, so every entry keeps the processor's batch dimension
    # (a length-1 list). The data collator unwraps it with `[0]`.
    batch["input_values"] = [processor(audio["array"], sampling_rate=16000).input_values for audio in batch["audio"]]
    # Measure the waveform itself. Taking `len()` of the length-1 wrapper would always give 1,
    # which leaves `group_by_length` with nothing to group on.
    batch["input_length"] = [len(values[0]) for values in batch["input_values"]]
    batch['length'] = batch["input_length"]
    batch["labels"] = processor(text=batch["annotation"]).input_ids
    return batch


# Preprocess the splits with 8 worker processes, passing 256 examples per `prepare_dataset` call.
ds = ds.map(prepare_dataset, num_proc=8, batched=True, batch_size=256)

                                                                               

In [7]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into one padded batch for CTC training.

    Audio inputs and label ids have different lengths, so each group is padded
    separately through ``processor.pad``.
    """
    # Processor whose `pad` method is used for both the inputs and the labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded unchanged to `processor.pad`.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``labels`` entry added.

        Args:
            features: Examples that each provide ``input_values`` and ``labels``.

        Returns:
            The padded input batch, plus ``labels`` where padded positions are -100.
        """
        audio_items = []
        label_items = []
        for example in features:
            # `input_values` is indexed with [0] to take the first (inner) sequence.
            audio_items.append({"input_values": example["input_values"][0]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_items, padding=self.padding, return_tensors="pt")

        # Wherever the label attention mask is not 1 the position is padding;
        # mark it with -100 so the loss ignores it.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

# Collator instance handed to the trainer; uses the "longest" padding strategy.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [8]:
wer = evaluate.load("wer")
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.
            ``label_ids`` is modified in place: -100 entries become the pad token id.

    Returns:
        ``{"wer": <score>}`` as computed by the loaded ``wer`` metric.
    """
    # Greedy decoding: take the highest-scoring token id along the last axis.
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # Put the pad token back where the loss-ignore marker (-100) was, so the labels can be decoded.
    ignored = pred.label_ids == -100
    pred.label_ids[ignored] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # Reference labels are decoded with token grouping switched off.
    references = processor.batch_decode(pred.label_ids, group_tokens=False)

    score = wer.compute(predictions=hypotheses, references=references)
    return {"wer": score}

In [9]:
# Load the CTC model from the local checkpoint directory, using mean loss reduction and the
# tokenizer's pad token id.
model = Wav2Vec2ConformerForCTC.from_pretrained(
    checkpoint_name,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id)

In [10]:
# Freeze the model's feature encoder before fine-tuning.
model.freeze_feature_encoder()

In [11]:
# Batch sizing: `per_gpu_bs` examples per device per step, accumulated until
# `effective_bs` examples have contributed to one optimizer update.
per_gpu_bs = 4
effective_bs = 32
# Training configuration: checkpoints go to `checkpoints/`, evaluation and saving happen once
# per epoch, and the best checkpoint (lowest WER) is reloaded at the end.
training_args = TrainingArguments(
    output_dir="checkpoints",
    overwrite_output_dir =True,
    per_device_train_batch_size=per_gpu_bs,
    gradient_accumulation_steps=math.ceil(effective_bs/per_gpu_bs),
    learning_rate=1e-4,
    num_train_epochs=20,
    gradient_checkpointing=False,
    fp16=True,
    # bf16=True,  # for A100
    fp16_full_eval=True,
    # bf16_full_eval=True,  # for A100
    group_by_length=True,  # slows down
    evaluation_strategy="epoch",
    save_strategy='epoch',  # epoch
    save_safetensors=True,
    per_device_eval_batch_size=4,
    # NOTE(review): save_steps/eval_steps are presumably ignored while both strategies are
    # 'epoch' — confirm, or remove them.
    save_steps=1,
    eval_steps=1,
    logging_steps=100,
    save_total_limit=3,
    lr_scheduler_type='cosine',
    load_best_model_at_end=True,  # True
    adam_beta1=0.9,
    adam_beta2=0.98,  # follow fairseq fine-tuning config
    warmup_ratio=0.22, # follow Ranger21
    weight_decay=1e-4,  # follow Ranger21
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    report_to=['tensorboard'],
    # 24 data-loading workers everywhere except Windows (os.name == 'nt'), which gets 1.
    dataloader_num_workers=24 if os.name != 'nt' else 1)

In [12]:
class CTCTrainer(Trainer):
    """Trainer with a custom training step that runs the backward pass itself."""

    def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)
        loss = self.compute_loss(model, inputs)

        # Scale the loss so gradients summed over the accumulation steps match one full batch.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # The backward pass is the same on every platform. Use the gradient scaler when the
        # trainer has one; otherwise (no `scaler` attribute, or it is None) fall back to a
        # plain backward pass instead of failing with AttributeError.
        scaler = getattr(self, "scaler", None)
        if scaler is not None:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        return loss.detach()

In [13]:
# The Accelerate wrapper is only created on non-Windows hosts (os.name != 'nt'); later cells
# guard every use of `accelerator` with the same check.
if os.name != 'nt':
    from accelerate import Accelerator
    accelerator = Accelerator(mixed_precision='fp16', dynamo_backend='eager')  # FP8 needs transformer_engine package which is only on Linux with Hopper GPUs

In [14]:
def tri_stage_schedule(epoch: int, max_epoch = None, stage_ratio = [0.1, 0.4, 0.5], peak_lr = None, initial_lr_scale=0.01, final_lr_scale=0.05):
    """Tri-stage learning-rate schedule: linear warmup, constant hold, exponential decay.

    Modelled on
    https://github.com/facebookresearch/fairseq/blob/5ecbbf58d6e80b917340bcbf9d7bdbb539f0f92b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py#L51

    Args:
        epoch: Zero-based index of the current epoch (or step).
        max_epoch: Total number of epochs. ``None`` (default) reads
            ``training_args.num_train_epochs`` at call time.
        stage_ratio: Fractions of ``max_epoch`` spent in warmup, hold and decay; must sum to 1.
        peak_lr: Learning rate during the hold stage. ``None`` (default) reads
            ``training_args.learning_rate`` at call time.
        initial_lr_scale: Warmup starts at ``initial_lr_scale * peak_lr``.
        final_lr_scale: Decay ends at ``final_lr_scale * peak_lr``.

    Returns:
        The absolute learning rate for ``epoch`` as a Python ``float``. This is a learning
        rate, not a multiplicative factor — keep that in mind if it is wrapped in a scheduler
        that multiplies the optimizer's base learning rate by the returned value.

    Raises:
        AssertionError: If ``stage_ratio`` does not sum to 1.
    """
    # Resolve the defaults lazily so a re-created `training_args` is picked up.
    if max_epoch is None:
        max_epoch = training_args.num_train_epochs
    if peak_lr is None:
        peak_lr = training_args.learning_rate
    # Tolerant comparison: ratios such as 0.1 + 0.2 + 0.7 are not exactly 1.0 in floating point.
    assert math.isclose(sum(stage_ratio), 1.0)

    init_lr = initial_lr_scale * peak_lr
    final_lr = final_lr_scale * peak_lr
    warmup_epochs = int(stage_ratio[0] * max_epoch)
    hold_epochs = int(stage_ratio[1] * max_epoch)
    decay_epochs = int(stage_ratio[2] * max_epoch)

    # Stage 1: linear warmup from init_lr towards peak_lr.
    if epoch < warmup_epochs:
        return float(init_lr + (peak_lr - init_lr) * epoch / warmup_epochs)

    # Stage 2: hold at peak_lr. The offset is measured from the end of warmup, so the stage
    # lasts `hold_epochs` epochs rather than ending at the raw ratio `stage_ratio[1]`.
    offset = epoch - warmup_epochs
    if offset < hold_epochs:
        return float(peak_lr)

    # Stage 3: exponential decay from peak_lr to final_lr over `decay_epochs` epochs.
    offset -= hold_epochs
    if decay_epochs > 0 and offset <= decay_epochs:
        decay_factor = -math.log(final_lr_scale) / decay_epochs
        return float(peak_lr * math.exp(-decay_factor * offset))

    # Past the end of the schedule: stay at the final learning rate.
    return float(final_lr)

In [15]:
# Alternative optimizer / scheduler setups kept for reference; none of them is active
# (the `optimizers=` argument below is commented out as well).
# max_steps = math.ceil(training_args.num_train_epochs * len(ds['train']) / training_args.gradient_accumulation_steps / min(training_args.per_device_train_batch_size, len(ds['train'])))
# optimizer = Ranger21(model.parameters(), num_iterations=max_steps, lr=1e-4)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-8, foreach=False)  # https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/config/finetuning/base_960h.yaml
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_steps)
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=tri_stage_schedule)  # following FAIR finetuning settings
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: x)  # constant LR, stays same throughout, for Ranger21

# Build the trainer from the model, arguments, splits, collator and metric defined above.
trainer = CTCTrainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # optimizers=(optimizer, scheduler),
)
if os.name != 'nt':  # windows does not support torch.compile yet
    # pass
    # NOTE(review): `trainer.optimizer` and `trainer.lr_scheduler` are presumably still unset
    # at this point (no `optimizers=` was passed) — confirm that preparing them here has an effect.
    trainer.model_wrapped, trainer.optimizer, trainer.lr_scheduler = accelerator.prepare(trainer.model_wrapped, trainer.optimizer, trainer.lr_scheduler)
trainer.train()
if os.name != 'nt':
    accelerator.wait_for_everyone()

   function: 'forward' (/home/cheongalc/venvs/til2023/lib/python3.9/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py:806)
   reasons:  ___check_obj_id(self, 140658442431072)
to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
   function: 'forward' (/home/cheongalc/venvs/til2023/lib/python3.9/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py:563)
   reasons:  ___check_obj_id(self, 140658442342800)
to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
   function: 'forward' (/home/cheongalc/venvs/til2023/lib/python3.9/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py:660)
   reasons:  ___check_obj_id(self, 140658442343952)
to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
   function: 'forward' (/home/cheongalc/venvs/til2023/lib/python3.9/site-pac

Epoch,Training Loss,Validation Loss,Wer
1,0.6172,0.327153,0.134042
2,0.5341,0.293813,0.116633
3,0.4242,0.253052,0.095108
4,0.3672,0.202164,0.081579
5,0.3198,0.182232,0.069467
6,0.2663,0.147588,0.062314
7,0.2371,0.154062,0.05975
8,0.1804,0.119504,0.045108
9,0.1561,0.120989,0.041835
10,0.1303,0.114256,0.042713


   function: '_apply_relative_embeddings' (/home/cheongalc/venvs/til2023/lib/python3.9/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py:741)
   reasons:  ___check_obj_id(self, 140658442087872)
to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
   function: '__init__' (<string>:2)
   reasons:  tensor 'logits' strides mismatch at index 0. expected 8032, actual 9440
to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
   function: 'forward' (/home/cheongalc/venvs/til2023/lib/python3.9/site-packages/transformers/activations.py:149)
   reasons:  tensor 'input' strides mismatch at index 0. expected 786432, actual 296960
to diagnose recompilation issues, see https://pytorch.org/docs/master/dynamo/troubleshooting.html.
   function: 'forward' (/home/cheongalc/venvs/til2023/lib/python3.9/site-packages/accelerate/utils/operations.py:520)
   reasons:  tensor 'kwargs['lab

In [None]:
# On non-Windows hosts, unwrap the accelerate-prepared model first; then export the model
# and the tokenizer to the `wav2vec2-conformer` directory.
if os.name != 'nt':
    trainer.model_wrapped = accelerator.unwrap_model(trainer.model_wrapped)
trainer.save_model('wav2vec2-conformer')
processor.tokenizer.save_pretrained('wav2vec2-conformer')

In [3]:
# Reload the exported processor and write its tokenizer files into a training checkpoint directory.
from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor.from_pretrained('wav2vec2-conformer')
processor.tokenizer.save_pretrained('checkpoints/checkpoint-7500/')

('checkpoints/checkpoint-7500/tokenizer_config.json',
 'checkpoints/checkpoint-7500/special_tokens_map.json',
 'checkpoints/checkpoint-7500/vocab.json',
 'checkpoints/checkpoint-7500/added_tokens.json')

In [1]:
# Infer
import os
os.environ['HF_HOME'] = 'huggingface'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = 'True'
import torch
import datasets
from transformers import Wav2Vec2Processor, Wav2Vec2ConformerForCTC
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
import pandas as pd
from torch.utils.data import DataLoader

In [2]:
# Load the test clips from the `test` directory.
dataset = datasets.load_dataset("test", split="train")
# Nested KeyDataset: each item is the `array` entry of the row's `audio` dict.
dataset = KeyDataset(KeyDataset(dataset, "audio"), "array")
# Submission table; its `annotation` and `path` columns are written in the final cell.
test_ds = pd.read_csv('Test_Advanced.csv')

Resolving data files:   0%|          | 0/12000 [00:00<?, ?it/s]

Found cached dataset audiofolder (C:/Users/alien/Documents/PyCharm-Projects/TIL-2023/ASR/huggingface/datasets/audiofolder/test-070480b76e15472b/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


In [3]:
def clean(annotation):
    """Remove every apostrophe together with the single character that follows it.

    The tokenizer includes "'" but the TIL dataset does not, so e.g. the "S" of a
    possessive "'S" is removed along with the apostrophe.

    Args:
        annotation: Decoded transcript string.

    Returns:
        The transcript with each "'" and the one character after it removed.
        A string without an apostrophe is returned unchanged.
    """
    head, *rest = annotation.split("'")
    # Every later segment starts right after an apostrophe, so drop its first character.
    # All segments are joined back, so text after a second apostrophe is kept.
    return head + "".join(segment[1:] for segment in rest)

In [None]:
processor = Wav2Vec2Processor.from_pretrained("wav2vec2-conformer")
# The processor is passed as the collate function; the loop below reads
# `batch["input_values"]` from what it returns.
data_loader = DataLoader(dataset, batch_size=32, collate_fn=processor, pin_memory=True, num_workers=4)
# NOTE(review): both paths point at the same checkpoint, so the averaged "ensemble" combines
# a model with itself — confirm this is intended.
checkpoint1 = 'model/checkpoint-2250 aug lb 0.0267'
checkpoint2 = 'model/checkpoint-2250 aug lb 0.0267'
model1 = Wav2Vec2ConformerForCTC.from_pretrained(checkpoint1).to('cuda')
model2 = Wav2Vec2ConformerForCTC.from_pretrained(checkpoint2).to('cuda')
model1.eval()
model2.eval()
# One logits tensor per batch and per model; the next cell averages and decodes them.
logits1 = []
logits2 = []
logits = []
with torch.no_grad():
    for batch in tqdm(data_loader):
        inputs = processor(batch["input_values"], sampling_rate=16000, return_tensors="pt", padding=True).to('cuda')
        outputs1 = model1(**inputs).logits
        outputs2 = model2(**inputs).logits
        # Move the logits to host memory right away. Keeping every batch from both models on
        # the GPU makes device memory grow with the size of the test set.
        logits1.append(outputs1.cpu())
        logits2.append(outputs2.cpu())

  with safe_open(checkpoint_file, framework="pt") as f:


  0%|          | 0/375 [00:31<?, ?it/s]

In [None]:
# Average the two models' logits batch by batch.
logits = []
for first, second in zip(logits1, logits2):
    logits.append((first + second) / 2)

# Greedy decoding: take the argmax over the last axis, then decode each batch into text.
results = []
for batch_logits in logits:
    batch_ids = torch.argmax(batch_logits, dim=-1)
    results += processor.batch_decode(batch_ids)

In [None]:
# Fill in the submission table: cleaned transcripts, and paths reduced to the part after the last '/'.
test_ds['annotation'] = [clean(text) for text in results]
test_ds['path'] = test_ds['path'].map(lambda p: p.split('/')[-1])
test_ds.to_csv('Test_Advanced_6750_0.0205.csv', index=False)  # change file name