# Fine-Tune Wav2Vec2

Adapted from this guide: https://colab.research.google.com/drive/1FjTsqbYKphl9kL-eILgUc-bl4zVThL8F?usp=sharing#scrollTo=e7cqAWIayn6w

## Load Dataset, Processor, and Model

In [1]:
from datasets import load_from_disk, DatasetDict

# Only the "train" split of the dataset saved on disk is loaded here.
dataset = DatasetDict(
    {"train": load_from_disk("../kham_asr_finetune_preprocessed")["train"]}
)

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

In [2]:
# Show the loaded splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 67273
    })
})

In [3]:
from transformers import Wav2Vec2FeatureExtractor, AutoTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Config

# Feature extractor, tokenizer, config and weights all come from one checkpoint.
CHECKPOINT = "openpecha/Garchen_Rinpoche_stt"

# The processor bundles the audio feature extractor with the text tokenizer.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Load the checkpoint's configuration explicitly and hand it to the CTC model.
config = Wav2Vec2Config.from_pretrained(CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(CHECKPOINT, config=config)


## Training and Evaluation

### Define a Data Collator

In [4]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC training examples into a single padded batch.

    Audio inputs and label ids are padded separately because they have
    different lengths and go through different padders (the processor for
    ``input_values``, its tokenizer for ``labels``). Padded label positions
    are replaced with ``-100`` so they are ignored by the loss.

    Args:
        features: One dict per example, each holding ``"input_values"`` and
            ``"labels"``.

    Returns:
        The padded input batch with an added ``"labels"`` tensor.
    """

    # Processor whose ``pad`` handles inputs and whose ``tokenizer.pad`` handles labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both pad calls (True pads to the longest item).
    padding: Union[bool, str] = True
    # Optional maximum length for the padded audio inputs.
    max_length: Optional[int] = None
    # Optional maximum length for the padded label sequences.
    max_length_labels: Optional[int] = None
    # If set, pad the audio inputs to a multiple of this value.
    pad_to_multiple_of: Optional[int] = None
    # If set, pad the label sequences to a multiple of this value.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Honour the configured strategy instead of a hard-coded ``True``.
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad labels with the tokenizer directly; this is the call the
        # deprecated ``as_target_processor()`` context manager redirected to.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 so those positions are ignored by the loss.
        padding_positions = labels_batch["attention_mask"].ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(padding_positions, -100)

        return batch

In [5]:
# Collator passed to the Trainer to pad each batch of examples.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
)

### Define the Training Configuration

In [6]:
# Set the WANDB_PROJECT environment variable for this kernel.
# NOTE(review): presumably this selects the Weights & Biases project the
# training run is logged under — confirm W&B reporting is enabled.
%env WANDB_PROJECT=garchen

env: WANDB_PROJECT=garchen


In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration. Each optimizer step sees
# per_device_train_batch_size * gradient_accumulation_steps = 16 examples per device.
training_args = TrainingArguments(
    output_dir="kham-pre-ft",  # change to a repo name of your choice
    #auto_find_batch_size=True,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-4,
    warmup_steps=500,
    num_train_epochs=3,
    gradient_checkpointing=True,
    fp16=True,
    save_strategy='epoch',  # write a checkpoint at the end of every epoch
)

# There is no eval_dataset: this run only fine-tunes on the training split.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
    processing_class=processor.feature_extractor,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,1.6125
1000,1.2
1500,1.1098
2000,1.0695
2500,1.0288
3000,0.9896
3500,0.9787
4000,0.9451




In [9]:
# Save the fine-tuned weights together with the processor (feature extractor +
# tokenizer) so the directory can be loaded for inference on its own.
model.save_pretrained('garchen-kham-pre-ft')
processor.save_pretrained('garchen-kham-pre-ft')