# Fine-Tune Wav2Vec2

Adapted from [this Colab guide](https://colab.research.google.com/drive/1FjTsqbYKphl9kL-eILgUc-bl4zVThL8F?usp=sharing#scrollTo=e7cqAWIayn6w).

## Create Tokenizer Vocabulary

In [1]:
from datasets import load_from_disk

# Load the previously saved DatasetDict from local disk.
dataset = load_from_disk(dataset_path="Data/kham_asr_dataset")

# Bare expression so the notebook displays the splits, features and row counts.
dataset

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 67273
    })
    validation: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 4000
    })
})

In [2]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Config
import torch

# Feature extractor for single-channel audio sampled at 16 kHz; inputs are
# normalized and no attention mask is returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# CTC tokenizer built from the existing ./vocab.json, using "་" as the word
# delimiter token.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="་",
)

# Bundle both so audio and text are processed through a single object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base")
# Use len(tokenizer) rather than tokenizer.vocab_size: the latter only counts
# the entries of vocab.json, while len() also includes any special tokens the
# tokenizer added on top, so every id the tokenizer can emit has an output row.
config.vocab_size = len(processor.tokenizer)
# Wav2Vec2ForCTC uses config.pad_token_id as the CTC blank index, so it has to
# agree with the tokenizer's [PAD] id instead of the checkpoint's default.
config.pad_token_id = processor.tokenizer.pad_token_id

# The CTC head (lm_head) is created from config.vocab_size, so it already has
# the right output size; no manual replacement of the layer is needed.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    config=config,
)

# Fail fast if the head and the tokenizer ever disagree.
assert model.lm_head.out_features == len(processor.tokenizer), (
    "lm_head output size does not match the tokenizer vocabulary"
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Sanity check: the largest label id produced for the training transcripts must
# be a valid index into the tokenizer vocabulary.
#
# The tokenizer is called directly instead of going through the deprecated
# `processor.as_target_processor()` context manager; inside that context
# `processor(text)` simply forwards to the tokenizer.
# `default=-1` keeps a transcript that tokenizes to no ids (or an empty split)
# from raising ValueError in max().
max_id = max(
    (
        max(processor.tokenizer(text).input_ids, default=-1)
        for text in dataset["train"]["uni"]
    ),
    default=-1,
)
print("Max token ID:", max_id)
print("Vocab size:", tokenizer.vocab_size)
assert max_id < tokenizer.vocab_size, (
    f"label id {max_id} is out of range for vocab size {tokenizer.vocab_size}"
)


Max token ID: 78
Vocab size: 79


## Process Dataset

In [8]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and CTC labels.

    Reads ``batch["audio"]`` (using its ``"array"`` and ``"sampling_rate"``
    entries) and the transcript in ``batch["uni"]``, then adds two keys:

    * ``"input_values"`` - the first element of the processor's
      ``input_values`` for the audio array.
    * ``"labels"`` - the tokenizer's ``input_ids`` for the transcript.

    Relies on the module-level ``processor``. Returns the same ``batch``
    object with the two keys added.
    """
    audio = batch["audio"]

    # The processor returns a batch-shaped result; [0] takes the single example.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Tokenize the transcript with the tokenizer directly rather than via the
    # deprecated `processor.as_target_processor()` context manager, which only
    # redirected `processor(...)` to the tokenizer.
    batch["labels"] = processor.tokenizer(batch["uni"]).input_ids
    return batch

In [9]:
# Run prepare_dataset over every split with 4 worker processes. The original
# columns (taken from the train split) are dropped, so only the keys added by
# prepare_dataset remain. Note: this rebinds `dataset`, so re-running this cell
# without reloading the raw dataset will not find the "audio"/"uni" columns.
dataset = dataset.map(
    prepare_dataset,
    num_proc=4,
    remove_columns=dataset.column_names["train"],
)

Map (num_proc=4):   0%|          | 0/67273 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]



In [10]:
# Persist the processed splits so later fine-tuning runs can load them
# directly instead of repeating the map step.
dataset.save_to_disk("Data/kham_asr_finetune_preprocessed")

Saving the dataset (0/34 shards):   0%|          | 0/67273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]