# Fine-Tune Wav2Vec2

Adapted from this guide: https://colab.research.google.com/drive/1FjTsqbYKphl9kL-eILgUc-bl4zVThL8F?usp=sharing#scrollTo=e7cqAWIayn6w

## Load Dataset and Pretrained Processor

In [1]:
from datasets import load_from_disk

# Directory (relative to the notebook) holding the dataset saved with `save_to_disk`.
DATASET_DIR = "garchen_dataset"

# Load every split stored in that directory.
dataset = load_from_disk(DATASET_DIR)

# Show the splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 17739
    })
    validation: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 3131
    })
})

In [2]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoTokenizer, Wav2Vec2Config
import torch

# Single source of truth for the pretrained checkpoint, so the feature
# extractor, tokenizer and config are always loaded from the same repo.
PRETRAINED_CHECKPOINT = "openpecha/Garchen_Rinpoche_stt"

# Load the audio feature extractor and the text tokenizer, then bundle them
# into one processor object.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Model configuration of the same checkpoint.
config = Wav2Vec2Config.from_pretrained(PRETRAINED_CHECKPOINT)

## Process Dataset

In [3]:
def prepare_dataset(batch):
    """Turn one dataset example into model inputs and token-id labels.

    Relies on the notebook-level ``processor`` (feature extractor + tokenizer).

    Args:
        batch: A single example (mapping). It must provide ``batch["audio"]``
            with ``"array"`` and ``"sampling_rate"`` entries, and
            ``batch["uni"]`` with the transcript text to tokenize.

    Returns:
        The same mapping, with two entries added:
        ``"input_values"`` (processed audio) and ``"labels"`` (token ids of
        ``batch["uni"]``).
    """
    audio = batch["audio"]

    # The processor output is indexed with [0] to keep the single processed
    # example (presumably the output carries a leading batch dimension).
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Tokenize the transcript with the tokenizer directly instead of going
    # through the deprecated `processor.as_target_processor()` context
    # manager, which only redirected `processor(...)` to this tokenizer.
    batch["labels"] = processor.tokenizer(batch["uni"]).input_ids
    return batch

In [4]:
# Preprocess only the validation split.
#
# NOTE: this cell rebinds `dataset` from the loaded DatasetDict to the
# processed validation Dataset, so it cannot be re-run on its own; re-run the
# cell that loads the dataset first.
validation_split = dataset["validation"]

dataset = validation_split.map(
    prepare_dataset,
    # Drop the original columns of the split that is actually being mapped
    # (previously taken from the train split), leaving only what
    # `prepare_dataset` adds.
    remove_columns=validation_split.column_names,
    num_proc=1,
    # NOTE(review): `batch_size` appears to have no effect unless
    # `batched=True` is also passed -- confirm against the `datasets` docs.
    batch_size=1,
)

Map (num_proc=1):   0%|          | 0/3131 [00:00<?, ? examples/s]



In [5]:
# Persist the preprocessed validation split so later runs can load it
# instead of repeating the preprocessing.
PREPROCESSED_VAL_DIR = 'garchen_val_preprocessed'
dataset.save_to_disk(PREPROCESSED_VAL_DIR)

Saving the dataset (0/2 shards):   0%|          | 0/3131 [00:00<?, ? examples/s]