# Fine-Tune Wav2Vec2

Adapted from the guide at <https://colab.research.google.com/drive/1FjTsqbYKphl9kL-eILgUc-bl4zVThL8F?usp=sharing#scrollTo=e7cqAWIayn6w>.

## Create Tokenizer Vocabulary

In [3]:
from datasets import load_from_disk

# On-disk location of the prepared dataset, relative to the notebook's working
# directory.
KHAM_ASR_DATASET_DIR = "Data/kham_asr_dataset"

dataset = load_from_disk(KHAM_ASR_DATASET_DIR)

# Bare last expression so the notebook renders the splits and their columns.
dataset

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 67273
    })
    validation: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__', 'audio', 'transcript'],
        num_rows: 4000
    })
})

In [5]:
def extract_all_chars(batch):
    """Collect the distinct characters used by a batch of transcripts.

    Args:
        batch: Mapping whose "uni" entry is an iterable of transcript strings.

    Returns:
        A dict with two single-element lists:
        "vocab" holds the list of distinct characters (in no particular
        order) and "all_text" holds the transcripts joined by single spaces.
        Because of the join, a space is part of the vocabulary whenever the
        batch contains at least two transcripts.
    """
    joined_text = " ".join(batch["uni"])
    distinct_chars = [*set(joined_text)]
    # Each value is wrapped in a list so the result is a one-row batch.
    return dict(vocab=[distinct_chars], all_text=[joined_text])

# Reduce every split to a single row holding its character set.
vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    # A non-positive batch size hands each split to the function as one batch,
    # so every split yields exactly one "vocab" row.
    batch_size=-1,
    keep_in_memory=True,
    # Drop all original columns; the splits share the train split's schema.
    remove_columns=dataset.column_names["train"],
)

Map:   0%|          | 0/67273 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [6]:
# Merge the character sets of *every* split (train, validation, test, ...).
# Merging only train and test would leave a character that occurs solely in
# the validation transcripts without an id, so it would fall back to "[UNK]".
all_chars = sorted(
    set().union(*(vocabs[split]["vocab"][0] for split in vocabs))
)

# Sorting first makes the character -> id assignment deterministic across runs.
# NOTE(review): the displayed vocabulary contains "\x7f" (the DEL control
# character), which looks like noise in the transcripts; consider cleaning the
# text before building the vocabulary.
vocab_dict = {char: idx for idx, char in enumerate(all_chars)}
vocab_dict

{' ': 0,
 '\x7f': 1,
 '་': 2,
 '།': 3,
 '༕': 4,
 'ཀ': 5,
 'ཁ': 6,
 'ག': 7,
 'གྷ': 8,
 'ང': 9,
 'ཅ': 10,
 'ཆ': 11,
 'ཇ': 12,
 'ཉ': 13,
 'ཊ': 14,
 'ཋ': 15,
 'ཌ': 16,
 'ཎ': 17,
 'ཏ': 18,
 'ཐ': 19,
 'ད': 20,
 'ན': 21,
 'པ': 22,
 'ཕ': 23,
 'བ': 24,
 'བྷ': 25,
 'མ': 26,
 'ཙ': 27,
 'ཚ': 28,
 'ཛ': 29,
 'ཝ': 30,
 'ཞ': 31,
 'ཟ': 32,
 'འ': 33,
 'ཡ': 34,
 'ར': 35,
 'ལ': 36,
 'ཤ': 37,
 'ཥ': 38,
 'ས': 39,
 'ཧ': 40,
 'ཨ': 41,
 'ཪ': 42,
 'ཱ': 43,
 'ི': 44,
 'ུ': 45,
 'ྲྀ': 46,
 'ེ': 47,
 'ོ': 48,
 'ཾ': 49,
 'ྀ': 50,
 'ྃ': 51,
 'ྐ': 52,
 'ྒ': 53,
 'ྔ': 54,
 'ྕ': 55,
 'ྗ': 56,
 'ྙ': 57,
 'ྜ': 58,
 'ྞ': 59,
 'ྟ': 60,
 'ྠ': 61,
 'ྡ': 62,
 'ྣ': 63,
 'ྤ': 64,
 'ྥ': 65,
 'ྦ': 66,
 'ྨ': 67,
 'ྩ': 68,
 'ྫ': 69,
 'ྭ': 70,
 'ྰ': 71,
 'ྱ': 72,
 'ྲ': 73,
 'ླ': 74,
 'ྴ': 75,
 'ྵ': 76,
 'ྶ': 77,
 'ྷ': 78}

In [7]:
# The space is dropped from the vocabulary and the tsheg takes its place at
# id 0 (presumably as the tokenizer's word delimiter; the tokenizer setup is
# outside this section, so confirm it uses the same character).
#
# The tsheg is already a key of vocab_dict because it occurs in the
# transcripts. Re-keying the space onto it and then appending tokens with
# len(vocab_dict) would leave the tsheg's old id unused and give "[UNK]" the
# same id as the last character. Rebuilding the mapping from one ordered token
# list keeps every id unique and contiguous.
WORD_DELIMITER = "་"
SPECIAL_TOKENS = ("[UNK]", "[PAD]")

# Skipping the special tokens as well makes this cell safe to re-run.
symbols = [
    tok for tok in vocab_dict
    if tok not in (" ", WORD_DELIMITER, *SPECIAL_TOKENS)
]
vocab_dict = {
    tok: idx
    for idx, tok in enumerate([WORD_DELIMITER, *symbols, *SPECIAL_TOKENS])
}

# Every id in 0..len-1 must be used exactly once.
assert sorted(vocab_dict.values()) == list(range(len(vocab_dict)))
len(vocab_dict)

80

In [8]:
import json
# Persist the token -> id mapping as vocab.json in the working directory.
# json's default ASCII escaping keeps the file independent of the platform's
# text encoding.
with open("vocab.json", "w") as vocab_file:
    serialized_vocab = json.dumps(vocab_dict)
    vocab_file.write(serialized_vocab)