In [1]:
import pandas as pd
import numpy as np
from string import digits
from tokenizers import Tokenizer
from tokenizers.models import WordLevel, BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import re

Using BPE algorithm poses as a problem since unseen phonemes are broken down at the character level, i.e. `'IH'` -> `['I', 'H']`. this is a problem for our label set as our crossentropyloss function expects a single label, not two. A workaround to this is to use WordLevel algorithm as it doesn't break down any words (or characters in our case), it simply leaves the string as is. Downside is that instead of tokenizer handling how words are broken into character-level, we have to handle this ourselves.

In [2]:
# Load in data
# adding spacing is done since WordLevel won't do this on its own (unlike BPE)
# we'll add this step to datamodule.
eng_df = pd.read_csv('../data/model_ready/csv/processed_english.csv')
# below adds spacing to words
eng_df['word'] = eng_df['word'].map(lambda row: ' '.join(list(row)))
# additional minor filtering due to the fact that unwanted words still persist.
eng_df['phonemes'] = eng_df['phonemes'].map(lambda row: re.sub(r'\d', '', row))
eng_df['label'] = eng_df['label'].map(lambda row: re.sub(r'\d', '', row))
eng_df = eng_df[~eng_df['phonemes'].str.contains('#')]

In [3]:
eng_df.head()

Unnamed: 0,word,phonemes,label
0,' b o u t,start,B
1,' b o u t,start B,AW
2,' b o u t,start B AW,T
3,' b o u t,start B AW T,stop
4,' c a u s e,start,K


In [4]:
words = list(eng_df['word'].values)
phonemes = list(eng_df['phonemes'].values)

In [5]:
#Initialize tokenizers
word_tokenizer = Tokenizer(WordLevel(unk_token='UNK'))
phoneme_tokenizer = Tokenizer(WordLevel(unk_token='UNK'))

In [6]:
word_trainer = WordLevelTrainer(special_tokens=['PAD', 'UNK'])
phone_trainer = WordLevelTrainer(special_tokens=['PAD', 'UNK'])

In [7]:
# Add normalizers to remove accents from non-english words

normalizer = normalizers.Sequence([NFD(), StripAccents()])
word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()
word_tokenizer.normalizer = normalizer
phoneme_tokenizer.normalizer = normalizer

In [8]:
#Enable padding
word_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='PAD', length=None, pad_to_multiple_of=None)
phoneme_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='PAD', length=None, pad_to_multiple_of=None)

In [9]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words, trainer=word_trainer)
phoneme_tokenizer.train_from_iterator(phonemes, trainer=phone_trainer)

In [10]:
# Test Tokenizers
word_test = word_tokenizer.encode(words[203])
phoneme_test = phoneme_tokenizer.encode(phonemes[48])

In [11]:
word_test.ids

[3, 3, 6, 8, 5, 7]

In [12]:
word_test.tokens

['a', 'a', 'r', 'o', 'n', 's']

In [13]:
phoneme_test.tokens

['start', 'EH', 'S']

In [14]:
phoneme_test.ids

[2, 12, 7]

In [15]:
word_tokenizer.encode('!').ids[0] == word_tokenizer.get_vocab()['UNK']

True

In [16]:
# Test on different language
cro_df = pd.read_csv('../data/model_ready/csv/processed_croatian.csv')
cro_df['word'] = cro_df['word'].map(lambda row: ' '.join(list(row)))
cro_df.head()

Unnamed: 0,word,phonemes,label
0,a b a d ž i j a,start,B
1,a b a d ž i j a,start B,AE
2,a b a d ž i j a,start B AE,JH
3,a b a d ž i j a,start B AE JH,IH
4,a b a d ž i j a,start B AE JH IH,Y


In [17]:
cro_test = word_tokenizer.encode_batch(list(cro_df['word'].values))

In [18]:
cro_test[0].ids

[3, 18, 3, 12, 28, 4, 30, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
cro_test[0].tokens

['a',
 'b',
 'a',
 'd',
 'z',
 'i',
 'j',
 'a',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [20]:
cro_test[0].ids

[3, 18, 3, 12, 28, 4, 30, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [21]:
phoneme_tokenizer.get_vocab()

{'JH': 32,
 'ZH': 40,
 'D': 13,
 'L': 9,
 'AO': 23,
 'G': 22,
 'HH': 24,
 'SH': 31,
 'P': 16,
 'DH': 41,
 'EH': 12,
 'AE': 15,
 'Z': 25,
 'ER': 19,
 'OW': 21,
 'R': 4,
 'NG': 34,
 'AH': 3,
 'TH': 37,
 'UNK': 1,
 'T': 10,
 'PAD': 0,
 'UW': 30,
 'AA': 14,
 'OY': 39,
 'S': 7,
 'N': 5,
 'IY': 18,
 'UH': 38,
 'IH': 6,
 'K': 8,
 'EY': 26,
 'AY': 28,
 'start': 2,
 'B': 17,
 'F': 20,
 'V': 27,
 'W': 29,
 'Y': 33,
 'M': 11,
 'CH': 35,
 'AW': 36}

In [22]:
phoneme_tokenizer.encode('t !').ids

[1, 1]

In [23]:
cro_test_labels = phoneme_tokenizer.encode_batch(list(cro_df['label'].values))

In [24]:
cro_test_labels[4].tokens

['Y']

In [25]:
cro_test_labels[4].ids

[33]

In [26]:
cro_test_phones = phoneme_tokenizer.encode_batch(list(cro_df['phonemes'].values))

In [27]:
cro_test_phones[0].ids

[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [28]:
cro_test_phones[0].tokens

['start',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [29]:
word_tokenizer.get_vocab()['PAD']

0

In [30]:
phoneme_tokenizer.get_vocab()['PAD']

0

In [31]:
# Save tokenizers
word_tokenizer.save("../data/token_encodings/word_tokenizer-eng.json")
phoneme_tokenizer.save('../data/token_encodings/phoneme_tokenizer-eng.json')

# NOTE
Below cell is a **one-time** run, only done to appropriately and persistently process our base data. no need to run again.

In [34]:
# Load in data
# adding spacing is done since WordLevel won't do this on its own (unlike BPE)
# we'll add this step to datamodule.
"""
eng_df = pd.read_csv('../data/model_ready/csv/processed_english.csv')
# below adds spacing to words
# additional minor filtering due to the fact that unwanted words still persist.
eng_df['phonemes'] = eng_df['phonemes'].map(lambda row: re.sub(r'\d', '', row))
eng_df['label'] = eng_df['label'].map(lambda row: re.sub(r'\d', '', row))
eng_df = eng_df[~eng_df['phonemes'].str.contains('#')]
eng_df.to_csv('../data/model_ready/csv/processed_english.csv', index=False)
"""

"\neng_df = pd.read_csv('../data/model_ready/csv/processed_english.csv')\n# below adds spacing to words\n# additional minor filtering due to the fact that unwanted words still persist.\neng_df['phonemes'] = eng_df['phonemes'].map(lambda row: re.sub(r'\\d', '', row))\neng_df['label'] = eng_df['label'].map(lambda row: re.sub(r'\\d', '', row))\neng_df = eng_df[~eng_df['phonemes'].str.contains('#')]\neng_df.to_csv('../data/model_ready/csv/processed_english.csv', index=False)\n"

In [120]:
max_L = 2
ix = 'B'
for item in cro_df['label']:
    if len(item) > max_L:
        max_L = len(item)
        ix = item
max_L

4

In [121]:
eng_tokens = phoneme_tokenizer.encode_batch(list(eng_df['label'].values))

In [122]:
# checking for instances where the second token isn't a pad ID
for i in range(len(eng_tokens)):
    if eng_tokens[i].ids[1] > 0:
        print(i)
        print(eng_tokens[i].ids)

IndexError: list index out of range