In [55]:
import pandas as pd
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

In [34]:
#Initialize tokenizers
word_tokenizer = Tokenizer(BPE())
phoneme_tokenizer = Tokenizer(BPE())

In [3]:
# Commenting out for now
# from tokenizers.trainers import BpeTrainer

# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [35]:
# Read in data
english = '../datasets/cmudict.dict'
data = []
with open(english, 'r') as infile:
        for line in infile:
                    data.append((line.rstrip('\n')))
data[:5]

["'bout B AW1 T",
 "'cause K AH0 Z",
 "'course K AO1 R S",
 "'cuse K Y UW1 Z",
 "'em AH0 M"]

In [56]:
# Remove accents
# from tokenizers.normalizers import NFD, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents()])
from tokenizers.pre_tokenizers import Whitespace

word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()
word_tokenizer.normalizer = normalizer
phoneme_tokenizer.normalizer = normalizer

In [39]:
# Get words only
words = [item.split(' ')[0] for item in data]
words[:5]

["'bout", "'cause", "'course", "'cuse", "'em"]

In [40]:
# Remove extras
for word in words:
    if '(' in word:
        words.remove(word)

In [41]:
# Separate by character
words = [[letter for letter in item] for item in words]

In [42]:
# Get phonemes
phonemes = [item.split(' ')[1:] for item in data]
phonemes.append(['start', 'stop'])
phonemes[:5]

[['B', 'AW1', 'T'],
 ['K', 'AH0', 'Z'],
 ['K', 'AO1', 'R', 'S'],
 ['K', 'Y', 'UW1', 'Z'],
 ['AH0', 'M']]

In [59]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words)
phoneme_tokenizer.train_from_iterator(phonemes)

In [44]:
# Test Tokenizers
word_test = word_tokenizer.encode(''.join(words[203]))
phoneme_test = phoneme_tokenizer.encode(words[48][0])

In [45]:
''.join(words[203])

'ability'

In [13]:
word_test.tokens

['a', 'b', 'i', 'l', 'i', 't', 'y']

In [16]:
word_test.ids

[6, 7, 14, 17, 14, 25, 30]

In [14]:
phoneme_test.tokens

['a']

In [15]:
phoneme_test.ids

[29]

In [60]:
# Save tokenizers
word_tokenizer.save("word_tokenizer-eng.json")
phoneme_tokenizer.save('phoneme_tokenizer-eng.json')

In [20]:
# Test on different language
cro_df = pd.read_csv('../datasets/processed/csv/processed_croatian.csv')

In [62]:
cro_test = word_tokenizer.encode(list(cro_df['word'].values)[1])

In [63]:
cro_test.tokens

['a', 'b', 'a', 'd', 'z', 'i', 'j', 'a']

In [64]:
cro_test.ids

[6, 7, 6, 9, 31, 14, 15, 6]