In [63]:
import pandas as pd
import numpy as np
from string import digits
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

In [64]:
#Initialize tokenizers
word_tokenizer = Tokenizer(BPE(unk_token="?"))
phoneme_tokenizer = Tokenizer(BPE(unk_token="?"))

In [3]:
# Commenting out for now
# from tokenizers.trainers import BpeTrainer

# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [16]:
# Read in data
english = '../data/cmudict/cmudict.dict'
data = []
with open(english, 'r') as infile:
        for line in infile:
                    data.append((line.rstrip('\n')))
data[:5]

["'bout B AW1 T",
 "'cause K AH0 Z",
 "'course K AO1 R S",
 "'cuse K Y UW1 Z",
 "'em AH0 M"]

In [17]:
# Remove non-english and extra words
for item in data:
    if '#' in item or '(' in item:
        data.remove(item)

In [28]:
# Remove numbers from phonemes
remove_digits = str.maketrans('', '', digits)
for i in range(len(data)):
    data[i] = data[i].translate(remove_digits)
data[:5]

["'bout B AW T",
 "'cause K AH Z",
 "'course K AO R S",
 "'cuse K Y UW Z",
 "'em AH M"]

In [65]:
# Add normalizers to remove accents from non-english words
# from tokenizers.normalizers import NFD, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents()])
from tokenizers.pre_tokenizers import Whitespace

word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()
word_tokenizer.normalizer = normalizer
phoneme_tokenizer.normalizer = normalizer

In [66]:
#Enable padding
word_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='*', length=None, pad_to_multiple_of=None)
phoneme_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='*', length=None, pad_to_multiple_of=None)

In [30]:
# Get words only
words = [item.split(' ')[0] for item in data]
words[:5]

["'bout", "'cause", "'course", "'cuse", "'em"]

In [58]:
words.append(['?'])

In [31]:
# Separate by character
words = [[letter for letter in item] for item in words]

In [32]:
# Get phonemes
phonemes = [item.split(' ')[1:] for item in data]
phonemes.append(['start', 'stop'])
phonemes[:5]

[['B', 'AW', 'T'],
 ['K', 'AH', 'Z'],
 ['K', 'AO', 'R', 'S'],
 ['K', 'Y', 'UW', 'Z'],
 ['AH', 'M']]

In [67]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words)
phoneme_tokenizer.train_from_iterator(phonemes)

In [68]:
# Test Tokenizers
word_test = word_tokenizer.encode(''.join(words[203]))
phoneme_test = phoneme_tokenizer.encode(words[48][0])

In [69]:
word_test.tokens

['a', 'b', 'i', 'm', 'a', 'e', 'l', 's']

In [70]:
word_test.ids

[6, 7, 14, 18, 6, 10, 17, 24]

In [71]:
phoneme_test.tokens

['a']

In [72]:
phoneme_test.ids

[26]

In [73]:
# Save tokenizers
word_tokenizer.save("../data/token_encodings/word_tokenizer-eng.json")
phoneme_tokenizer.save('../data/token_encodings/phoneme_tokenizer-eng.json')

In [74]:
# Test on different language
cro_df = pd.read_csv('../data/model_ready/csv/processed_croatian.csv', index_col = 0)
cro_df.head()

Unnamed: 0,word,phonemes,label
0,abadžija,,start
1,abadžija,start,B
2,abadžija,start B,AE
3,abadžija,start B AE,JH
4,abadžija,start B AE JH,IH


In [75]:
cro_test = word_tokenizer.encode(list(cro_df['word'].values)[1])

In [76]:
cro_test.tokens

['a', 'b', 'a', 'd', 'z', 'i', 'j', 'a']

In [77]:
cro_test.ids

[6, 7, 6, 9, 31, 14, 15, 6]