In [1]:
import pandas as pd
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE

In [2]:
#Initialize tokenizers
word_tokenizer = Tokenizer(BPE())
phoneme_tokenizer = Tokenizer(BPE())

In [3]:
# Commenting out for now
# from tokenizers.trainers import BpeTrainer

# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [4]:
# Read in data
english = '../datasets/cmudict.dict'
data = []
with open(english, 'r') as infile:
        for line in infile:
                    data.append((line.rstrip('\n')))
data[:5]

["'bout B AW1 T",
 "'cause K AH0 Z",
 "'course K AO1 R S",
 "'cuse K Y UW1 Z",
 "'em AH0 M"]

In [5]:
# Whitespace for word tokenization
from tokenizers.pre_tokenizers import Whitespace

word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()

In [6]:
# Get words only
words = [item.split(' ')[0] for item in data]
words[:5]

["'bout", "'cause", "'course", "'cuse", "'em"]

In [7]:
# Remove extras
for word in words:
    if '(' in word:
        words.remove(word)

In [8]:
# Separate by character
words = [[letter for letter in item] for item in words]

In [9]:
# Get phonemes
phonemes = [item.split(' ')[1:] for item in data]
phonemes.append(['start', 'stop'])
phonemes[:5]

[['B', 'AW1', 'T'],
 ['K', 'AH0', 'Z'],
 ['K', 'AO1', 'R', 'S'],
 ['K', 'Y', 'UW1', 'Z'],
 ['AH0', 'M']]

In [10]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words)
phoneme_tokenizer.train_from_iterator(phonemes)

In [11]:
# Test Tokenizers
word_test = word_tokenizer.encode(''.join(words[203]))
phoneme_test = phoneme_tokenizer.encode(words[48][0])

In [12]:
''.join(words[203])

'ability'

In [13]:
word_test.tokens

['a', 'b', 'i', 'l', 'i', 't', 'y']

In [16]:
word_test.ids

[6, 7, 14, 17, 14, 25, 30]

In [14]:
phoneme_test.tokens

['a']

In [15]:
phoneme_test.ids

[29]

In [17]:
# Save tokenizers
word_tokenizer.save("word_tokenizer-eng.json")
phoneme_tokenizer.save('phoneme_tokenizer-eng.json')