In [1]:
import pandas as pd
import numpy as np
from string import digits
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
import re

In [2]:
#Initialize tokenizers
# word_tokenizer = Tokenizer(BPE(unk_token="?"))
# phoneme_tokenizer = Tokenizer(BPE(unk_token="?"))
word_tokenizer = Tokenizer(BPE())
phoneme_tokenizer = Tokenizer(BPE())

In [3]:
# Commenting out for now
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(special_tokens=["?", "*"], unk_token='?')

note by Antonio `10/5/21` : edited the previous code because it wasn't excluding the words as intended (no idea why the previous code didn't work as expected but below accomplishes what we want).

In [4]:
# Read in data
english = '../data/cmudict/cmudict.dict'
data = []

with open(english) as f:
        lines = f.readlines()
for line in lines:
    pairs = line.strip('\n').split(' ', 1)
    if re.search(r'(\d)', pairs[0]) or '#' in pairs[1]:
        # skipping any alternate pronunciations, which are denoted by (2) or (3)
        # similarly, skipping any words of foreign descent, denoted by '#'
        continue
    else:
        # remove 
        pairs[1] = re.sub(r'\d', '', pairs[1])
        pairs[0] = re.sub(r'[^A-Za-z\s]', '?', pairs[0])
        data.append(pairs)

In [5]:
data[:5]

[['?bout', 'B AW T'],
 ['?cause', 'K AH Z'],
 ['?course', 'K AO R S'],
 ['?cuse', 'K Y UW Z'],
 ['?em', 'AH M']]

In [6]:
# Add normalizers to remove accents from non-english words
# from tokenizers.normalizers import NFD, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents()])
from tokenizers.pre_tokenizers import Whitespace

word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()
word_tokenizer.normalizer = normalizer
phoneme_tokenizer.normalizer = normalizer

In [7]:
#Enable padding
word_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='*', length=None, pad_to_multiple_of=None)
phoneme_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='*', length=None, pad_to_multiple_of=None)

In [8]:
# Get words only
words = [item[0] for item in data]
words[:5]

['?bout', '?cause', '?course', '?cuse', '?em']

In [11]:
# words.append(['?'])

In [9]:
# Separate by character
words = [[letter for letter in item] for item in words]

In [10]:
# Get phonemes
phonemes = [item[1] for item in data]
phonemes.append(['start', 'stop'])
phonemes[:5]

['B AW T', 'K AH Z', 'K AO R S', 'K Y UW Z', 'AH M']

In [11]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words, trainer=trainer)
phoneme_tokenizer.train_from_iterator(phonemes, trainer=trainer)

In [12]:
# Test Tokenizers
word_test = word_tokenizer.encode(''.join(words[203]))
phoneme_test = phoneme_tokenizer.encode(words[48][0])

In [13]:
word_test.tokens

['a', 'b', 'i', 'o']

In [14]:
word_test.ids

[2, 3, 10, 16]

In [15]:
phoneme_test.tokens

['a']

In [16]:
phoneme_test.ids

[26]

In [17]:
# Save tokenizers
word_tokenizer.save("../data/token_encodings/word_tokenizer-eng.json")
phoneme_tokenizer.save('../data/token_encodings/phoneme_tokenizer-eng.json')

In [18]:
# Test on different language
cro_df = pd.read_csv('../data/model_ready/csv/processed_croatian.csv')
cro_df.head()

Unnamed: 0,word,phonemes,label
0,abadžija,start,B
1,abadžija,start B,AE
2,abadžija,start B AE,JH
3,abadžija,start B AE JH,IH
4,abadžija,start B AE JH IH,Y


In [19]:
cro_test = word_tokenizer.encode_batch(list(cro_df['word'].values))

In [20]:
cro_test[0].ids

[2, 3, 2, 5, 27, 10, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [21]:
cro_test[0].tokens

['a',
 'b',
 'a',
 'd',
 'z',
 'i',
 'j',
 'a',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*']