In [1]:
import pandas as pd
import numpy as np
from string import digits
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
import re

In [58]:
# Load in data
eng_df = pd.read_csv('../data/model_ready/csv/processed_english.csv')

In [71]:
words = list(eng_df['word'].values)
phonemes = list(eng_df['phonemes'].values)

In [95]:
#Initialize tokenizers
word_tokenizer = Tokenizer(BPE(unk_token="?"))
phoneme_tokenizer = Tokenizer(BPE(unk_token="?"))

In [61]:
# Commenting out for now
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(special_tokens=["?", "*"], unk_token='?')

note by Antonio `10/5/21` : edited the previous code because it wasn't excluding the words as intended (no idea why the previous code didn't work as expected but below accomplishes what we want).

In [62]:
# Add normalizers to remove accents from non-english words
# from tokenizers.normalizers import NFD, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents()])
from tokenizers.pre_tokenizers import Whitespace

word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()
word_tokenizer.normalizer = normalizer
phoneme_tokenizer.normalizer = normalizer

In [63]:
#Enable padding
word_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='*', length=None, pad_to_multiple_of=None)
phoneme_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='*', length=None, pad_to_multiple_of=None)

In [64]:
words_clean = []
for word in words:
    word_clean = re.sub(r'\d', '', word)
    word_clean = re.sub(r'[^A-Za-z\s]', '?', word)
    words_clean.append(word_clean)
words_clean[:5]

['?bout', '?bout', '?bout', '?bout', '?cause']

In [65]:
# Separate by character
words_clean = [[letter for letter in item] for item in words_clean]

In [73]:
# Separate by phoneme
phonemes = [item.split(' ') for item in phonemes]
phonemes[:5]

[['start'],
 ['start', 'B'],
 ['start', 'B', 'AW1'],
 ['start', 'B', 'AW1', 'T'],
 ['start']]

In [101]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words_clean, trainer=trainer)
phoneme_tokenizer.train_from_iterator(phonemes, trainer=trainer)

In [97]:
# Test Tokenizers
word_test = word_tokenizer.encode(''.join(words_clean[203]))
# word_test = word_tokenizer.encode(words_clean[203])
# phoneme_test = phoneme_tokenizer.encode(phonemes[48][0])

In [98]:
words_clean[203]

['a', 'a', 'r', 'o', 'n', '?', 's']

In [99]:
word_test.ids

[2, 2, 19, 16, 15, 0, 20]

In [84]:
phoneme_test.tokens

['start']

In [85]:
phoneme_test.ids

[50]

In [123]:
# Save tokenizers
word_tokenizer.save("../data/token_encodings/word_tokenizer-eng.json")
phoneme_tokenizer.save('../data/token_encodings/phoneme_tokenizer-eng.json')

In [16]:
# Test on different language
cro_df = pd.read_csv('../data/model_ready/csv/processed_croatian.csv')
cro_df.head()

Unnamed: 0,word,phonemes,label
0,abadžija,start,B
1,abadžija,start B,AE
2,abadžija,start B AE,JH
3,abadžija,start B AE JH,IH
4,abadžija,start B AE JH IH,Y


In [25]:
cro_df.head(10)

Unnamed: 0,word,phonemes,label
0,abadžija,start,B
1,abadžija,start B,AE
2,abadžija,start B AE,JH
3,abadžija,start B AE JH,IH
4,abadžija,start B AE JH IH,Y
5,abadžija,start B AE JH IH Y,AE
6,abadžija,start B AE JH IH Y AE,stop
7,abaiti,start,AE
8,abaiti,start AE,B
9,abaiti,start AE B,AE


In [17]:
cro_test = word_tokenizer.encode_batch(list(cro_df['word'].values))

In [18]:
cro_test[0].ids

[2, 3, 2, 5, 27, 10, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
cro_test[0].tokens

['a',
 'b',
 'a',
 'd',
 'z',
 'i',
 'j',
 'a',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*',
 '*']

In [102]:
cro_test = phoneme_tokenizer.encode_batch(list(cro_df['label'].values))

In [118]:
cro_test[4].tokens

['Y']

In [119]:
cro_test[4].ids

[29]

In [120]:
max_L = 2
ix = 'B'
for item in cro_df['label']:
    if len(item) > max_L:
        max_L = len(item)
        ix = item
max_L

4

In [121]:
eng_tokens = phoneme_tokenizer.encode_batch(list(eng_df['label'].values))

In [122]:
# checking for instances where the second token isn't a pad ID
for i in range(len(eng_tokens)):
    if eng_tokens[i].ids[1] > 0:
        print(i)
        print(eng_tokens[i].ids)

IndexError: list index out of range