In [1]:
import pandas as pd
import numpy as np
from string import digits
from tokenizers import Tokenizer
from tokenizers.models import WordLevel, BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import re

In [23]:
languages = ['english', 'croatian', 'czech', 'dutch', 'finnish', 'french', 'icelandic', 'italian', 
            'polish', 'romanian', 'spanish']

df = pd.DataFrame(columns = ['word', 'phonemes', 'label'])
for language in languages:
    path = '../data/model_ready/csv/'
    language = language
    file_name = path+'processed_'+language+'.csv'
    token_data_table = pd.read_csv(file_name)
    df = pd.concat([df, token_data_table])
df.head()

Unnamed: 0,word,phonemes,label
0,'bout,start,B
1,'bout,start B,AW
2,'bout,start B AW,T
3,'bout,start B AW T,stop
4,'cause,start,K


In [27]:
# below adds spacing to words
df['word'] = df['word'].map(lambda row: ' '.join(list(row)))
# additional minor filtering due to the fact that unwanted words still persist.
df['phonemes'] = df['phonemes'].map(lambda row: re.sub(r'\d', '', row))
df['label'] = df['label'].map(lambda row: re.sub(r'\d', '', row))
df = df[~df['phonemes'].str.contains('#')]

In [28]:
df.head()

Unnamed: 0,word,phonemes,label
0,' b o u t,start,B
1,' b o u t,start B,AW
2,' b o u t,start B AW,T
3,' b o u t,start B AW T,stop
4,' c a u s e,start,K


In [46]:
words = list(df['word'].values)
phonemes = list(df['phonemes'].values)

In [30]:
#Initialize tokenizers
word_tokenizer = Tokenizer(WordLevel(unk_token='UNK'))
phoneme_tokenizer = Tokenizer(WordLevel(unk_token='UNK'))

In [31]:
#Initialize trainers
word_trainer = WordLevelTrainer(special_tokens=['PAD', 'UNK'])
phone_trainer = WordLevelTrainer(special_tokens=['PAD', 'UNK'])

In [38]:
#Remove Whitespace
word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()

In [39]:
#Enable padding
word_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='PAD', length=None, pad_to_multiple_of=None)
phoneme_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='PAD', length=None, pad_to_multiple_of=None)

In [47]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words, trainer=word_trainer)
phoneme_tokenizer.train_from_iterator(phonemes, trainer=phone_trainer)

In [48]:
# Baseline Test (make sure it matches english only)
word_test = word_tokenizer.encode(words[203])
phoneme_test = phoneme_tokenizer.encode(phonemes[48])

In [49]:
word_test.ids

[2, 2, 9, 6, 5, 7]

In [50]:
word_test.tokens

['a', 'a', 'r', 'o', 'n', 's']

In [60]:
#Test Random word in dataset
idx = np.random.choice(len(words))
word_test = word_tokenizer.encode(words[idx])
phoneme_test = phoneme_tokenizer.encode(phonemes[idx])

In [61]:
word_test.ids

[25, 7, 16, 38, 28, 22, 2, 10, 3, 46, 5, 21]

In [62]:
word_test.tokens

['w', 's', 'p', 'ó', 'ł', 'z', 'a', 'l', 'e', 'ż', 'n', 'y']

In [66]:
#Padding Test
pad_test = word_tokenizer.encode_batch(list(df['word'][1000:1500].values))
pad_test[77].ids

[2, 19, 3, 14, 4, 0, 0, 0, 0, 0, 0, 0]

In [67]:
pad_test[77].tokens

['a', 'b', 'e', 'd', 'i', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']

In [68]:
#Testing unknowns
phoneme_tokenizer.encode('t !').ids

[1, 1]

In [69]:
#Check that longest label is 'stop', should return 4
max_L = 2
ix = 'B'
for item in df['label']:
    if len(item) > max_L:
        max_L = len(item)
        ix = item
max_L

4

In [71]:
tokens = phoneme_tokenizer.encode_batch(list(df['label'].values))
# checking for instances where the second token isn't a pad ID, should return an error
for i in range(len(tokens)):
    if tokens[i].ids[1] > 0:
        print(i)
        print(tokens[i].ids)

IndexError: list index out of range

In [72]:
# Save tokenizers
word_tokenizer.save("../data/token_encodings/word_tokenizer-universal.json")
phoneme_tokenizer.save('../data/token_encodings/phoneme_tokenizer-universal.json')