In [1]:
import pandas as pd
import numpy as np
from string import digits
from tokenizers import Tokenizer
from tokenizers.models import WordLevel, BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import re

In [2]:
languages = ['english', 'croatian', 'czech', 'dutch', 'finnish', 'french', 'icelandic', 'italian', 
            'polish', 'romanian', 'spanish']

df = pd.DataFrame(columns = ['word', 'phonemes', 'label'])
for language in languages:
    path = '../data/model_ready/csv/'
    language = language
    file_name = path+'processed_'+language+'.csv'
    token_data_table = pd.read_csv(file_name)
    df = pd.concat([df, token_data_table])
df.head()

Unnamed: 0,word,phonemes,label
0,bout,start,B
1,bout,start B,AW
2,bout,start B AW,T
3,bout,start B AW T,stop
4,cause,start,K


In [3]:
# below adds spacing to words
df['word'] = df['word'].map(lambda row: ' '.join(list(row)))

### Commenting out below, filters should already be implemented in the csv files
# additional minor filtering due to the fact that unwanted words still persist.
# df['phonemes'] = df['phonemes'].map(lambda row: re.sub(r'\d', '', row))
# df['label'] = df['label'].map(lambda row: re.sub(r'\d', '', row))
# df = df[~df['phonemes'].str.contains('#')]
# df = df[~df['word'].str.contains('.')]
# df = df[~df['word'].str.contains('(')]

#### Make sure words and phononmes have been filtered and editied

In [25]:
df[df['phonemes'].str.contains('#')].shape[0] + df[df['word'].str.contains('\.')].shape[0] + \
    df[df['label'].str.contains('#')].shape[0] + \
    + df[df['phonemes'].str.contains('\)')].shape[0] + df[df['phonemes'].str.contains('-')].shape[0] + \
    df[df['word'].str.contains('’')].shape[0]

3

In [39]:
df[df['label'].str.contains('#')]

Unnamed: 0,word,phonemes,label
99,a a l e n,start AE L AH N,#
115,a a l t o,start AA L T OW,#
197845,d a n g l a r s,start D AH NG L AA R Z,#


In [46]:
df.iloc[99,2] = 'stop'
df.iloc[115,2] = 'stop'
df.iloc[197845, 2] = 'stop'

In [47]:
df[df['label'].str.contains('#')]

Unnamed: 0,word,phonemes,label


In [48]:
df[df['phonemes'].str.contains('#')].shape[0] + df[df['word'].str.contains('\.')].shape[0] + \
    df[df['label'].str.contains('#')].shape[0] + \
    + df[df['phonemes'].str.contains('\)')].shape[0] + df[df['phonemes'].str.contains('-')].shape[0] + \
    df[df['word'].str.contains('’')].shape[0]

0

In [49]:
words = list(df['word'].values)
phonemes = list(df['phonemes'].values)
labels = list(df['label'].values)
phonemes = phonemes + labels

In [50]:
#Initialize tokenizers
word_tokenizer = Tokenizer(WordLevel(unk_token='UNK'))
phoneme_tokenizer = Tokenizer(WordLevel(unk_token='UNK'))

In [51]:
#Initialize trainers
word_trainer = WordLevelTrainer(special_tokens=['PAD', 'UNK'])
phone_trainer = WordLevelTrainer(special_tokens=['PAD', 'UNK'])

In [52]:
#Remove Whitespace
word_tokenizer.pre_tokenizer = Whitespace()
phoneme_tokenizer.pre_tokenizer = Whitespace()

In [53]:
#Enable padding
word_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='PAD', length=None, pad_to_multiple_of=None)
phoneme_tokenizer.enable_padding(direction='right', pad_id=0, pad_type_id=0, 
               pad_token='PAD', length=None, pad_to_multiple_of=None)

In [54]:
# Train Tokenizers
word_tokenizer.train_from_iterator(words, trainer=word_trainer)
phoneme_tokenizer.train_from_iterator(phonemes, trainer=phone_trainer)

In [43]:
# Baseline Test (make sure it matches english only)
word_test = word_tokenizer.encode(words[203])
phoneme_test = phoneme_tokenizer.encode(phonemes[48])

In [44]:
word_test.ids

[2, 2, 7, 3, 5]

In [45]:
word_test.tokens

['a', 'a', 's', 'e', 'n']

In [46]:
#Test Random word in dataset
idx = np.random.choice(len(words))
word_test = word_tokenizer.encode(words[idx])
phoneme_test = phoneme_tokenizer.encode(phonemes[idx])

In [47]:
word_test.ids

[42, 2, 9, 8, 6, 25, 2, 28, 2, 13]

In [48]:
word_test.tokens

['ż', 'a', 'r', 't', 'o', 'w', 'a', 'ł', 'a', 'm']

In [49]:
#Padding Test
pad_test = word_tokenizer.encode_batch(list(df['word'][1000:1500].values))
pad_test[77].ids

[2, 19, 3, 9, 5, 2, 8, 17, 21, 0, 0, 0]

In [50]:
pad_test[77].tokens

['a', 'b', 'e', 'r', 'n', 'a', 't', 'h', 'y', 'PAD', 'PAD', 'PAD']

In [51]:
#Testing unknowns
phoneme_tokenizer.encode('t !').ids

[1, 1]

In [52]:
#Testing unknowns
word_tokenizer.encode('’').ids

[1]

In [53]:
#Check that longest label is 'stop', should return 4
max_L = 2
ix = 'B'
for item in df['label']:
    if len(item) > max_L:
        max_L = len(item)
        ix = item
max_L

4

In [54]:
tokens = phoneme_tokenizer.encode_batch(list(df['label'].values))
# checking for instances where the second token isn't a pad ID, should return an error
for i in range(len(tokens)):
    if tokens[i].ids[1] > 0:
        print(i)
        print(tokens[i].ids)

IndexError: list index out of range

In [55]:
#Final check for any unwanted punctuation
word_tokenizer.get_vocab()

{'ž': 46,
 'd': 14,
 'º': 80,
 'ï': 60,
 'è': 50,
 'ň': 66,
 'y': 21,
 'ă': 48,
 'é': 27,
 'o': 6,
 'ò': 73,
 'i': 4,
 'ø': 77,
 'ě': 43,
 'PAD': 0,
 'e': 3,
 'ů': 56,
 'í': 29,
 'ÿ': 79,
 'ä': 26,
 'ê': 58,
 'ť': 67,
 'đ': 65,
 'ó': 34,
 'ř': 44,
 'ț': 54,
 'l': 10,
 'î': 57,
 'ć': 37,
 'n': 5,
 'õ': 78,
 'z': 22,
 'x': 32,
 'â': 47,
 't': 8,
 'c': 15,
 'ô': 69,
 'þ': 62,
 'ö': 31,
 'ñ': 52,
 'á': 30,
 'ą': 36,
 'æ': 53,
 's': 7,
 'a': 2,
 'ë': 64,
 'û': 71,
 'ì': 75,
 'p': 16,
 'š': 40,
 'å': 76,
 'UNK': 1,
 'h': 17,
 'f': 24,
 'm': 13,
 'à': 63,
 'č': 35,
 'q': 33,
 'ù': 74,
 'œ': 70,
 'ș': 55,
 'ż': 42,
 'k': 12,
 'ð': 49,
 'w': 25,
 'j': 23,
 'b': 19,
 'r': 9,
 'v': 20,
 'ę': 38,
 'ý': 39,
 'ç': 61,
 'ü': 68,
 'ď': 72,
 'u': 11,
 'g': 18,
 'ú': 51,
 'ń': 45,
 'ź': 59,
 'ś': 41,
 'ł': 28}

In [56]:
phoneme_tokenizer.get_vocab()

{'EH': 4,
 'S': 7,
 'F': 24,
 'V': 23,
 'NG': 37,
 'AA': 12,
 'D': 16,
 'stop': 18,
 'EY': 38,
 'JH': 40,
 'ZH': 41,
 'AW': 43,
 'OY': 46,
 'DH': 45,
 'R': 10,
 'AO': 19,
 'AY': 39,
 'IH': 3,
 'OH': 13,
 'B': 20,
 'TH': 42,
 'Z': 26,
 'Q': 49,
 'IY': 31,
 'AH': 21,
 'PAD': 0,
 'T': 6,
 'EL': 47,
 'K': 8,
 'L': 11,
 'M': 14,
 'AX': 29,
 'DX': 30,
 'start': 2,
 'UW': 17,
 'EM': 48,
 'G': 25,
 'ER': 32,
 'SH': 33,
 'EN': 50,
 'HH': 27,
 'UX': 51,
 'UNK': 1,
 'W': 28,
 'UH': 44,
 'P': 15,
 'CH': 36,
 'N': 9,
 'AXR': 52,
 'AE': 5,
 'OW': 35,
 'IX': 34,
 'Y': 22}

In [57]:
# Save tokenizers
word_tokenizer.save("../data/token_encodings/word_tokenizer-universal.json")
phoneme_tokenizer.save('../data/token_encodings/phoneme_tokenizer-universal.json')