## BPE

In [32]:
import pandas as pd
import numpy as np

# Projects whose CSV exports ('dataset/<name>.csv') make up the tokenizer
# training corpus.
FILE_NAMES = [
    'appceleratorstudio',
    'aptanastudio',
    'bamboo',
    'clover',
    'datamanagement',
    'duracloud',
    'jirasoftware',
    'mesos',
    'moodle',
    'mule',
    'mulestudio',
    'springxd',
    'talenddataquality',
    'talendesb',
    'titanium',
    'usergrid',
]

# Collect every text in a plain list and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
texts = []
for file in FILE_NAMES:
    file_name = 'dataset/' + file + '.csv'
    data = pd.read_csv(file_name)
    # some rows have no description, fill blank to avoid Null
    data = data.fillna(" ")
    # keep only the first 60% of the rows of each project
    split_point = int(len(data) * 0.6)
    data = data[:split_point]
    # NOTE(review): title and description are joined without a separator, so
    # the last word of the title fuses with the first word of the
    # description. Left unchanged so the corpus stays identical - confirm
    # whether a space was intended.
    texts.extend((data['title'] + data['description']).tolist())

# Single-column frame, one row per issue (row order: projects in FILE_NAMES
# order, rows in file order). The index is a fresh 0..n-1 range.
train_data = pd.DataFrame({'text': texts})

print(len(train_data))

# write df to .txt file
np.savetxt('all_tokenizers/tokenizer_training_data.txt', train_data.values, fmt='%s', encoding='utf-8')

print('done')

13981
done


In [1]:
# Initialize a tokenizer
from tokenizers import ByteLevelBPETokenizer

# Special tokens handed to the trainer; the order is kept exactly as given.
bpe_special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Train a byte-level BPE tokenizer on the corpus file.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="all_tokenizers/tokenizer_training_data.txt",
    vocab_size=50257,
    min_frequency=2,
    special_tokens=bpe_special_tokens,
)

# Write the model files into the current directory using the "bbpe_" prefix,
# then save the tokenizer itself to 'config.json'.
tokenizer.save_model(".", "bbpe_")
tokenizer.save('config.json')

## Word-level Tokenizer

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

# Word-level model with "[UNK]" as the unknown token; input is pre-tokenized
# with the Whitespace pre-tokenizer.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Trainer settings; the special-token order is kept exactly as given.
wordlevel_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
wordlevel_trainer = WordLevelTrainer(
    vocab_size=50257,
    special_tokens=wordlevel_special_tokens,
    min_frequency=2,
)

# Train on the corpus file.
tokenizer.train(
    files=['all_tokenizers/tokenizer_training_data.txt'],
    trainer=wordlevel_trainer,
)

# Save the trained tokenizer to the current directory.
tokenizer.save("wordlevel.json")

print('done')

done


In [13]:
# Reload the word-level tokenizer from the file the training cell writes
# ("wordlevel.json" in the working directory). The previous path,
# 'all_tokenizers/word_level/wordlevel.json', is never written by this
# notebook, so a fresh Restart & Run All depended on a manual file move.
tokenizer = Tokenizer.from_file('wordlevel.json')

# Smoke test: the same four-word phrase repeated eight times should encode to
# the same four ids repeated eight times.
sample_text = " ".join(["hello how are you"] * 8)
encoded = tokenizer.encode(sample_text)
encoded.ids

[2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133,
 2875,
 348,
 60,
 133]

## SentencePiece

In [15]:
from transformers import XLNetTokenizer
import sentencepiece as spm

# Train a SentencePiece model on the corpus file; the options are passed as a
# single command-line style flag string.
# NOTE(review): vocab_size is 41783 here while the other tokenizers in this
# notebook use 50257 - the reason is not recorded; confirm.
spm_train_flags = (
    '--input=all_tokenizers/tokenizer_training_data.txt '
    '--model_prefix=spm_tokenizer '
    '--vocab_size=41783'
)
spm.SentencePieceTrainer.Train(spm_train_flags)

# Load 'spm_tokenizer.model' (file name matches the model_prefix above) into
# an XLNet tokenizer; kept as the last expression so its repr is displayed.
XLNetTokenizer('spm_tokenizer.model')

PreTrainedTokenizer(name_or_path='', vocab_size=41783, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='left', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['<eop>', '<eod>']})

## WordPiece

In [16]:
from tokenizers import BertWordPieceTokenizer

# Untrained BERT WordPiece tokenizer. These normalisation options must also
# be used by whatever later loads the resulting vocabulary.
wordpiece_normalization = dict(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=False,
)
tokenizer = BertWordPieceTokenizer(**wordpiece_normalization)

# Training options; the special-token order is kept exactly as given.
wordpiece_training = dict(
    vocab_size=50257,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)
tokenizer.train('all_tokenizers/tokenizer_training_data.txt', **wordpiece_training)

# Write the vocabulary to the current directory with the "BERT" prefix; kept
# as the last expression so the list of written files is displayed.
tokenizer.save_model("./", "BERT")

['./BERT-vocab.txt']

In [17]:
from transformers import BertTokenizer
# Load the WordPiece vocabulary written by the training cell
# ('./BERT-vocab.txt'); the previous path, 'all_tokenizers/word_piece/vocab.txt',
# is never written by this notebook.
# The normalisation flags mirror the training configuration (lowercase=False,
# strip_accents=True, handle_chinese_chars=False). Without them BertTokenizer
# falls back to its default normalisation, which lowercases input, so it
# would not match the cased vocabulary.
BertTokenizer(
    "./BERT-vocab.txt",
    do_lower_case=False,
    strip_accents=True,
    tokenize_chinese_chars=False,
)

PreTrainedTokenizer(name_or_path='', vocab_size=50257, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})