In [1]:
# from tokenizers import Tokenizer
# from tokenizers.models import BPE
# from tokenizers.trainers import BpeTrainer
# from tokenizers.pre_tokenizers import Whitespace

# vocab_size = 2000
# tokenizer = Tokenizer(BPE(unk_token="<unk>", single_world=True))
# trainer = BpeTrainer(special_tokens=['<s>',
#  '</s>',
#  '<pad>',
#  '<unk>',
#  '<special0>',
#  '<special1>',
#  '<special2>',
#  '<special3>',
#  '<special4>'], vocab_size=vocab_size)
# tokenizer.pre_tokenizer = Whitespace()

# files = [f"data/codex-m/{split}.txt" for split in ["test", "train", "valid"]]

# tokenizer.train(files, trainer)

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

vocab_size = 30000
tokenizer = Tokenizer(WordPiece(unk_token="<unk>", single_world=True))
trainer = WordPieceTrainer(special_tokens=['<s>',
 '</s>',
 '<pad>',
 '<unk>',
 '<special0>',
 '<special1>',
 '<special2>',
 '<special3>',
 '<special4>'], vocab_size=vocab_size)
tokenizer.pre_tokenizer = Whitespace()

# changing to train_with_rp.txt for relation prediction
# files = [f"data/codex-m/{split}.txt" for split in ["test", "train_with_rp", "valid"]]
files = [f"data/wikidata5m/{split}.txt" for split in ["test", "train", "valid"]]

tokenizer.train(files, trainer)

In [3]:
import os
from transformers import XLMTokenizer
from transformers import PreTrainedTokenizer
from transformers import GPT2Tokenizer

def saveBPETokenizer(tokenizer, prefix):
    # make directory for tokenizer
    path = os.path.join('data/bpe', prefix)
    if not os.path.exists(path):
        os.mkdir(path)
    # save + load data
    main_fname = 'data/bpe/{}/main.json'.format(prefix)
    tokenizer.save(main_fname)
    f = open(main_fname, 'r')
    data = json.load(f)
    f.close()
    #save data in format required by transformers.PretrainedTokenizer
    vocab_fname = 'data/bpe/{}/vocab.json'.format(prefix)
    merges_fname = 'data/bpe/{}/merges.txt'.format(prefix)
    json_object = json.dumps(data['model']['vocab'])
    with open(vocab_fname, "w") as outfile:
        outfile.write(json_object)
    f = open(merges_fname, 'w')
    for x in data['model']['merges']:
        f.write(x +'\n')
    f.close()
    print('Saved')
    
def saveWordPieceTokenizer(tokenizer, prefix):
    # make directory for tokenizer
    path = os.path.join('data/wordpiece', prefix)
    if not os.path.exists(path):
        os.mkdir(path)
    # save + load data
    main_fname = 'data/wordpiece/{}/main.json'.format(prefix)
    tokenizer.save(main_fname)
    print('Saved')



def loadBPETokenizer(prefix):
    vocab_fname = 'data/bpe/{}/vocab.json'.format(prefix)
    merges_fname = 'data/bpe/{}/merges.txt'.format(prefix)
#     tokenizer = XLMTokenizer(vocab_file=vocab_fname, merges_file=merges_fname)
    tokenizer = GPT2Tokenizer(vocab_file=vocab_fname, merges_file=merges_fname, 
                              unk_token='<unk>', 
                              bos_token='<s>', 
                              eos_token='</s>', 
                              add_prefix_space=True)
    return tokenizer

In [4]:
# saveBPETokenizer(tokenizer, 'codexm{}'.format(vocab_size))
saveWordPieceTokenizer(tokenizer, 'wikidata5m_{}'.format(vocab_size))

Saved


In [6]:
# text = train_data[0]
text = 'hello world'
tokenizer.encode_batch(["predict tail: obama | united states of america |", "How are you 😁 ?", text])

[Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=2, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [7]:
text = 'hello world  </s>'
tokenizer.enable_padding(pad_id=2, pad_token="<pad>")
x = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?", text])
x[2].tokens

['hello', 'world', '</s>', '<pad>', '<pad>', '<pad>']

In [12]:
x = tokenizer.encode('predict relation: obama | united states of america | </s>').ids
y = tokenizer.encode('fsadlk jfasdkf jsadlkfjas lkdjf').ids
out = tokenizer.decode_batch([x,y], skip_special_tokens=True)
# for x in out:
#     x = x.replace(' ##', '')
#     print(x)

In [8]:
y = tokenizer.encode('predict tail: obama | united states of america | </s>')
y.tokens

['predict',
 'tail',
 ':',
 'obama',
 '|',
 'united',
 'states',
 'of',
 'america',
 '|',
 '</s>']

In [236]:
tokenizer.encode('<pad>').ids[0]

2

In [209]:
tk = loadBPETokenizer('codexm{}'.format(vocab_size))

In [210]:
text = ' hello world'

In [270]:
tokenized = tk(text, truncation=True, max_length=128, return_tensors="pt")
# tokenized_ids = [0 if token == None else token for token in tokenized.input_ids]
# ''.join(tk.convert_ids_to_tokens(tokenized_ids))

In [59]:
t5tk.pad_token_id

0

In [277]:
import numpy as np
x = [1,2,3,4]
x = np.array(x)
np.stack([x,x])

array([[1, 2, 3, 4],
       [1, 2, 3, 4]])

In [259]:
from transformers import BatchEncoding
da = {}
da['input_ids'] = x.input_ids
da['attention_mask'] = x.attention_mask
da2 = BatchEncoding(da)
da2.input_ids

[3,
 65,
 42,
 36,
 53,
 3,
 77,
 31,
 3,
 34,
 3,
 1115,
 56,
 3,
 1587,
 3,
 617,
 3,
 60,
 3,
 193,
 3,
 111,
 3,
 60,
 3,
 554,
 3,
 777,
 107,
 122,
 211]

In [118]:
train_data = []
fname = 'data/codex-m/train.txt'
f = open(fname, 'r')
for line in f:
    train_data.append(line.strip())

In [197]:
tk.unk_token 

'<unk>'

In [20]:
import json
  
# Opening JSON file
fname = "data/tokenizer-codexm-2k.json"
f = open(fname, 'r')
  
# returns JSON object as 
# a dictionary
data = json.load(f)


In [22]:
kwargs = {
    'pretrained_vocab_files_map ': data
}


In [26]:
# tokenizer =  XLMTokenizer(vocab_file='vocab.json', merges_file='merges.txt')

Calling PreTrainedTokenizer.from_pretrained() with the path to a single file or url is deprecated


IndexError: list index out of range

In [88]:
xlmtk = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

In [89]:
xlmtk.convert_ids_to_tokens([0,1,2,3,4,5,6,7,8])

['<s>',
 '</s>',
 '<pad>',
 '<unk>',
 '<special0>',
 '<special1>',
 '<special2>',
 '<special3>',
 '<special4>']

In [90]:
xlmtk.special_tokens

AttributeError: 'XLMTokenizer' object has no attribute 'special_tokens'

In [74]:
mytk = XLMTokenizer(vocab_file='vocab.json', merges_file='merges.txt')

In [75]:
mytk.convert_ids_to_tokens([0,1,2,3,4,5])

['<unk>', '<s>', '</s>', '<pad>', '<mask>', '!']

In [33]:
tokenizer.save('data/bpe', 'codexm2k')


TypeError: Can't convert 'codexm2k' to PyBool

In [34]:
tokenizer.save

<function Tokenizer.save(self, pretty=False)>

In [39]:
f = open('merges.txt', 'w')
for x in data['model']['merges']:
    f.write(x +'\n')
f.close()

In [42]:
from transformers import XLMTokenizer

In [44]:
tk2 = XLMTokenizer(vocab_file='vocab.json', merges_file='merges.txt')

In [45]:
tk2.vocab_size

2000

In [29]:
from transformers import T5Tokenizer

In [30]:
t5tk = T5Tokenizer.from_pretrained('t5-small')

In [60]:
t5tk.convert_tokens_to_ids('<e>')

2

In [35]:
t5tk('hello <s>')

{'input_ids': [21820, 3, 2, 7, 3155, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [76]:
t5tk.convert_ids_to_tokens([0,1,2,3,4,5])

['<pad>', '</s>', '<unk>', '▁', 'X', '.']