In [1]:
from transformers import AlbertTokenizerFast

In [1]:
corpus = open('/scratch/props_corpus_train.txt', 'r').readlines()

In [2]:
len(corpus)

3473638

In [7]:
def get_training_corpus(corpus):
    return (
        corpus[i : i + 1000]
        for i in range(0, len(corpus), 1000)
    )


training_corpus = get_training_corpus(corpus)

In [16]:
words = set()
for line in corpus:
    words.update(line.split())

In [17]:
len(words)

3133435

In [4]:
old_tokenizer = AlbertTokenizerFast.from_pretrained("roberta-base")

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'AlbertTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
example = "!Hero | basedOn | Gospel <TSP> !Hero | musicBy | Eddie DeGarmo <TSP> !Hero | subtitle | The Rock Opera"
tokens = old_tokenizer.tokenize(example)

In [7]:
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 32000)





In [17]:
tokenizer.save_pretrained("rdf-albert-tokenizer")

('rdf-albert-tokenizer/tokenizer_config.json',
 'rdf-albert-tokenizer/special_tokens_map.json',
 'rdf-albert-tokenizer/tokenizer.json')

## From scratch

In [2]:
other_tokenizer = SentencePieceUnigramTokenizer()
paths = ["/scratch/props_corpus.txt"]

In [3]:
other_tokenizer.train(files=paths, vocab_size=32000, special_tokens=[
    "<s>",
    "<TSP>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
], show_progress=True)





In [4]:
other_tokenizer.save_model(".", "sentpiece-rdf-tokenizer")

['./sentpiece-rdf-tokenizer-unigram.json']

## Tokenizers library

In [3]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [27]:
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

trainer = trainers.BpeTrainer(vocab_size=32000, special_tokens=["<unk>", "<s>", "</s>", "<TSP>", "<pad>", "<mask>", "|"])

In [28]:
tokenizer.train_from_iterator(get_training_corpus(corpus), trainer=trainer)






In [29]:
cls_token_id = tokenizer.token_to_id("<s>")
sep_token_id = tokenizer.token_to_id("</s>")
tsp_token_id = tokenizer.token_to_id("<TSP>")
print(cls_token_id, sep_token_id)

tokenizer.post_processor = processors.TemplateProcessing(
    single=f"<s>:0 $A:0 <TSP>:0",
    pair=f"<s>:0 $A:0 <TSP>:0 $B:1 </s>:1",
    special_tokens=[("<s>", cls_token_id), ("</s>", cls_token_id), ("<TSP>", sep_token_id)],
)
tokenizer.decoder = decoders.ByteLevel()

1 2


In [30]:
encoding = tokenizer.encode("Adolfo_Suárez_Madrid–Barajas_Airport | runwayLength | 3500.0 <TSP> Adolfo_Suárez_Madrid–Barajas_Airport | location | San_Sebastián_de_los_Reyes")
print(encoding.tokens)
print(encoding.type_ids)

['<s>', 'Ad', 'olfo', '_', 'Su', 'Ã¡rez', '_', 'Madrid', 'âĢĵ', 'Bar', 'aj', 'as', '_', 'Airport', 'Ġ', '|', 'ĠrunwayLength', 'Ġ', '|', 'Ġ3500', '.', '0', 'Ġ', '<TSP>', 'ĠAdolfo', '_', 'Su', 'Ã¡rez', '_', 'Madrid', 'âĢĵ', 'Bar', 'aj', 'as', '_', 'Airport', 'Ġ', '|', 'Ġlocation', 'Ġ', '|', 'ĠSan', '_', 'Sebast', 'iÃ¡n', '_', 'de', '_', 'los', '_', 'R', 'ey', 'es', '<TSP>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [31]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    cls_token="<s>",
    sep_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    bos_token="<s>",
    eos_token="</s>",
    additional_special_tokens=["<TSP>"],
)

In [32]:
wrapped_tokenizer.save_pretrained("rdf-bpe-tokenizer-32k")

('rdf-bpe-tokenizer-32k/tokenizer_config.json',
 'rdf-bpe-tokenizer-32k/special_tokens_map.json',
 'rdf-bpe-tokenizer-32k/tokenizer.json')

In [14]:
from transformers import AutoTokenizer

In [37]:
tok_roberta = AutoTokenizer.from_pretrained("roberta-base")
tok_scratch = AutoTokenizer.from_pretrained("rdf-bpe-tokenizer-32k")

In [16]:
texts = open("/scratch/props_corpus_train.txt", "r").readlines()

In [18]:
texts = texts[:1000]

In [38]:
roberta_encodings = tok_roberta(texts, truncation=False, padding=False)
scratch_encodings = tok_scratch(texts, truncation=False, padding=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors


In [39]:
mean_roberta_length = sum(len(e) for e in roberta_encodings["input_ids"]) / len(roberta_encodings["input_ids"])
mean_scratch_length = sum(len(e) for e in scratch_encodings["input_ids"]) / len(scratch_encodings["input_ids"])

In [26]:
# 16k 
mean_roberta_length, mean_scratch_length

(184.527, 188.479)

In [40]:
# 32k 
mean_roberta_length, mean_scratch_length

(184.527, 178.989)

In [44]:
encoding = tok_scratch.encode("Adolfo_Suárez_Madrid–Barajas_Airport | runwayLength | 3500.0 <TSP> Adolfo_Suárez_Madrid–Barajas_Airport | location | San_Sebastián_de_los_Reyes")

In [47]:
token_map = [tok_scratch.convert_ids_to_tokens(e) for e in encoding]

In [48]:
token_map

['<s>',
 'Ad',
 'olfo',
 '_',
 'Su',
 'Ã¡rez',
 '_',
 'Madrid',
 'âĢĵ',
 'Bar',
 'aj',
 'as',
 '_',
 'Airport',
 'Ġ',
 '|',
 'ĠrunwayLength',
 'Ġ',
 '|',
 'Ġ3500',
 '.',
 '0',
 'Ġ',
 '<TSP>',
 'ĠAdolfo',
 '_',
 'Su',
 'Ã¡rez',
 '_',
 'Madrid',
 'âĢĵ',
 'Bar',
 'aj',
 'as',
 '_',
 'Airport',
 'Ġ',
 '|',
 'Ġlocation',
 'Ġ',
 '|',
 'ĠSan',
 '_',
 'Sebast',
 'iÃ¡n',
 '_',
 'de',
 '_',
 'los',
 '_',
 'R',
 'ey',
 'es',
 '</s>']