In [6]:
!pip install tokenizers

Collecting tokenizers
  Downloading https://files.pythonhosted.org/packages/6b/15/1c026f3aeafd26db30cb633d9915aae666a415179afa5943263e5dbd55a6/tokenizers-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
     |████████████████████████████████| 3.0MB 4.3MB/s 
Installing collected packages: tokenizers
Successfully installed tokenizers-0.8.0


In [7]:
BIG_FILE_URL = 'https://raw.githubusercontent.com/dscape/spell/master/test/resources/big.txt'

# Let's download the file and save it somewhere
from requests import get

# Fetch first and only open the destination once the download has succeeded, so a
# failed request cannot leave an empty or truncated big.txt behind.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = get(BIG_FILE_URL, timeout=60)

if response.status_code == 200:  # HTTP status code 200 means "OK"
    with open('big.txt', 'wb') as big_f:
        big_f.write(response.content)
else:
    print("Unable to get the file: {}".format(response.reason))


In [8]:
# For the user's convenience, `tokenizers` provides some very high-level classes encapsulating
# the overall pipeline for various well-known tokenization algorithms.
# Everything described below can be replaced by the ByteLevelBPETokenizer class.

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# Start from an empty (i.e. untrained) Byte-Pair Encoding model.
tokenizer = Tokenizer(BPE())

# Decoder: lets us go back from a tokenized input to text.
tokenizer.decoder = ByteLevelDecoder()

# Pre-tokenizer: responsible for converting the input to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# Normalizers wrapped in a Sequence are executed in the listed order:
# unicode normalization (NFKC) first, then lower-casing.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

In [9]:
from tokenizers.trainers import BpeTrainer

# Configure the trainer with the details of the vocabulary we want to generate:
# its target size, a progress display, and the ByteLevel alphabet as the
# initial alphabet.
trainer = BpeTrainer(
    initial_alphabet=ByteLevel.alphabet(),
    vocab_size=25000,
    show_progress=True,
)
tokenizer.train(trainer, ["big.txt"])

trained_size = tokenizer.get_vocab_size()
print("Trained vocab size: {}".format(trained_size))

Trained vocab size: 25000


In [10]:
# Save the model into the current directory; the cell output lists the generated files.
tokenizer.model.save(".")

['./vocab.json', './merges.txt']

In [32]:
# Let's tokenize a simple input, with the model loaded from vocab.json / merges.txt
tokenizer.model = BPE('vocab.json', 'merges.txt')
encoding = tokenizer.encode("hello ashraf, alphabetically")

# Show the tokens first, then their ids (both lines are printed with the same label).
for encoded_view in (encoding.tokens, encoding.ids):
    print("Encoded string: {}".format(encoded_view))

# Map the ids back to text.
decoded = tokenizer.decode(encoding.ids)
print("Decoded string: {}".format(decoded))

Encoded string: ['Ġhell', 'o', 'Ġash', 'ra', 'f', ',', 'Ġalph', 'abet', 'ically']
Encoded string: [16594, 78, 4275, 481, 69, 11, 21631, 24393, 2135]
Decoded string:  hello ashraf, alphabetically
