# WordPiece

In [1]:
# Training corpus shared by both tokenizers below.
# NOTE(review): absolute path that looks like a Colab Drive mount — adjust when running elsewhere.
path = '/content/drive/MyDrive/NLP/CA1_Codes/data/All_Around_the_Moon.txt'

# Sample sentences encoded by each trained tokenizer.
# NOTE(review): "sNew" in s1 looks like a typo, but the recorded outputs below
# tokenize it exactly as written — confirm intent before changing it.
s1 = (
    'This darkness is absolutely killing! '
    'If we ever take this trip again, '
    'it must be about the time of the sNew Moon!'
)
s2 = (
    'This is a tokenization task. '
    'Tokenization is the first step in a NLP pipeline. '
    'We will be comparing the tokens generated by each tokenization model.'
)

In [2]:
# Token passed as `unk_token` to the WordPiece and BPE models.
Unknown_token = "<UNK>"
# Special tokens handed to both trainers; the unknown token comes first and is
# taken from Unknown_token so the two definitions cannot drift apart.
Special_tokens = [Unknown_token, "<SEP>", "<MASK>", "<CLS>"]

In [13]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
# Build a tokenizer around a WordPiece model, passing Unknown_token as the
# model's `unk_token`. The vocabulary is learned later by `tokenizer.train`.
tokenizer = Tokenizer(WordPiece(unk_token=Unknown_token))

In [14]:
from tokenizers.trainers import WordPieceTrainer
# WordPiece trainer configured with the notebook's special tokens; all other
# trainer options are left at the library defaults.
trainer = WordPieceTrainer(special_tokens=Special_tokens)

In [15]:
from tokenizers.pre_tokenizers import Whitespace
# Attach the Whitespace pre-tokenizer to the WordPiece tokenizer. This is set
# before the training cell that follows.
tokenizer.pre_tokenizer = Whitespace()

In [16]:
# Learn the WordPiece vocabulary from the single corpus file at `path`.
tokenizer.train(files=[path], trainer=trainer)

In [17]:
# Size of the trained WordPiece vocabulary, counting added (special) tokens,
# which is also what the call does when no argument is given.
tokenizer.get_vocab_size(with_added_tokens=True)

17557

In [22]:
# Display every WordPiece vocabulary entry in sorted order. Iterating the
# vocab mapping yields its keys (the token strings).
sorted(tokenizer.get_vocab())

['!',
 '!!',
 '!!!"',
 '!!"',
 '!"',
 "!'",
 '!)',
 '!--',
 '!--"',
 '!}',
 '"',
 '",',
 '"--',
 '";',
 '#',
 '##!',
 '##!"',
 '##"',
 '##$',
 "##'",
 '##(',
 '##)',
 '##*',
 '##,',
 '##-',
 '##--',
 '##---',
 '##.',
 '##/',
 '##0',
 '##00',
 '##0000',
 '##1',
 '##11',
 '##16',
 '##2',
 '##20',
 '##21',
 '##25',
 '##27',
 '##3',
 '##33',
 '##38',
 '##4',
 '##41',
 '##44',
 '##45',
 '##5',
 '##50',
 '##51',
 '##52',
 '##541',
 '##57',
 '##6',
 '##60',
 '##62',
 '##7',
 '##71',
 '##8',
 '##88',
 '##89',
 '##9',
 '##98',
 '##:',
 '##;',
 '##?',
 '##?"',
 '##A',
 '##AB',
 '##ABALOO',
 '##AC',
 '##ACE',
 '##ACH',
 '##ACHING',
 '##ACK',
 '##ACT',
 '##AD',
 '##ADELPHIA',
 '##ADEMARK',
 '##AEL',
 '##AF',
 '##AG',
 '##AGRAP',
 '##AIM',
 '##AIMER',
 '##AIN',
 '##AK',
 '##AL',
 '##ALIZ',
 '##ALOO',
 '##AMS',
 '##AN',
 '##ANA',
 '##ANIED',
 '##ANNA',
 '##ANTAB',
 '##AP',
 '##APTER',
 '##AR',
 '##ARD',
 '##ARK',
 '##ART',
 '##ARTURE',
 '##ARY',
 '##AS',
 '##ASE',
 '##ASH',
 '##AST',
 '##AT',
 '##AT

In [19]:
# Encode the first sample sentence with the trained WordPiece tokenizer and
# print the resulting token strings on a single line.
output1 = tokenizer.encode(s1, add_special_tokens=True)
print(output1.tokens)

['This', 'darkness', 'is', 'absolutely', 'killing', '!', 'If', 'we', 'ever', 'take', 'this', 'trip', 'again', ',', 'it', 'must', 'be', 'about', 'the', 'time', 'of', 'the', 's', '##N', '##ew', 'Moon', '!']


In [20]:
# Encode the second sample sentence with the trained WordPiece tokenizer and
# print the resulting token strings on a single line.
output2 = tokenizer.encode(s2, add_special_tokens=True)
print(output2.tokens)

['This', 'is', 'a', 'to', '##ken', '##ization', 'task', '.', 'To', '##ken', '##ization', 'is', 'the', 'first', 'step', 'in', 'a', 'N', '##L', '##P', 'pip', '##el', '##ine', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', '##ken', '##s', 'generated', 'by', 'each', 'to', '##ken', '##ization', 'model', '.']


# Byte-Pair Encoding

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
# Build a second tokenizer around a BPE model, passing the same Unknown_token
# as `unk_token`. The vocabulary is learned later by `bpe_tokenizer.train`.
bpe_tokenizer = Tokenizer(BPE(unk_token=Unknown_token))

In [4]:
from tokenizers.trainers import BpeTrainer
# BPE trainer configured with the same special tokens as the WordPiece trainer;
# all other trainer options are left at the library defaults.
bpe_trainer = BpeTrainer(special_tokens=Special_tokens)

In [5]:
from tokenizers.pre_tokenizers import Whitespace
# Attach the same Whitespace pre-tokenizer used for the WordPiece tokenizer,
# so both models are trained with identical pre-tokenization.
bpe_tokenizer.pre_tokenizer = Whitespace()

In [6]:
# Learn the BPE merges and vocabulary from the same corpus file at `path`.
bpe_tokenizer.train(files=[path], trainer=bpe_trainer)

In [7]:
# Size of the trained BPE vocabulary, counting added (special) tokens,
# which is also what the call does when no argument is given.
bpe_tokenizer.get_vocab_size(with_added_tokens=True)

16552

In [8]:
# Display every BPE vocabulary entry in sorted order. Iterating the vocab
# mapping yields its keys (the token strings).
sorted(bpe_tokenizer.get_vocab())

['!',
 '!!',
 '!!!"',
 '!!"',
 '!"',
 "!'",
 '!)',
 '!--',
 '!--"',
 '!}',
 '"',
 '",',
 '"--',
 '";',
 '#',
 '$',
 '%',
 "'",
 '\'"',
 "''",
 "'',",
 "''.",
 "',",
 '\'?"',
 '(',
 '($',
 '(-----',
 '(“',
 ')',
 ')!"',
 ')(',
 '),',
 ').',
 '*',
 '**',
 '***',
 '+',
 ',',
 ',"',
 ",'",
 ',--',
 ',--"',
 ',”',
 '-',
 '--',
 '--"',
 '---',
 '----',
 '-----',
 '-----)',
 '--.',
 '--?"',
 '.',
 '."',
 '."[',
 ".'",
 '.\'"',
 '.)',
 '.,',
 '.--',
 '..',
 '...',
 '..."',
 '...?"',
 '.:',
 '.[',
 '.]',
 '.}',
 '.”',
 '/',
 '//',
 '0',
 '00',
 '000',
 '0000',
 '000ft',
 '1',
 '10',
 '100',
 '1000',
 '11',
 '113',
 '115',
 '1152',
 '118',
 '11th',
 '12',
 '120',
 '13',
 '130',
 '1300',
 '13th',
 '14',
 '14th',
 '15',
 '150',
 '1500',
 '1541',
 '1598',
 '16',
 '160',
 '1600',
 '1611',
 '1625',
 '1640',
 '1645',
 '16457',
 '1647',
 '1665',
 '1666',
 '1671',
 '1688',
 '16th',
 '17',
 '1712',
 '1718',
 '1745',
 '1760',
 '1762',
 '1789',
 '17th',
 '18',
 '180',
 '1816',
 '1827',
 '1838',
 '1841',
 '

In [11]:
# Encode the first sample sentence with the trained BPE tokenizer and print
# the token strings. This rebinds `output1`, which the WordPiece section also
# assigns.
output1 = bpe_tokenizer.encode(s1, add_special_tokens=True)
print(output1.tokens)

['This', 'darkness', 'is', 'absolutely', 'killing', '!', 'If', 'we', 'ever', 'take', 'this', 'trip', 'again', ',', 'it', 'must', 'be', 'about', 'the', 'time', 'of', 'the', 's', 'New', 'Moon', '!']


In [12]:
# Encode the second sample sentence with the trained BPE tokenizer and print
# the token strings. This rebinds `output2`, which the WordPiece section also
# assigns.
output2 = bpe_tokenizer.encode(s2, add_special_tokens=True)
print(output2.tokens)

['This', 'is', 'a', 'to', 'ken', 'ization', 'task', '.', 'T', 'ok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', 'k', 'ens', 'generated', 'by', 'each', 'to', 'ken', 'ization', 'model', '.']
