In [1]:
from custom_tokenizer import BytePairEncodingTokenizer,WordPieceTokenizer
import time

with open('preprocessed_text.txt','r') as f:
    text = f.read()


wp_tok = WordPieceTokenizer(num_tokens=20000,max_word_count=100000)
bpe_tok = BytePairEncodingTokenizer(num_tokens=20000,max_word_count=100000)


In [2]:
text[:1000]

"one of the other reviewers has mentioned that after watching just 1 oz episode you ' ll be hooked . they are right , as this is exactly what happened with me . the first thing that struck me about oz was its brutality and unflinching scenes of violence , which set in right from the word go . trust me , this is not a show for the faint hearted or timid . this show pulls no punches with regards to drugs , sex or violence . its is hardcore , in the classic use of the word . it is called oz as that is the nickname given to the oswald maximum security state penitentary . it focuses mainly on emerald city , an experimental section of the prison where all the cells have glass fronts and face inwards , so privacy is not high on the agenda . em city is home to many . aryans , muslims , gangstas , latinos , christians , italians , irish and more . so scuffles , death stares , dodgy dealings and shady agreements are never far away . i would say the main appeal of the show is due to the fact that

In [3]:
# training wordpiece tokenizer

start = time.time()

wp_tok([text])
wp_tok.train(iterations=3,min_pair_freq=100)

end = time.time()

print(f'total training time : {round((end - start) / 60 , 2)} minutes')

total training time : 1.36 minutes


In [4]:
start = time.time()

bpe_tok([text])
bpe_tok.train(iterations=3,min_pair_freq=100)

end = time.time()

print(f'total training time : {round((end - start) / 60 , 2)} minutes')

total training time : 1.52 minutes


In [5]:
len(wp_tok.vocab),len(bpe_tok.vocab)

(20013, 20013)

In [6]:
# 20013 = 20000 words + 5 special tokens + 8 punctuations

In [7]:
# tokenizing sequences

# set maxlen
wp_tok.maxlen = 20
bpe_tok.maxlen = 20

seq = "the french lost in normandy ?"

wp_seq = wp_tok.tokenize(seq)
bpe_seq = bpe_tok.tokenize(seq)

In [8]:
# wordpiece tokenization

' '.join(wp_tok.i2w[t] for t in wp_seq)

'<cls> the french lost in norman ##d <unk> ? <sep>'

In [9]:
# byte-pair tokenization

' '.join(bpe_tok.i2w[t] for t in bpe_seq)

'<cls> the french lost in nor mandy ? <sep>'

In [10]:
wp_padded_seq = wp_tok.add_padding(wp_seq)
bpe_padded_seq = bpe_tok.add_padding(bpe_seq)

In [11]:
wp_padded_seq

[1, 5, 721, 441, 15, 3259, 8304, 3, 59, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [12]:
bpe_padded_seq

[1, 5, 721, 441, 15, 939, 8086, 59, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [14]:
# testing subword splits

words = list(set(text.split()))[3000:3020]

for w in words:
    
    print(w,' -- ',wp_tok._split_oov(w))

moisturiser  --  <unk>
trains  --  trains
translvanian  --  trans ##l ##v ##a ##n ##i ##a ##n
bachar  --  bach ##a ##r
lucina  --  luc ##in ##a
crucification  --  <unk>
amovie  --  am ##o ##v ##i ##e
socomm  --  so ##co ##mm
wesa  --  wes ##a
tasogare  --  tas ##o ##g ##a ##r ##e
outfox  --  out ##f ##o ##x
eire  --  e ##i ##r ##e
disfigurement  --  <unk>
uresevsky  --  u ##r ##e ##s ##e ##v ##s ##k <unk>
oneshoe  --  ones ##h ##o ##e
thugaboo  --  thug ##a ##b ##oo
razed  --  ra ##z ##e ##d
proprieties  --  prop ##r ##i ##e ##t ##ies
kids  --  kids
sanitizes  --  san ##i ##t ##i ##z ##e ##s


In [15]:
for w in words:
    
    print(w,' -- ',bpe_tok._split_oov(w))

moisturiser  --  moi stu rise r
trains  --  trains
translvanian  --  trans l van ian
bachar  --  b ac har
lucina  --  luc in a
crucification  --  cr u c if i cat i on
amovie  --  a movie
socomm  --  so com m
wesa  --  we sa
tasogare  --  ta so ga re
outfox  --  out fox
eire  --  e i re
disfigurement  --  dis figure men t
uresevsky  --  u re se vs ky
oneshoe  --  one shoe
thugaboo  --  thug abo o
razed  --  r az ed
proprieties  --  pro pr ie ties
kids  --  kids
sanitizes  --  san it i z es
