In [8]:
from glob import glob
from itertools import chain
import sentencepiece as spm
from tokenizers import SentencePieceUnigramTokenizer, Tokenizer

text_sample = """We aimed to evaluate the effect of sleep quality on memory, executive function, and language performance in patients with refractory focal epilepsy and controlled epilepsy and compare these
 with healthy individuals. We prospectively enrolled 37 adolescent and adult patients with refractory focal epilepsy.

How to avoid anti-clockwise rotation animation when reseting rotation from 360deg to 0 deg?

I am creating an animation that looks like a fancy wheel, When resetting rotation from 360deg to 0 deg, It animating the wheel in anti-clockwise direction, How to Avoid this???
HTML
<ul class="cm">
  <li><span>01</span></li>
  <li><span>02</span></li>
  <li><span>03</span></li>
  <li><span>04</span></li>
  <li><span>05</span></li>
  <li><span>06</span></li>
  <li><span>07</span></li>
  <li><span>08</span></li>

</ul>
"""

# Baseline for comparison: number of chunks from a plain whitespace split.
print("Split on whitespace", len(text_sample.split()), "tokens")

# Run the same sample through each SentencePiece model (the number in the model
# file name is taken from the loop variable) and show the resulting pieces.
for t in (4, 8, 16, 32):
    print("-" * 100)
    sp = spm.SentencePieceProcessor(
        model_file=f'spmodels/pile_{t}k.model',
        add_eos=True,
    )
    # The sample is encoded line by line, giving one list of string pieces per
    # line; flatten those lists into a single sequence for counting and display.
    tokenized = [
        piece
        for line_pieces in sp.encode(text_sample.splitlines(), out_type=str)
        for piece in line_pieces
    ]
    print(len(tokenized), "tokens")
    print("-" * 100)
    print(" ".join(tokenized))


Split on whitespace 99 tokens
----------------------------------------------------------------------------------------------------
297 tokens
----------------------------------------------------------------------------------------------------
▁We ▁a im ed ▁to ▁evaluate ▁the ▁effect ▁of ▁sleep ▁quality ▁on ▁memory , ▁ex ec ut ive ▁function , ▁and ▁language ▁performance ▁in ▁patients ▁with ▁re frac t ory ▁fo cal ▁epi le p s y ▁and ▁control led ▁epi le p s y ▁and ▁comp are ▁these </s> ▁with ▁health y ▁individual s . ▁We ▁pro spec t ive ly ▁en roll ed ▁ 37 ▁ ado les cent ▁and ▁adult ▁patients ▁with ▁re frac t ory ▁fo cal ▁epi le p s y . </s> </s> ▁How ▁to ▁avoid ▁anti - c lock wise ▁ro t ation ▁an im ation ▁when ▁re set ing ▁ro t ation ▁from ▁3 60 de g ▁to ▁0 ▁de g ? </s> </s> ▁I ▁am ▁cre ating ▁an ▁an im ation ▁that ▁look s ▁like ▁a ▁f ancy ▁wh e el , ▁When ▁re set ting ▁ro t ation ▁from ▁3 60 de g ▁to ▁0 ▁de g , ▁It ▁an im ating ▁the ▁wh e el ▁in ▁anti - c lock wise ▁direction , ▁How ▁to

In [6]:
"""Save tokenizers into json"""

from tokenizers import SentencePieceUnigramTokenizer

import os

# The exports below are written under "tokenizers/"; create the directory up
# front so a fresh checkout does not fail on the first save (save() is not
# expected to create missing parent directories — confirm against the library).
os.makedirs("tokenizers", exist_ok=True)

# Convert each SentencePiece model file into a `tokenizers` JSON file so it can
# later be loaded with Tokenizer.from_file().
for t in (4, 8, 16, 32):
    print("-"*100)
    print(f"Loading {t}k_model")
    tokenizer = SentencePieceUnigramTokenizer.from_spm(f"spmodels/pile_{t}k.model")
    # pretty=True is passed through to save(); presumably it yields indented,
    # human-readable JSON.
    tokenizer.save(f"tokenizers/pile_{t}.json", pretty=True)

----------------------------------------------------------------------------------------------------
Loading 4k_model
----------------------------------------------------------------------------------------------------
Loading 8k_model
----------------------------------------------------------------------------------------------------
Loading 16k_model
----------------------------------------------------------------------------------------------------
Loading 32k_model


In [11]:
import time
import sentencepiece as spm
from tokenizers import Tokenizer

# Load the benchmark corpus as a list of lines. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector,
# and the explicit encoding keeps the result independent of the platform locale.
# NOTE(review): assumes the enwik8 split is UTF-8 encoded — confirm.
with open("../data/enwik8/train.txt.raw", encoding="utf-8") as raw_file:
    text_sample = raw_file.read().splitlines()

# Benchmark the `tokenizers` JSON exports: encode every line of the corpus in
# one batch per model, then report the total token count and the encode time.
for t in (4, 8, 16, 32):
    print("-"*100)
    print(f"Loading tokenizers {t}k_model")
    tokenizer = Tokenizer.from_file(f"tokenizers/pile_{t}.json")

    # perf_counter() is a monotonic, high-resolution clock meant for timing.
    starttime = time.perf_counter()
    tokenized = tokenizer.encode_batch(text_sample)
    # Stop the clock right after encoding so that the token-count summation and
    # the printing below are not included in the reported time.
    elapsed = time.perf_counter() - starttime

    print(sum(len(x) for x in tokenized), "tokens")
    print("processed in ", int(elapsed), "secs")
    print("-" *100)
    
# Benchmark the native SentencePiece models on the same corpus: encode every
# line per model, then report the total token count and the encode time.
for t in (4, 8, 16, 32):
    print("-"*100)
    print(f"Loading spm {t}k_model")
    sp = spm.SentencePieceProcessor(model_file=f'spmodels/pile_{t}k.model')

    # perf_counter() is a monotonic, high-resolution clock meant for timing.
    starttime = time.perf_counter()
    sptokenized = sp.encode(text_sample)
    # Stop the clock right after encoding so that the token-count summation and
    # the printing below are not included in the reported time.
    elapsed = time.perf_counter() - starttime

    print(sum(len(x) for x in sptokenized), "tokens")
    print("processed in ", int(elapsed), "secs")
    print("-" *100)

----------------------------------------------------------------------------------------------------
Loading tokenizers 4k_model
34051862 tokens
processed in  17 secs
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Loading tokenizers 8k_model
30989028 tokens
processed in  20 secs
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Loading tokenizers 16k_model
26695247 tokens
processed in  19 secs
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Loading tokenizers 32k_model
24625391 tokens
processed in  18 secs
---------------------------