In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset, DatasetDict
import re

In [3]:
# BPE model with "[UNK]" configured as its unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Use the Whitespace pre-tokenizer to split input before the BPE model runs.
tokenizer.pre_tokenizer = Whitespace()

# Trainer configuration: target vocabulary size plus the reserved special tokens.
trainer = BpeTrainer(
    vocab_size=20000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

In [4]:
# Load the "wikitext-103-raw-v1" configuration of the Salesforce/wikitext dataset.
wiki_dataset = load_dataset(
    "Salesforce/wikitext",
    name="wikitext-103-raw-v1",
)

In [96]:
# Show the available splits with their features and row counts.
print(wiki_dataset)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [None]:
# Pull the three splits out of the DatasetDict under short names.
train, test, validation = (
    wiki_dataset[split_name] for split_name in ("train", "test", "validation")
)

In [6]:
def iterator_wiki(train, test, validation):
    """Yield cleaned, non-empty text lines from the three dataset splits.

    The splits are consumed in the order train, test, validation. Every row
    must support ``row.get("text")``; rows whose text is missing or is not a
    string are skipped.

    Cleaning applied to each text:
      * leading and trailing whitespace is stripped;
      * if the stripped text starts with "=", every "=" character is removed
        and the result is stripped again.

    A text that is empty after cleaning (for example a line made only of "="
    characters) is never yielded.

    Args:
        train: iterable of rows for the training split.
        test: iterable of rows for the test split.
        validation: iterable of rows for the validation split.

    Yields:
        str: the cleaned text of each row that still has content.
    """
    for split in (train, test, validation):
        for row in split:
            text = row.get("text", None)
            if not isinstance(text, str):  # the stored value may not be a string
                continue
            text = text.strip()
            if text.startswith("="):  # drop the '=' markers
                text = text.replace("=", "").strip()
            # Checked after the '=' cleanup so that a line consisting only of
            # '=' characters does not come out as an empty string.
            if text:
                yield text

In [94]:
# Pattern for an uppercase letter followed by lowercase letters. Single-column
# entries containing such a word are skipped (presumably headings in the raw
# list rather than emojis -- confirm against emoji_list.txt).
regex = r"([A-Z][a-z]+)"

emojis = []
# The file holds non-ASCII characters, so the encoding is stated explicitly
# instead of relying on the platform default.
with open("../../llm/resources/emoji_list.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.strip().split("\t")
        if len(fields) > 1:
            # Tab-separated line: the second column is kept.
            emoji = fields[1]
        else:
            emoji = fields[0]
            # Single-column line: skip purely numeric entries and entries
            # that contain a capitalised word.
            if emoji.isdigit() or re.search(regex, emoji):
                continue

        # Blank lines and empty second columns would otherwise be written out
        # as empty entries.
        if not emoji:
            continue

        emojis.append(emoji + "\n")

with open("../../llm/resources/emoji_list_ok.txt", "w", encoding="utf-8") as f:
    f.writelines(emojis)

In [87]:
# Read the filtered emoji list back, one entry per line.
# Each line is stripped because readlines() would keep the trailing "\n", and
# that newline would then become part of every string handed to the tokenizer.
# Blank lines are dropped for the same reason.
with open("../../llm/resources/emoji_list_ok.txt", "r", encoding="utf-8") as f:
    emojiz = [line.strip() for line in f if line.strip()]
print(len(emojiz))

2796


In [88]:
# Add the entries read from emoji_list_ok.txt to the tokenizer as extra tokens.
# The value returned by add_tokens is what this cell displays as its output.
tokenizer.add_tokens(emojiz)

0

In [89]:

# NOTE(review): this rebinds `trainer` without a vocab_size argument, so the
# trainer created in the setup cell (vocab_size = 20000) is not the one used
# for training here -- confirm which configuration is intended.
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

# Train on the cleaned lines of all three splits.
corpus = iterator_wiki(train, test, validation)
tokenizer.train_from_iterator(corpus, trainer=trainer)

# NOTE(review): absolute, machine-specific path; the emoji cells use the
# relative "../../llm/resources/" prefix -- consider unifying once it is
# confirmed that both refer to the same directory.
tokenizer.save("/home/baosiek/Projects/tchumyt-yofy/llm/resources/wiki_tokenizer.json")






In [90]:
# Load a tokenizer from the same JSON path that the training cell saves to.
# NOTE(review): absolute, machine-specific path; the emoji cells use the
# relative "../../llm/resources/" prefix -- confirm whether both refer to the
# same directory before unifying them.
tokenizer1 = Tokenizer.from_file("/home/baosiek/Projects/tchumyt-yofy/llm/resources/wiki_tokenizer.json")

In [100]:
# Encode a sample sentence mixing plain words, punctuation and emojis, then
# show the produced tokens and their ids on separate lines.
sample_text = "Hello, y'all! How are you 😎 😂 😀 😁 😈 all?"
output = tokenizer1.encode(sample_text)
print(output.tokens, output.ids, sep="\n")

['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '😎', '😂', '😀', '[UNK]', '[UNK]', 'all', '?']
[27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 30000, 30001, 30002, 0, 0, 5097, 35]


In [101]:
# Decode the ids from the encoding above back into a string.
# In the recorded output the two ids that were encoded as "[UNK]" do not
# appear in the decoded text, so the round trip is lossy for those emojis.
new_output = tokenizer1.decode(output.ids)
# Bare expression so the notebook displays the decoded string.
new_output

"Hello , y ' all ! How are you 😎 😂 😀 all ?"