### Pretrained 가져오는 경우

In [5]:
from transformers import BertTokenizer
# bert-base-uncased & bert-base-cased

sentences = '경치가 너무 좋네요'

# 영어버전
tokenizer_eng = BertTokenizer.from_pretrained("bert-base-uncased")
# 영어
print(tokenizer_eng.tokenize("such a scene"))
# 한글
print(tokenizer_eng.tokenize(sentences))

['such', 'a', 'scene']
['ᄀ', '##ᅧ', '##ᆼ', '##ᄎ', '##ᅵ', '##ᄀ', '##ᅡ', 'ᄂ', '##ᅥ', '##ᄆ', '##ᅮ', '[UNK]']


In [4]:
# 한글버전
tokenizer_kor = BertTokenizer.from_pretrained("klue/bert-base")
# 영어
print(tokenizer_kor.tokenize("such a scene"))
# 한글
print(tokenizer_kor.tokenize(sentences))

['su', '##ch', 'a', 'sc', '##ene']
['경치', '##가', '너무', '좋', '##네', '##요']


### CLS 토큰 확인

In [7]:
sample_sentences = '경치가 너무 [CLS] 좋아요'
# 영어 tokenizer
print(tokenizer_eng.tokenize(sample_sentences))

# 한글 tokenizer
print(tokenizer_kor.tokenize(sample_sentences))

['ᄀ', '##ᅧ', '##ᆼ', '##ᄎ', '##ᅵ', '##ᄀ', '##ᅡ', 'ᄂ', '##ᅥ', '##ᄆ', '##ᅮ', '[CLS]', '[UNK]']
['경치', '##가', '너무', '[CLS]', '좋아', '##요']


### 새로 학습 시키는 Tokenizer

In [10]:
from tokenizers import BertWordPieceTokenizer

# Initialize an empty tokenizer
wp_tokenizer = BertWordPieceTokenizer(
    clean_text=True,   # ["이순신", "##은", " ", "조선"] ->  ["이순신", "##은", "조선"]
    # if char == " " or char == "\t" or char == "\n" or char == "\r":
    handle_chinese_chars=True,  # 한자는 모두 char 단위로 쪼게버립니다.
    strip_accents=False,    # True: [YehHamza] -> [Yep, Hamza]
    lowercase=False,    # Hello -> hello
)

# And then train
wp_tokenizer.train(
    files="./data/wiki_20190620_small.txt",
    vocab_size=20000,   # vocab size 를 지정해줄 수 있습니다.
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    wordpieces_prefix="##"
)

# Save the files
wp_tokenizer.save_model("./", "my_tokenizer")

['./my_tokenizer-vocab.txt']

In [20]:
vocab = []
with open('./my_tokenizer-vocab.txt', 'r') as f:
    vocabs = f.read().splitlines() 

In [22]:
vocabs[:8]

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '"', '#', '%']

In [26]:
# wp_tokenizer.tokenize(sample_sentences)
wp_tokenizer.encode(sample_sentences).tokens

['경', '##치가', '너무', '[CLS]', '좋아', '##요']

In [27]:
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast
trained_tokenizer = BertTokenizerFast(
    vocab_file='./my_tokenizer-vocab.txt',
    max_len=128,
    do_lower_case=False,
    )

In [29]:
trained_tokenizer.tokenize(sample_sentences)

['경', '##치가', '너무', '[', 'C', '##L', '##S', ']', '좋아', '##요']

In [31]:
trained_tokenizer2 = BertTokenizerFast(
    vocab_file='./my_tokenizer-vocab.txt',
    max_len=128,
    do_lower_case=False,
    add_special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )
trained_tokenizer2.tokenize(sample_sentences)

['경', '##치가', '너무', '[', 'C', '##L', '##S', ']', '좋아', '##요']