In [1]:
import os
import sys
import re
import json
from datasets import (load_dataset, 
    load_from_disk,
    Dataset,
    DatasetDict,
    Value,
    Features
)

In [2]:
import torch
import random
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt

In [7]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from tqdm.notebook import tqdm

## Bert Tokenizer

In [12]:
# Load the fast tokenizer published as 'klue/roberta-large'.
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large', use_fast=True)

In [65]:
# Export the tokenizer files (vocab.txt, tokenizer.json, ...) so they can be edited on disk.
# NOTE(review): this is an absolute path while later cells read '../Tokenizer' —
# presumably the same directory; confirm the notebook's working directory.
tokenizer.save_pretrained('/opt/ml/project/final-project-level3-nlp-02/Tokenizer')

('/opt/ml/project/final-project-level3-nlp-02/Tokenizer/tokenizer_config.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/special_tokens_map.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/vocab.txt',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/added_tokens.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/tokenizer.json')

### Before

In [34]:
# Show the tokens at ids 31500 and 31999, the first and last ids of the
# 500-slot range that update_unused rewrites further down.
print('Index : 31500 \t Token : %s' %tokenizer.convert_ids_to_tokens(31500))
print('Index : 31999 \t Token : %s' %tokenizer.convert_ids_to_tokens(31999))

Index : 31500 	 Token : [unused0]
Index : 31999 	 Token : [unused499]


## Optimizing : Target Bert, Roberta

### Loading

In [38]:
def load_tokenizer(path) :
    """Read a tokenizer JSON file and return its parsed content.

    Args:
        path: Location of the JSON file (e.g. a saved ``tokenizer.json``).

    Returns:
        The deserialized JSON document.
    """
    # The vocabulary holds non-ASCII tokens, so decode as UTF-8 explicitly
    # instead of relying on the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

In [39]:
# Parsed tokenizer.json; update_tokenizer later replaces its ['model']['vocab'] entry.
tokenizer_data = load_tokenizer('../Tokenizer/tokenizer.json')

In [47]:
def load_vocab(path) :
    """Read a one-token-per-line vocabulary file into an ``{index: token}`` dict.

    Args:
        path: Location of the vocabulary text file (e.g. ``vocab.txt``).

    Returns:
        dict mapping the 0-based line number to the token found on that line.
    """
    # Decode as UTF-8 explicitly: the vocabulary contains non-ASCII tokens.
    with open(path, "r", encoding="utf-8") as f :
        lines = f.read().split('\n')

    # A file ending in a newline produces one trailing empty string; drop only
    # that, so a file WITHOUT a final newline does not lose its last token.
    if lines and lines[-1] == '' :
        lines.pop()

    return dict(enumerate(lines))

In [48]:
# id -> token mapping; ids are the 0-based line numbers of vocab.txt.
idx2vocab = load_vocab('../Tokenizer/vocab.txt')

### Updating

In [59]:
def update_unused(idx2vocab, ch_df, unused_start=31500, unused_size=500, subword_prefix='##') :
    """Overwrite a range of placeholder vocabulary slots with extra characters.

    Rows of ``ch_df`` are consumed in order. A row whose ``KoreanFlag`` is
    falsy takes one slot (the character). Any other row takes two consecutive
    slots: the character and ``subword_prefix + character``.

    Args:
        idx2vocab: ``{index: token}`` dict. It is modified in place.
        ch_df: DataFrame with ``Character`` and ``KoreanFlag`` columns.
        unused_start: Index of the first slot to overwrite.
        unused_size: Number of slots available from ``unused_start``.
        subword_prefix: Prefix used for the second token of a two-slot row.

    Returns:
        The same ``idx2vocab`` object that was passed in.

    Notes:
        * A two-slot row is skipped only when fewer than two slots remain, so
          a pair may fill exactly the last two slots.
        * Characters already present outside the slot range (or already placed
          during this call) are skipped, so no token is listed twice.
        * If the candidates run out early, the remaining slots keep their
          current tokens instead of raising an IndexError.
    """
    slot_end = unused_start + unused_size

    # Tokens sitting in the slots being rewritten are ignored on purpose:
    # running the function again must reproduce the same assignment.
    known_tokens = {
        token for idx, token in idx2vocab.items()
        if not (unused_start <= idx < slot_end)
    }

    size = 0
    for ch, flag in zip(ch_df['Character'], ch_df['KoreanFlag']) :
        if size >= unused_size :
            break
        if ch in known_tokens :
            continue

        needed = 2 if flag else 1
        if size + needed > unused_size :
            # Not enough room for this entry; a later one-slot entry may still fit.
            continue

        target_id = unused_start + size
        idx2vocab[target_id] = ch
        known_tokens.add(ch)
        if flag :
            idx2vocab[target_id + 1] = subword_prefix + ch
            known_tokens.add(subword_prefix + ch)
        size += needed

    return idx2vocab

In [50]:
# Candidate characters; update_unused reads the 'Character' and 'KoreanFlag' columns.
extra_df = pd.read_csv('../Tokenizer/extra_characters.csv')

In [61]:
# update_unused writes into the dict it receives and returns that same dict,
# so idx2vocab_after and idx2vocab refer to one object.
idx2vocab_after = update_unused(idx2vocab, extra_df)

### Writing

In [62]:
def update_vocab(idx2vocab, path) :
    """Write the vocabulary to ``path``, one token per line, in index order.

    Args:
        idx2vocab: ``{index: token}`` dict.
        path: Destination text file; an existing file is overwritten.
    """
    # `with` closes the file even if a write fails; UTF-8 is set explicitly
    # because the tokens are not limited to ASCII.
    with open(path, 'w', encoding='utf-8') as f :
        # Order by index rather than by dict insertion order, so the line
        # position of each token follows its id.
        for idx in sorted(idx2vocab) :
            f.write(idx2vocab[idx] + '\n')

In [63]:
# Overwrites the same vocab.txt that load_vocab read earlier.
update_vocab(idx2vocab_after, '../Tokenizer/vocab.txt')

In [66]:
def update_tokenizer(idx2vocab, tokenizer_data, path) :
    """Install the inverted vocabulary into ``tokenizer_data`` and save it as JSON.

    Args:
        idx2vocab: ``{index: token}`` dict.
        tokenizer_data: Parsed tokenizer JSON; its ``['model']['vocab']`` entry
            is replaced in place by the ``{token: index}`` mapping.
        path: Destination JSON file; an existing file is overwritten.
    """
    # Invert index -> token into token -> index (a later index wins on a repeated token).
    token_to_index = {}
    for index, token in idx2vocab.items() :
        token_to_index[token] = index

    tokenizer_data['model']['vocab'] = token_to_index

    with open(path, 'w') as out_file :
        json.dump(tokenizer_data, out_file)


In [67]:
# Overwrites the same tokenizer.json that load_tokenizer read earlier.
update_tokenizer(idx2vocab_after, tokenizer_data, '../Tokenizer/tokenizer.json')

## Results

In [69]:
# Reload the tokenizer from the directory whose vocab.txt / tokenizer.json were just rewritten.
new_tokenizer = AutoTokenizer.from_pretrained('../Tokenizer', use_fast=True)

In [70]:
# Spot-check both ends of the rewritten id range (31500 .. 31999).
print('Index : 31500 \t Token : %s' %new_tokenizer.convert_ids_to_tokens(31500))
# The label matches the id that is actually queried (it previously read 31599).
print('Index : 31999 \t Token : %s' %new_tokenizer.convert_ids_to_tokens(31999))

Index : 31500 	 Token : 乙
Index : 31599 	 Token : 舞


## Bart Tokenizer

In [139]:
# Rebinds `tokenizer` (previously the 'klue/roberta-large' tokenizer) to the
# fast tokenizer published as 'gogamza/kobart-summarization'.
tokenizer = AutoTokenizer.from_pretrained('gogamza/kobart-summarization', use_fast=True)

In [141]:
# Show the tokens at ids 7 and 106, the first and last ids of the 100-slot
# range that update_unused rewrites further down.
print('Index : 7 \t Token : %s' %tokenizer.convert_ids_to_tokens(7))
print('Index : 106 \t Token : %s' %tokenizer.convert_ids_to_tokens(106))

Index : 7 	 Token : <unused0>
Index : 106 	 Token : <unused99>


In [144]:
# Saved into the same directory as the RoBERTa tokenizer export above, so
# files with the same name (e.g. tokenizer.json) are replaced.
# NOTE(review): absolute path here vs. '../Tokenizer' in later cells — presumably the same directory; confirm.
tokenizer.save_pretrained('/opt/ml/project/final-project-level3-nlp-02/Tokenizer')

('/opt/ml/project/final-project-level3-nlp-02/Tokenizer/tokenizer_config.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/special_tokens_map.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/vocab.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/merges.txt',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/added_tokens.json',
 '/opt/ml/project/final-project-level3-nlp-02/Tokenizer/tokenizer.json')

In [161]:
def load_json(path) :
    """Read a JSON file and return its parsed content.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON document.
    """
    # The tokenizer/vocab files hold non-ASCII tokens, so decode as UTF-8
    # explicitly instead of relying on the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def write_json(tokenizer_data, path) :
    """Serialize ``tokenizer_data`` as JSON into ``path``.

    Args:
        tokenizer_data: JSON-serializable object to store.
        path: Destination file; an existing file is overwritten.
    """
    with open(path, 'w') as out_file :
        json.dump(tokenizer_data, out_file)

In [162]:
# Directory of the saved tokenizer, and the two files inside it that get rewritten.
dir_path = '../Tokenizer'
tokenizer_path = dir_path + '/tokenizer.json'
vocab_path = dir_path + '/vocab.json'

In [175]:
def update_unused(idx2vocab, ch_df, unused_start=7, unused_size=100, word_prefix='▁') :
    """Overwrite a range of placeholder vocabulary slots with extra characters.

    Rows of ``ch_df`` are consumed in order. A row whose ``KoreanFlag`` is
    falsy takes one slot (the character). Any other row takes two consecutive
    slots: the character and ``word_prefix + character``.

    NOTE: this definition replaces the earlier ``update_unused`` defined in
    this notebook, which used a different slot range and prefix.

    Args:
        idx2vocab: ``{index: token}`` dict. It is modified in place.
        ch_df: DataFrame with ``Character`` and ``KoreanFlag`` columns.
        unused_start: Index of the first slot to overwrite.
        unused_size: Number of slots available from ``unused_start``.
        word_prefix: Prefix used for the second token of a two-slot row.

    Returns:
        The same ``idx2vocab`` object that was passed in.

    Notes:
        * A two-slot row is skipped only when fewer than two slots remain, so
          a pair may fill exactly the last two slots.
        * Characters already present outside the slot range (or already placed
          during this call) are skipped. Tokens currently stored inside the
          slot range do not count, so a second run on an already-updated
          vocabulary reproduces the same assignment.
        * If the candidates run out early, the remaining slots keep their
          current tokens instead of raising an IndexError.
    """
    slot_end = unused_start + unused_size

    # Build the membership set once instead of scanning all values per row.
    known_tokens = {
        token for idx, token in idx2vocab.items()
        if not (unused_start <= idx < slot_end)
    }

    size = 0
    for ch, flag in zip(ch_df['Character'], ch_df['KoreanFlag']) :
        if size >= unused_size :
            break
        if ch in known_tokens :
            continue

        needed = 2 if flag else 1
        if size + needed > unused_size :
            # Not enough room for this entry; a later one-slot entry may still fit.
            continue

        target_id = unused_start + size
        idx2vocab[target_id] = ch
        known_tokens.add(ch)
        if flag :
            idx2vocab[target_id + 1] = word_prefix + ch
            known_tokens.add(word_prefix + ch)
        size += needed

    return idx2vocab

In [179]:
# optimizing tokenizer
def optimize(extra_ch_path) :
    """Fill the placeholder slots of the saved tokenizer and reload it.

    Reads the files at the notebook-level ``tokenizer_path`` and
    ``vocab_path``, rewrites the placeholder slots with characters from the
    CSV via ``update_unused``, writes both files back, and loads the tokenizer
    from ``dir_path``.

    Args:
        extra_ch_path: CSV file passed on to ``update_unused`` as a DataFrame.

    Returns:
        The tokenizer loaded from ``dir_path`` after the files were rewritten.
    """
    ch_df = pd.read_csv(extra_ch_path)

    # load tokenizer data
    tokenizer_data = load_json(tokenizer_path)
    vocab2idx = load_json(vocab_path)
    idx2vocab = {idx: vocab for vocab, idx in vocab2idx.items()}

    # update tokenizer
    idx2vocab = update_unused(idx2vocab, ch_df)
    vocab2idx = {vocab: idx for idx, vocab in idx2vocab.items()}
    tokenizer_data['model']['vocab'] = vocab2idx

    # Rename the added-token entries of the rewritten slots. The id range must
    # match the one update_unused fills (7 .. 106 in this notebook). Entries
    # and tokens are matched by token id, not by their position in the list
    # or in the vocab file, so the result does not depend on file ordering.
    unused_first, unused_last = 7, 106
    for entry in tokenizer_data['added_tokens'] :
        token_id = entry.get('id')
        if token_id is not None and unused_first <= token_id <= unused_last :
            entry['content'] = idx2vocab[token_id]

    # write tokenizer data
    write_json(vocab2idx, vocab_path)
    write_json(tokenizer_data, tokenizer_path)

    # load updated tokenizer
    tokenizer = AutoTokenizer.from_pretrained(dir_path, use_fast=True)
    return tokenizer

In [180]:
# Rewrites vocab.json / tokenizer.json under '../Tokenizer' and returns the
# tokenizer reloaded from that directory.
new_tokenizer = optimize('../Tokenizer/extra_characters.csv')

In [181]:
# Print the tokens at both ends of the rewritten id range (7 and 106).
print('Index : 7 \t Token : %s' %new_tokenizer.convert_ids_to_tokens(7))
print('Index : 106 \t Token : %s' %new_tokenizer.convert_ids_to_tokens(106))

Index : 7 	 Token : 룖
Index : 106 	 Token : 弇
