# **DACON-GAS**

## **Default Setting**

In [1]:
import torch
import transformers

import copy
import itertools
import json
import os
import pprint
import random

import numpy as np
import pandas as pd

from pathlib import Path

print("[VERSION]")
print(f"torch: {torch.__version__}")
print(f"transformers: {transformers.__version__}")

[VERSION]
torch: 1.9.0+cu111
transformers: 4.11.3


In [2]:
class HParams(object):
    def __init__(self):
        ## Path.
        self.data = Path("data") ## 문서요약 텍스트
        
        self.tr_data = self.data / Path("train")
        self.tr_law_data = self.tr_data / Path("tr_law_data.json")
        self.tr_journal_data = self.tr_data / Path("tr_journal_data.json")
        self.tr_article_data = self.tr_data / Path("tr_article_data.json")

        self.vl_data = self.tr_data / Path("validate")
        self.vl_law_data = self.vl_data / Path("vl_law_data.json")
        self.vl_journal_data = self.vl_data / Path("vl_journal_data.json")
        self.vl_article_data = self.vl_data / Path("vl_article_data.json")
        
        ## Dataloader
        self.seed = 42
        self.tokenizer_name = "beomi/KcELECTRA-base"
        
args = HParams()
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(vars(args))

{   'data': PosixPath('data'),
    'tr_data': PosixPath('data/train'),
    'tr_law_data': PosixPath('data/train/tr_law_data.json'),
    'tr_journal_data': PosixPath('data/train/tr_journal_data.json'),
    'tr_article_data': PosixPath('data/train/tr_article_data.json'),
    'vl_data': PosixPath('data/train/validate'),
    'vl_law_data': PosixPath('data/train/validate/vl_law_data.json'),
    'vl_journal_data': PosixPath('data/train/validate/vl_journal_data.json'),
    'vl_article_data': PosixPath('data/train/validate/vl_article_data.json'),
    'seed': 42,
    'tokenizer_name': 'beomi/KcELECTRA-base'}


In [3]:
!nvidia-smi; free -h

Sat Oct 30 20:43:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 470.74       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0A:00.0  On |                  N/A |
|  0%   38C    P8    18W / 220W |    276MiB /  7979MiB |      8%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Prepare Dataset**

### **Naming**

제공받은 데이터세트의 이름을 조금 변경했습니다.

In [5]:
# !tree -alh

### **Data Format**

참조: https://aihub.or.kr/aidata/8054

In [6]:
def print_sample(sample: dict):
    tmp = copy.deepcopy(sample)
    tmp["documents"] = [tmp["documents"][0], tmp["documents"][-1]]
    pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(tmp)

## Print sample.
with open(args.tr_law_data, "r") as f:
    sample = json.loads(f.read())
    
print_sample(sample)

{   'name': '법률문서 프로젝트',
    'delivery_date': '2020-12-23 17:23:13',
    'documents': [   {   'id': '100004',
                         'category': '일반행정',
                         'size': 'small',
                         'char_count': 377,
                         'publish_date': '19841226',
                         'title': '부당노동행위구제재심판정취소',
                         'text': [   [   {   'index': 0,
                                             'sentence': '원고가 소속회사의 노동조합에서 분규가 '
                                                         '발생하자 노조활동을 구실로 정상적인 '
                                                         '근무를 해태하고,',
                                             'highlight_indices': ''},
                                         {   'index': 1,
                                             'sentence': '노조조합장이 사임한 경우,',
                                             'highlight_indices': ''},
                                         {   'index': 2,
                                   

### **Fix Seed**

In [7]:
def seed_everything(seed: int = args.seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

### **DataLoader**

In [101]:
class TrainDataset(torch.utils.data.Dataset):
    
    def __init__(
        self, 
        data_path: Path, 
        vocab_size: int = 512,
        inp_pad_id: int = 0,
        cls_pad_id: int = -1,
        seg_pad_id: int = 0,
        debug: bool = True,
    ):
        self.documents = self._get_documents(data_path)
        self.vocab_size = vocab_size ## maximum embedding length
        self.inp_pad_id = inp_pad_id
        self.cls_pad_id = cls_pad_id
        self.seg_pad_id = seg_pad_id
        self.debug = debug
        
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_name)
        self.data = self._data_loader(self.documents)
        
        
    def _get_documents(self, data_path: Path) -> list:
        documents = []
        for file_path in sorted(list(data_path.glob("*.json"))):
            with open(file_path, "r", encoding="utf-8") as f:
                document = json.loads(f.read()) ## list type -> not append, but extent
            documents.extend(document["documents"])

        return documents
        

    def _data_loader(self, documents: list) -> dict:
        ## Empty dictionary.
        data = {"inp": [], "tar_ext": [], "tar_abs": []}
        
        for document in documents:
            ## Maybe None in extractive sentence.
            if None in document["extractive"]:
                continue
                
            ## Elements.
            inp = [sentence["sentence"] for sentence in itertools.chain(*document["text"])]
            tar_ext = document["extractive"]
            tar_abs = document["abstractive"][0]
            
            ## Insert.
            data["inp"].append(inp)
            data["tar_ext"].append(tar_ext)
            data["tar_abs"].append(tar_abs)
        
            ## In development stage, we limit the maximum document size to 10 for fast experiments.
            if self.debug and len(data["inp"]) >= 10:
                break
    
        ## Convert to dataframe.
        df = pd.DataFrame(data)

        ## Encoding.
        df["inp"] = self._tokenize(df["inp"])
        
        ## Generate 'cls' s.t. means the index of tokens.
        df["cls"] = df["inp"].map(lambda x: np.concatenate([np.where(np.array(x) == self.tokenizer.cls_token_id)[0], [len(x)]]))
        
        ## Generate 'seg' s.t. means segmentation embeddings which represented as [0, 0, ..., 0, 1, ..., 1, 0, 0, ...].
        df["seg"] = df["cls"].map(lambda x: list(itertools.starmap(lambda x, y: [x] * y, zip(np.arange(len(np.diff(x))) % 2, np.diff(x)))))
        df["seg"] = df["seg"].map(lambda x: np.array(list(itertools.chain.from_iterable(x))))
        
        ## Drop the last token in cls.
        df["cls"] = df["cls"].map(lambda x: x[:-1])
        
        ## Padding.
        self.max_inp_len = max(df["inp"].map(lambda x: len(x)))
        self.max_cls_len = max(df["cls"].map(lambda x: len(x)))
        
        df["inp"] = self._pad(df["inp"], self.inp_pad_id, self.max_inp_len) ## 0
        df["cls"] = self._pad(df["cls"], self.cls_pad_id, self.max_cls_len) ## -1
        df["seg"] = self._pad(df["seg"], self.seg_pad_id, self.max_inp_len) ## 0
        df["msk"] = df["inp"].map(lambda x: ~(x == self.inp_pad_id))
        df["msk_cls"] = df["cls"].map(lambda x: ~(x == self.cls_pad_id))
        
        ## One hot label.
        df["tar_ext"] = df["tar_ext"].map(lambda x: self._one_hot_encoding(x))
        df["tar_abs"] = self._tokenize(df["tar_abs"])
        
        return df
    
    
    def _tokenize(self, data: pd.Series, truncation: bool = True, add_special_tokens: bool = True) -> pd.Series:
        ## Tokenize input and abstractive target.
        return data.map(lambda x: list(itertools.chain.from_iterable([self.tokenizer.encode(
            x[i], max_length=int(self.vocab_size / len(x)), truncation=truncation, add_special_tokens=add_special_tokens,
        ) for i in range(len(x))])))
        

    def _pad(self, data: pd.Series, pad_id: int, max_len: int) -> pd.Series:
        return data.map(lambda x: np.concatenate([x, np.array([pad_id] * (max_len - len(x)))]))
    
    
    def _one_hot_encoding(self, tar: list) -> list:
        return np.sum(np.eye(self.max_cls_len)[np.array(tar), :], axis=0)
    
        
    def __len__(self) -> int:
        return len(self.data["inp"])

    
    def __getitem__(self, idx) -> dict:
        return {key: value[idx] for key, value in self.data.items()}

In [102]:
aa = TrainDataset(args.tr_data)

In [103]:
aa.data

Unnamed: 0,inp,tar_ext,tar_abs,cls,seg,msk,msk_cls
0,"[2, 37224, 4130, 8268, 4160, 15977, 8303, 2505...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 2822, 3, 2, 1211, 3, 2, 602, 3, 2, 920, 3,...","[0, 13, 28, 53, 88, 127, 160, 199, 238, 277, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
1,"[2, 28, 4577, 13711, 16, 9155, 42716, 4982, 41...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 2540, 3, 2, 2153, 3, 2, 2233, 3, 2, 780, 3...","[0, 16, 31, 58, 93, 127, 147, 178, 211, 237, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
2,"[2.0, 14810.0, 40353.0, 4721.0, 45656.0, 30141...","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...","[2, 2822, 3, 2, 602, 3, 2, 1015, 3, 2, 1231, 3...","[0.0, 11.0, 26.0, 53.0, 81.0, 109.0, 137.0, 16...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
3,"[2, 10512, 96, 12556, 4129, 16, 39498, 97, 158...","[0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[2, 291, 3, 2, 2484, 3, 2, 2233, 3, 2, 780, 3,...","[0, 20, 35, 70, 106, 138, 177, 206, 242, 293, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
4,"[2, 10698, 29964, 4306, 4029, 25046, 36150, 39...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[2, 2576, 3, 2, 3800, 3, 2, 3, 2, 24, 3, 2, 26...","[0, 16, 31, 51, 90, 128, 156, 188, 226, 260, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
5,"[2, 8500, 4028, 24505, 11794, 1031, 38244, 406...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[2, 291, 3, 2, 2484, 3, 2, 2233, 3, 2, 780, 3,...","[0, 13, 28, 58, 103, 140, 170, 213, 248, 294, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
6,"[2, 13643, 8104, 32270, 158, 4151, 16, 15818, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[2, 21, 3, 2, 2662, 3, 2, 3, 2, 21, 3, 2, 2748...","[0, 10, 25, 47, 82, 124, 166, 195, 237, 257, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
7,"[2, 8690, 9033, 4258, 37417, 4876, 12122, 16, ...","[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[2, 291, 3, 2, 2484, 3, 2, 304, 3, 2, 2692, 3,...","[0, 15, 30, 69, 98, 137, 171, 195, 218, 244, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
8,"[2, 19053, 9782, 163, 4628, 4570, 2822, 16, 20...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...","[2, 1419, 3, 2, 282, 3, 2, 3, 2, 3971, 3, 2, 2...","[0, 21, 36, 54, 68, 98, 109, 139, 169, 199, 22...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
9,"[2, 24, 4010, 20578, 16, 20730, 4169, 4227, 27...","[0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[2, 291, 3, 2, 2484, 3, 2, 2233, 3, 2, 780, 3,...","[0, 15, 30, 52, 90, 129, 171, 213, 238, 279, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."


array([  0,  13,  28,  53,  88, 127, 160, 199, 238, 277, 295, 325, 364,
       415])

In [85]:
len(aa.data.loc[0, "inp"])

415

In [75]:
aa.max_tar_len

AttributeError: 'TrainDataset' object has no attribute 'max_tar_len'

In [13]:
class TrainDataset(torch.utils.data.Dataset):
    
    def __init__(
        self, 
        data_path: Path, 
        vocab_size: int = 512,
        inp_pad_id: int = 0,
        cls_pad_id: int = -1,
        seg_pad_id: int = 0,
        debug: bool = True,
    ):
        self.documents = self.get_documents(data_path)
        self.vocab_size = vocab_size ## maximum embedding length
        self.inp_pad_id = inp_pad_id
        self.cls_pad_id = cls_pad_id
        self.seg_pad_id = seg_pad_id
        self.debug = debug
        
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_name)
        self.data = self.data_loader(self.documents)
        
        
    def get_documents(self, data_path: Path) -> list:
        documents = []
        for file_path in sorted(list(data_path.glob("*.json"))):
            with open(file_path, "r", encoding="utf-8") as f:
                document = json.loads(f.read()) ## list type -> not append, but extent
            documents.extend(document["documents"])

        return documents
        

    def data_loader(self, documents: list) -> dict:
        data = {"inp": [], "cls": [], "seg": [], "msk": [], "msk_cls": [], "tar_ext": [], "tar_abs": []}
        for document in documents:
            ## Maybe None in extractive sentence.
            if None in document["extractive"]:
                continue
                
            ## Input.
            inp = [sentence["sentence"] for sentence in itertools.chain(*document["text"])]
            inp, cls, seg = self.tokenizing(inp)
            
            ## Extractive target.
            tar_ext = document["extractive"]
            
            ## Abstractive target.
            tar_abs = document["abstractive"][0] ## flatten
            tar_abs, _, _ = self.tokenizing(tar_abs)
            
            ## Append.
            data["inp"].append(inp)
            data["cls"].append(cls)
            data["seg"].append(seg)
            data["tar_ext"].append(tar_ext)
            data["tar_abs"].append(tar_abs)
            
            ## In development stage, we limit the maximum document size to 10 for fast experiments.
            if self.debug and len(data["inp"]) >= 10:
                break
            
        ## Calculate max length.
        max_inp_len = max([len(i) for i in data["inp"]]) ## max(#(vocabs))
        max_cls_len = max([len(i) for i in data["cls"]]) ## max(#(sentences))

        ## Apply padding and calculate masking.
        for i in range(len(data["inp"])):
            inp, seg, cls, msk, msk_cls = self.padding(data["inp"][i], data["seg"][i], data["cls"][i], max_inp_len, max_cls_len)
            tar_ext = self.one_hot_encoding(data["tar_ext"][i], max_cls_len)
            
            ## Append or modify.
            data["inp"][i] = inp
            data["seg"][i] = seg
            data["cls"][i] = cls
            
            data["msk"].append(msk)
            data["msk_cls"].append(msk_cls)
            
            data["tar_ext"][i] = tar_ext
            
        return data
    
        
    def cleaning(self, sentence: str) -> str:
        pass
            
            
    def tokenizing(self, inp: list) -> tuple:
        """ 각 sentence 별로 tokenizer를 이용한 인코딩 후 cls 및 seg 값을 계산합니다.
        """
        ## Encode the inputs per each sentences and concat all.
        inp = [self.tokenizer.encode(i, max_length=int(self.vocab_size / len(inp)), truncation=True, add_special_tokens=True) for i in inp]
        inp = np.array(list(itertools.chain.from_iterable(inp)))
        
        ## Generate 'cls' s.t. means the index of tokens.
        cls = np.concatenate([np.where(inp == self.tokenizer.cls_token_id)[0], [len(inp)]])
        
        ## Generate 'seg' s.t. means segmentation embeddings which represented as [0, 0, ..., 0, 1, ..., 1, 0, 0, ...].
        seg = list(itertools.starmap(lambda x, y: [x] * y, zip(np.arange(len(np.diff(cls))) % 2, np.diff(cls))))
        seg = np.array(list(itertools.chain.from_iterable(seg)))
        
        cls = cls[:-1]
        
        return inp, cls, seg
    
    
    def padding(self, inp: list, seg: list, cls: list, max_inp_len: int, max_cls_len: int) -> tuple:
        """ 각 sentence 별로 정해진 길이에 맞추어 inp, cls, seg 값을 패딩 처리합니다.
        """
        ## Pad all.
        inp = np.concatenate([inp, np.array([self.inp_pad_id] * (max_inp_len - len(inp)))])
        seg = np.concatenate([seg, np.array([self.seg_pad_id] * (max_inp_len - len(seg)))])
        cls = np.concatenate([cls, np.array([self.cls_pad_id] * (max_cls_len - len(cls)))])
        
        ## Generate msking value.
        msk = ~(np.where(inp == self.inp_pad_id)[0])
        msk_cls = ~(np.where(cls == self.cls_pad_id)[0])
       
        return inp, seg, cls, msk, msk_cls
    
    
    def one_hot_encoding(self, tar: list, max_cls_len: int) -> list:
        return np.sum(np.eye(max_cls_len)[np.array(tar), :], axis=0)
        
        
    def __len__(self) -> int:
        """ 데이터 세트의 전체 크기(길이)를 반환
        """
        return len(self.data["inp"])

    
    def __getitem__(self, idx) -> dict:
        """ idx 번째 샘플을 반환
        """
        return {key: value[idx] for key, value in self.data.items()}

In [14]:
ds = TrainDataset(args.tr_data)

[    2 37224  4130  8268  4160 15977  8303 25053 34750  4847 34455  8458
     3     2 10077  4595    85 33038  4611  4091    34 37224  4094 35498
 27398    18 37197     3     2 35320  4050  2267 19459 21578 11526  8077
  8646  8270  8366 11123  8045 11633  7974  2267 10008 19894 10715 10487
 11373 19292  9241    18     3     2  2267 10008 19894 12535  1707  4023
  2248 10667   699  4063  1707 10124 22912  4083  9365  4075  3427  1031
  8139 34750  4053  2248  7991  1707 18695 18426  9668 13323 29018 11854
 12581  4058    18     3     2 11123 14810  4041   699  8139 34750 28819
 10809 34461  4192  9849    25  4220 44787  4041  2471 11559     9  2744
 23048 28306  4449 44787  4096    16 28366  4019  4269  4020 20251 15516
    16 36351  4063 28125  8265    18     3     2  8458 11358 11929  4192
 22856  4182  2267 27941  4439  4086 10471  9864 33392  4063  8229  4600
    12 10557  4201    13  8173  1707 19909  8139 34750  4053 28819  4069
 27768 19714    18     3     2  8458 11929 34750  4

In [10]:
ds[0]

{'inp': array([    2, 37224,  4130,  8268,  4160, 15977,  8303, 25053, 34750,
         4847, 34455,  8458,     3,     2, 10077,  4595,    85, 33038,
         4611,  4091,    34, 37224,  4094, 35498, 27398,    18, 37197,
            3,     2, 35320,  4050,  2267, 19459, 21578, 11526,  8077,
         8646,  8270,  8366, 11123,  8045, 11633,  7974,  2267, 10008,
        19894, 10715, 10487, 11373, 19292,  9241,    18,     3,     2,
         2267, 10008, 19894, 12535,  1707,  4023,  2248, 10667,   699,
         4063,  1707, 10124, 22912,  4083,  9365,  4075,  3427,  1031,
         8139, 34750,  4053,  2248,  7991,  1707, 18695, 18426,  9668,
        13323, 29018, 11854, 12581,  4058,    18,     3,     2, 11123,
        14810,  4041,   699,  8139, 34750, 28819, 10809, 34461,  4192,
         9849,    25,  4220, 44787,  4041,  2471, 11559,     9,  2744,
        23048, 28306,  4449, 44787,  4096,    16, 28366,  4019,  4269,
         4020, 20251, 15516,    16, 36351,  4063, 28125,  8265,    18,

In [41]:
documents = []
for file_path in sorted(list(args.tr_data.glob("*.json"))):
    with open(file_path, "r", encoding="utf-8") as f:
        document = json.loads(f.read()) ## list type -> not append, but extent
    documents.extend(document["documents"])

In [42]:
len(documents)

361193

In [51]:
n = 0

for i, d in enumerate(documents):
    if None in d["extractive"]:
        print(d)
        print("\n\n\n")

{'id': '356944586', 'category': '종합', 'media_type': 'online', 'media_sub_type': '지역지', 'media_name': '기호일보', 'size': 'small', 'char_count': '886', 'publish_date': '2019-08-21 00:00:00', 'title': '2019 가평혁신교육지구 사제동행 역사·독서 기행 해외 답사', 'text': [[{'index': 0, 'sentence': '가평교육지원청과 가평군은 가평혁신교육지구 사업으로 사제동행 역사·독서 기행 해외 답사 프로그램으로 ‘가평의 아이들 윤동주, 이회영, 안중근을 만나다’ 프로그램을 지난 13일부터 17일까지 중국 옌지(延吉), 퉁화(通化), 단둥(丹東), 뤼순(旅順), 다롄(大連) 등지에서 4박 5일 동안 진행했다고 20일 밝혔다.', 'highlight_indices': ''}], [{'index': 1, 'sentence': '이번 프로그램은 관내 중고등학교의 공모를 거쳐 조종중·고등학교 학생 39명과 인솔 교사 6명, 학교와 교육지원청 관계자 3명으로 진행됐다.', 'highlight_indices': ''}], [{'index': 2, 'sentence': '가평교육지원청과 조종중·고등학교는 국어 역사 지리 음악 과목 교사들이 협력해 교과 융합 프로젝트수업을 구성했고 지난 6월부터 학교에서 독립운동 관련 독서활동과 독립운동 관련 교과융합 프로젝트를 진행했다.', 'highlight_indices': ''}], [{'index': 3, 'sentence': '국내 독립운동 유적답사(윤동주·김구·윤봉길·이봉창)를 진행하면서 일제강점기 빼앗긴 조국을 되찾기 위해 헌신하신 많은 독립지사들의 정신을 배우려고 노력해왔다.', 'highlight_indices': ''}], [{'index': 4, 'sentence': '특히 13일부터 진행하는 중국독립운동 유적 답사는 조종중고 교사들이 직접 답사 자료집을 제작해

In [50]:
n

8

In [40]:
aa

[None, 2, 1]

In [None]:
# tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_name)

In [32]:
np.where(aa == tokenizer.cls_token_id)[0]

array([  0,  13,  28,  53,  88, 127, 160, 199, 238, 277, 295, 325, 364])

In [26]:
cls = np.concatenate([np.where(aa == tokenizer.cls_token_id)[0], len(aa)])
cls

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 0 dimension(s)