# **DACON-GAS**

## **Default Setting**

In [122]:
import torch
import transformers

import copy
import itertools
import json
import os
import pprint
import random

import numpy as np
import pandas as pd

from pathlib import Path

print("[VERSION]")
print(f"torch: {torch.__version__}")
print(f"transformers: {transformers.__version__}")

[VERSION]
torch: 1.9.0+cu111
transformers: 4.11.3


In [25]:
class HParams(object):
    def __init__(self):
        ## Path.
        self.data = Path("data") ## 문서요약 텍스트
        
        self.tr_data = self.data / Path("train")
        self.tr_law_data = self.tr_data / Path("tr_law_data.json")
        self.tr_journal_data = self.tr_data / Path("tr_journal_data.json")
        self.tr_article_data = self.tr_data / Path("tr_article_data.json")

        self.vl_data = self.tr_data / Path("validate")
        self.vl_law_data = self.vl_data / Path("vl_law_data.json")
        self.vl_journal_data = self.vl_data / Path("vl_journal_data.json")
        self.vl_article_data = self.vl_data / Path("vl_article_data.json")
        
args = HParams()
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(vars(args))

{   'data': PosixPath('data'),
    'tr_data': PosixPath('data/train'),
    'tr_law_data': PosixPath('data/train/tr_law_data.json'),
    'tr_journal_data': PosixPath('data/train/tr_journal_data.json'),
    'tr_article_data': PosixPath('data/train/tr_article_data.json'),
    'vl_data': PosixPath('data/train/validate'),
    'vl_law_data': PosixPath('data/train/validate/vl_law_data.json'),
    'vl_journal_data': PosixPath('data/train/validate/vl_journal_data.json'),
    'vl_article_data': PosixPath('data/train/validate/vl_article_data.json')}


In [18]:
!nvidia-smi; free -h

Thu Oct 21 08:08:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 470.74       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0A:00.0 Off |                  N/A |
|  0%   36C    P8    23W / 220W |     68MiB /  7979MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Prepare Dataset**

### **Naming**

제공받은 데이터세트의 이름을 조금 변경했습니다.

In [16]:
!tree -alh

[01;34m.[00m
├── [4.0K]  [01;34mdata[00m
│   ├── [4.0K]  [01;34mtrain[00m
│   │   ├── [1.2G]  tr_article_data.json
│   │   ├── [346M]  tr_journal_data.json
│   │   └── [ 90M]  tr_law_data.json
│   └── [4.0K]  [01;34mvalidate[00m
│       ├── [140M]  vl_article_data.json
│       ├── [ 35M]  vl_journal_data.json
│       └── [8.5M]  vl_law_data.json
└── [4.7K]  main.ipynb

3 directories, 7 files


### **Data Format**

참조: https://aihub.or.kr/aidata/8054

In [48]:
def print_sample(sample: dict):
    tmp = copy.deepcopy(sample)
    tmp["documents"] = [tmp["documents"][0], tmp["documents"][-1]]
    pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(tmp)

## Print sample.
with open(args.tr_law_data, "r") as f:
    sample = json.loads(f.read())
    
print_sample(sample)

{   'name': '법률문서 프로젝트',
    'delivery_date': '2020-12-23 17:23:13',
    'documents': [   {   'id': '100004',
                         'category': '일반행정',
                         'size': 'small',
                         'char_count': 377,
                         'publish_date': '19841226',
                         'title': '부당노동행위구제재심판정취소',
                         'text': [   [   {   'index': 0,
                                             'sentence': '원고가 소속회사의 노동조합에서 분규가 '
                                                         '발생하자 노조활동을 구실로 정상적인 '
                                                         '근무를 해태하고,',
                                             'highlight_indices': ''},
                                         {   'index': 1,
                                             'sentence': '노조조합장이 사임한 경우,',
                                             'highlight_indices': ''},
                                         {   'index': 2,
                                   

### **Fix Seed**

In [None]:
def seed_everything(seed: int = args.seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

### **DataLoader**

In [59]:
tokenizer = transformers.AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

In [60]:
tokenizer.cls_token

'[CLS]'

In [120]:
inp = [
    "치료경과, 피해자의 연령과 직업 및 건강상태 등 제반 사정을 고려하여 합리적으로 판단할 수 있다.",
    "[2] 교통사고 피해자의 기왕증 등이 손해 확대에 기여한 부분이 있음에도,",
    "입원치료기간 중의 일실수입을 산정함에 있어 이를 참작하지 않은 원심판결을 파기한 사례.",
]

## Encode the inputs per each sentences and concat all.
inp = [tokenizer.encode(i, max_length=int(512 / len(inp)), truncation=True, add_special_tokens=True) for i in inp]
inp = np.array(list(itertools.chain.from_iterable(inp)))

## Generate 'cls' s.t. means the index of tokens.
cls = np.concatenate([np.where(inp == tokenizer.cls_token_id)[0], [len(inp)]])

seg = list(itertools.starmap(lambda x, y: [x] * y, zip(np.arange(len(np.diff(cls))) % 2, np.diff(cls))))

# seg = [[0] * (cls[i+1] - cls[i]) if i % 2 == 0 else [1] * (cls[i+1] - cls[i]) for i, _ in enumerate(cls[:-1])]

In [121]:
[len(i) for i in seg]

[26, 19, 24]

In [102]:
[len(i) for i in seg]

[0, 26, 45, 69]

In [98]:
np.diff(cls)

array([26, 19, 24])

In [84]:
np.where(bb == tokenizer.cls_token_id)[0]

array([0])

In [82]:
tokenizer.cls_token_id

2

In [75]:
np.where(bb == 2)

(array([], dtype=int64),)

In [78]:
aa = np.array([1, 2, 3])
np.where(aa == 2)

(array([1]),)

In [134]:
aa = np.array([2, 3])
np.sum(np.eye(5)[aa, :], axis=0)

array([0., 0., 1., 1., 0.])

In [None]:
class TrainDataset(torch.utils.data.Dataset):
    
    def __init__(
        self, 
        data_path: Path, 
        vocab_size: int = 512,
        inp_pad_id: int = 0,
        cls_pad_id: int = -1,
        seg_pad_id: int = 0,
    ):
        self.documents = get_documents(data_path)
        self.vocab_size = vocab_size ## maximum embedding length
        self.inp_pad_id = inp_pad_id
        self.cls_pad_id = cls_pad_id
        self.seg_pad_id = seg_pad_id
        
        self.data = self.data_loader(documents)
        
        
    def get_documents(self, data_path: Path) -> list:
        documents = []
        for file_path in sorted(list(data_path.glob("*.json"))):
            with open(file_path, "r", encoding="utf-8") as f:
                document = json.loads(f.read()) ## list type -> not append, but extent
            documents.extend(document["documents"])

        return documents
        

    def data_loader(self, documents: list) -> dict:
        data = {"inp": [], "cls": [], "seg": [], "msk": [], "msk_cls": [], "tar_ext": [], "tar_abs": []}
        for document in documents:
            ## Input.
            inp = [sentence["sentence"] for sentence in itertools.chain(*document["text"])]
            inp, cls, seg = self.tokenizing(inp)
            
            ## Extractive target.
            tar_ext = document["extractive"]
            
            ## Abstractive target.
            tar_abs = document["abstractive"][0] ## flatten
            
            ## Append.
            data["inp"].append(inp)
            data["cls"].append(cls)
            data["seg"].append(seg)
            data["tar_ext"].append(tar_ext)
            data["tar_abs"].append(tar_abs)
            
        ## Calculate max length.
        max_inp_len = max([len(i) for i in data["inp"]]) ## max(#(vocabs))
        max_cls_len = max([len(i) for i in data["cls"]]) ## max(#(sentences))

        ## Apply padding and calculate masking.
        for i in range(len(data["inp"])):
            inp, seg, cls, msk, msk_cls = self.padding(data["inp"][i], data["seg"][i], data["cls"][i], max_inp_len, max_cls_len)
            tar_ext = self.one_hot_encoding(data["tar"][i], max_cls_len)
            
            ## Append or modify.
            data["inp"][i] = inp
            data["seg"][i] = seg
            data["cls"][i] = cls
            
            data["msk"].append(msk)
            data["msk_cls"].append(msk_cls)
            
            data["tar"][i] = tar
            
        return data
    
        
    def cleaning(self, sentence: str) -> str:
        pass
            
            
    def tokenizing(self, inp: list) -> tuple:
        """ 각 sentence 별로 tokenizer를 이용한 인코딩 후 cls 및 seg 값을 계산합니다.
        """
        ## Encode the inputs per each sentences and concat all.
        inp = [self.tokenizer.encode(i, max_length=int(self.vocab_size / len(inp)), truncation=True, add_special_tokens=True) for i in inp]
        inp = np.array(list(itertools.chain.fron_iterable(inp)))
        
        ## Generate 'cls' s.t. means the index of tokens.
        cls = np.concatenate([np.where(i == self.tokenizer.cls_token_id)[0], len(inp)])
        
        ## Generate 'seg' s.t. means segmentation embeddings which represented as [0, 0, ..., 0, 1, ..., 1, 0, 0, ...].
        seg = list(itertools.starmap(lambda x, y: [x] * y, zip(np.arange(len(np.diff(cls))) % 2, np.diff(cls))))
        seg = np.array(list(itertools.chain.from_iterable(seg)))
        
        cls = cls[:-1]
        
        return inp, cls, seg
    
    
    def padding(self, inp: list, seg: list, cls: list, max_inp_len: int, max_cls_len: int) -> tuple:
        """ 각 sentence 별로 정해진 길이에 맞추어 inp, cls, seg 값을 패딩 처리합니다.
        """
        ## Pad all.
        inp = np.concatenate([inp, np.array([self.inp_pad_id] * (max_inp_len - len(inp)))])
        seg = np.concatenate([seg, np.array([self.seg_pad_id] * (max_inp_len - len(seg)))])
        cls = np.concatenate([cls, np.array([self.cls_pad_id] * (max_cls_len - len(cls)))])
        
        ## Generate msking value.
        msk = ~(np.where(inp == self.inp_pad_id)[0])
        msk_cls = ~(np.where(cls == self.cls_pad_id)[0])
        
        return inp, seg, cls, msk, msk_cls
    
    
    def one_hot_encoding(self, tar: list, max_cls_len: int) -> list:
        return np.sum(np.eye(max_cls_len)[np.array(tar), :], axis=0)
        
        
    def __len__(self):
        """ 데이터 세트의 전체 크기(길이)를 반환
        """
        return len(self.data["inp"])

    
    def __getitem__(self, idx):
        """ idx 번째 샘플을 반환
        """
        return {
            "inp": self.data["inp"][idx],
            "seg": self.data["seg"][idx],
            "cls": self.data["cls"][idx],
            "msk": self.data["msk"][idx],
            "msk_cls": self.data["msk_cls"][idx],
            "tar_ext": self.data["tar_ext"]
        }
        
        return [self.inputs[idx][i] for i in range(5)], self.labels[idx]