# 패키지 설치

In [None]:
#@title
# !git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd Mecab-ko-for-Google-Colab
# !bash install_mecab-ko_on_colab190912.sh

!pip install konlpy transformers Korpora sentencepiece torchdatasets

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.6 MB/s 
[?25hCollecting Korpora
  Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 7.5 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 46.1 MB/s 
[?25hCollecting torchdatasets
  Downloading torchdatasets-0.2.0-py3-none-any.whl (29 kB)
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 60.8 MB/s 
[?25hCollecting beautifulsoup4==4.6.

# 기본 패키지 로드

In [None]:
import os, sys, io
from google.colab import drive

from easydict import EasyDict
from pathlib import Path

import math
import random
import time
import re

import csv

from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim

import torchdatasets as tds

from Korpora import Korpora
from Korpora import ModuNewsKorpus
import transformers
from transformers import BertModel, DistilBertModel

import konlpy
from collections import Counter

from cProfile import Profile
from pstats import Stats

from sklearn.utils.class_weight import compute_class_weight

# 기본 설정

In [None]:
# Central experiment configuration, stored in an EasyDict for attribute access.
# Reference (EasyDict / dict assignment question):
# https://stackoverflow.com/questions/60002128/failed-to-assign-a-dict-with-key-of-type-int-to-easydict
C = EasyDict()

C.gdrive = {}
# Google Drive mount point
C.gdrive.mount_path = Path('/gdrive')
# Project root directory inside Google Drive
C.gdrive.root = (C.gdrive.mount_path / 'My Drive') / 'cs492i'

C.corpus = {}
# Use the small corpus
C.corpus.use_smallset = True

C.dataset = {}
# Fraction of the dataset used for training (the rest is validation)
C.dataset.split_train = 0.8
# Dataset batch size
C.dataset.batch_size = 64
# Use caching?
C.dataset.use_cache = True
# Whether the dataset comes from Korpora (True) or from a CSV file (False)
C.dataset.use_korpora = True
# CSV file name (relative to root)
C.dataset.csv_file = 'CleanTextNT.csv'

C.torch = {}
# Fixed torch seed (None to leave the RNG unseeded)
C.torch.manual_seed = 1234
# Use the GPU when available
C.torch.use_gpu = True

C.train = {}
# Save automatically while training?
C.train.autosave = True
# Directory where checkpoints are stored
C.train.checkpoint_path = C.gdrive.root / 'nBTT_20_ckpts'
# Maximum number of training epochs
C.train.epochs = 25
# Log every N batches (None disables per-batch logging)
C.train.log_batch_interval = 100
# Also print a sample label/prediction when logging a batch?
C.train.log_batch_sample = True


# 구글 드라이브 마운트

In [None]:
# Mount Google Drive at the configured mount path
drive.mount(str(C.gdrive.mount_path))


Mounted at /gdrive


# 설정 적용 및 잡일

In [None]:
# Create the checkpoint directory (including parents); no error if it already exists
C.train.checkpoint_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Seed the torch RNGs (CPU and CUDA) when a seed is configured
if C.torch.manual_seed is not None:
    torch.manual_seed(C.torch.manual_seed)
    torch.cuda.manual_seed(C.torch.manual_seed)
    print('Using torch manual seed:', C.torch.manual_seed)

# Use CUDA only when it is both available and enabled in the config; otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() and C.torch.use_gpu else "cpu")
print('Using device:', device.type)

Using torch manual seed: 1234
Using device: cuda


In [None]:
%cd $C.gdrive.root
%pwd

/gdrive/.shortcut-targets-by-id/1uCYxyZyGrrk_QuM7N7v-PYCAmnJ6SKlW/cs492i


'/gdrive/.shortcut-targets-by-id/1uCYxyZyGrrk_QuM7N7v-PYCAmnJ6SKlW/cs492i'

In [None]:
# Module-level cProfile profiler; autoprofile() runs functions under it
profiler = Profile()

# 드라이브에서 파일 임포트

In [None]:
# load .py files from google drive
from tokenization_kobert import KoBertTokenizer

# 유틸, 데이터셋 및 변환, 토크나이저 및 버트 모델 초기화

In [None]:
def autoprofile(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` under the module-level profiler and print stats.

    Prints the profile sorted by cumulative time, a separator line, and then
    the callers table.

    Args:
        func (callable): Function to profile.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returned (previously the result was discarded).
    """
    # Profile.runcall hands back the wrapped call's return value; keep it.
    result = profiler.runcall(func, *args, **kwargs)
    stats = Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats('cumulative')
    stats.print_stats()
    print('=======================')
    stats.print_callers()
    return result

In [None]:
# data also contains korpora index for debugging?
class NewsDateDatasetFromKorpora(tds.Dataset):
    """News with Date dataset backed by the ``train`` split of a Korpora corpus."""

    def __init__(self, corpus, news_transform=None, date_transform=None):
        """
        Args:
            corpus (Corpus): Korpora corpus with news object. Items of
                ``corpus.train`` must expose ``paragraph`` and ``date``.
            news_transform (callable, optional): Optional transform to be applied
                on the paragraph of a sample.
            date_transform (callable, optional): Optional transform to be applied
                on the ``(year, month)`` tuple of a sample.
        """
        super().__init__()
        self.corpus = corpus
        self.news_transform = news_transform
        self.date_transform = date_transform

    def __len__(self):
        """Return the number of items in ``corpus.train``."""
        return len(self.corpus.train)

    def __getitem__(self, idx):
        """Return ``(paragraph, date)`` for item ``idx``, after optional transforms."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Read from the corpus held by this instance. The original looked the
        # date up in the notebook-global `corpus`, which breaks as soon as the
        # dataset is built from any other corpus object.
        article = self.corpus.train[idx]

        # Option apply
        paragraph = article.paragraph
        if self.news_transform is not None:
            paragraph = self.news_transform(paragraph)

        # The first four characters of `date` are parsed as the year and the
        # next two as the month.
        raw_date = article.date
        date = (int(raw_date[0:4]), int(raw_date[4:6]))
        if self.date_transform is not None:
            date = self.date_transform(date)

        return paragraph, date


In [None]:
# data also contains korpora index for debugging?
class NewsDateDatasetFromCsv(tds.Dataset):
    """News with Date dataset backed by a CSV file loaded fully into memory."""

    def __init__(self, filename, news_transform=None, date_transform=None,
                 max_rows=None):
        """
        Args:
            filename (str or Path): CSV file to read (UTF-8). Column 1 of each
                row is used as the paragraph and column 2 as an integer month
                offset (0 maps to 2009-01).
            news_transform (callable, optional): Optional transform to be applied
                on the paragraph of a sample.
            date_transform (callable, optional): Optional transform to be applied
                on the ``(year, month)`` tuple of a sample.
            max_rows (int, optional): Read at most this many rows. ``None``
                reads the whole file.
        """
        from itertools import islice

        super().__init__()
        self.filename = filename
        self.news_transform = news_transform
        self.date_transform = date_transform
        # newline='' is what the csv module asks for so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(self.filename, encoding='utf-8', newline='') as datafile:
            csvreader = csv.reader(datafile)
            if max_rows is None:
                row_iter = tqdm(csvreader)
            else:
                # islice stops quietly when the file is shorter than max_rows;
                # the previous next()-based loop raised StopIteration there.
                row_iter = tqdm(islice(csvreader, max_rows), total=max_rows)
            self.rows = list(row_iter)

    def __len__(self):
        """Return the number of rows loaded."""
        return len(self.rows)

    def __getitem__(self, idx):
        """Return ``(paragraph, date)`` for row ``idx``, after optional transforms.

        Raises:
            NotImplementedError: If ``idx`` is a tensor that does not convert
                to a single integer (batched indexing is not supported).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
            # A scalar index tensor becomes a plain int and is handled below;
            # anything else (e.g. a list of indices) is still unsupported.
            if not isinstance(idx, int):
                raise NotImplementedError('Not implemented')

        row = self.rows[idx]
        paragraph = row[1]
        date = int(row[2])

        # Option apply
        if self.news_transform is not None:
            paragraph = self.news_transform(paragraph)

        # Month offset -> (year, month), counting from January 2009.
        date = (date // 12 + 2009, date % 12 + 1)
        if self.date_transform is not None:
            date = self.date_transform(date)

        return paragraph, date


In [None]:
# Quick check of the tokenizer on a mixed English/Korean string.
# NOTE(review): `tokenizer` is assigned in the following cell, so on a fresh
# kernel that cell has to be executed before this one.
tokenizer("some test 문장", add_special_tokens=True)

{'input_ids': [2, 517, 440, 427, 423, 389, 517, 442, 396, 2120, 7178, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Alternative tokenizer (HanBert), kept for reference:
# tokenizer = tk_hanbert.HanBertTokenizer.from_pretrained('HanBert-54kN-torch')
tokenizer = KoBertTokenizer.from_pretrained('monologg/distilkobert')
# Tokenizer quick reference (was a no-op string literal with a misspelled
# method name, `covert_tokens_to_string`):
#   tokenizer.tokenize(text)
#   tokenizer.convert_tokens_to_ids(tokens)
#   tokenizer.convert_tokens_to_string(tokens)

# Pretrained DistilKoBERT encoder; SanctiMoly picks this global up as its encoder.
bert_model = DistilBertModel.from_pretrained('monologg/distilkobert')


Downloading:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/76.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/441 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


Downloading:   0%|          | 0.00/108M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/distilkobert were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class Compose(object):
    """Chain several transforms into one callable.

    Modeled on torchvision.transforms.Compose: each transform receives the
    output of the previous one, in list order.
    """

    def __init__(self, transforms):
        # Sequence of callables, applied first to last.
        self.transforms = transforms

    def __call__(self, img):
        """Feed ``img`` through every transform and return the final value."""
        result = img
        for transform in self.transforms:
            result = transform(result)
        return result

    def __repr__(self):
        """Return the class name followed by one indented line per transform."""
        lines = [self.__class__.__name__ + '(']
        lines.extend('    {0}'.format(transform) for transform in self.transforms)
        lines.append(')')
        return '\n'.join(lines)


# Probably very slow
class RegexSubstitution(object):
    """Transform that replaces regex matches in a string or a list of strings.

    The substitution is applied twice in a row, so text that only starts to
    match after the first pass (e.g. the outer pair of nested parentheses)
    is replaced as well.
    """

    def __init__(self, regex, sub=''):
        # Accept either a ready-made compiled pattern or a pattern string.
        self.regex = regex if isinstance(regex, re.Pattern) else re.compile(regex)
        self.sub = sub

    def __call__(self, target):
        """Return ``target`` with matches replaced; lists are processed per item."""
        def substitute_twice(text):
            first_pass = self.regex.sub(self.sub, text)
            return self.regex.sub(self.sub, first_pass)

        if not isinstance(target, list):
            return substitute_twice(target)
        return [substitute_twice(string) for string in target]

class SquashParagraph(object):
    """Join an iterable of strings into one newline-separated string."""

    def __call__(self, sample):
        line_break = '\n'
        return line_break.join(sample)

class ExcludeTitle(object):
    """Drop the first element of the sample (the title, going by the class name)."""

    def __call__(self, sample):
        without_first = sample[1:]
        return without_first

class AddSpecialToken(object):
    """Wrap a string as ``[CLS] <sample> [SEP]``."""

    def __call__(self, sample):
        prefix = "[CLS] "
        suffix = " [SEP]"
        return prefix + sample + suffix

class TokenizeString(object):
    """Encode text with the module-level ``tokenizer``.

    Calling an instance returns ``(input_ids, length)`` taken from the
    tokenizer's output.
    """

    def __init__(self, max_length=0):
        # NOTE(review): the default of 0 is passed straight through as the
        # truncation length; the callers visible in this notebook pass 512.
        # Confirm that 0 is a sensible default.
        self.max_length = max_length

    def __call__(self, sample):
        tokenizer_kwargs = dict(
            text=sample,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            return_length=True,
        )
        encoding = tokenizer(**tokenizer_kwargs)
        input_ids = encoding['input_ids']
        length = encoding['length']
        return input_ids, length

# One-Hot Encoding tensor: row i is the float32 one-hot vector of class i.
# 120 classes = 10 years x 12 months, counted from January 2009.
one_hot = F.one_hot(torch.arange(120))
one_hot = torch.as_tensor(one_hot, dtype = torch.float32)
class OneHotEncoding(object):
    """ (year, month) -> float32 one-hot tensor of shape (120,) """

    def __call__(self, date):
        """Return the one-hot row for ``date``.

        Args:
            date: ``(year, month)`` pair; index 0 corresponds to (2009, 1).

        Raises:
            IndexError: If the date falls outside the 120 encoded months.
        """
        year, month = date
        index = 12 * (year - 2009) + month - 1
        # Without this check a date before 2009 yields a negative index, which
        # silently wraps around and returns the label of a late month instead.
        if not 0 <= index < one_hot.size(0):
            raise IndexError(
                f"date ({year}, {month}) maps to class index {index}, "
                f"outside the valid range [0, {one_hot.size(0)})")
        return one_hot[index]

class ToFloatTensor(object):
    """Convert the given value to a float32 tensor via ``torch.as_tensor``."""

    def __call__(self, date):
        target_dtype = torch.float32
        return torch.as_tensor(date, dtype=target_dtype)

def collate_batch_normal(batch):
    """Collate ``((token_ids, length), label)`` samples into padded tensors.

    Labels are converted with ``torch.as_tensor``. Sequences are padded with
    the tokenizer's pad id, and every output is reordered by descending
    sequence length.

    Returns:
        Tuple ``(ids, lengths, labels)``, each moved to the global ``device``.
    """
    labels, sequences, lengths = [], [], []
    for (token_ids, n_tokens), label in batch:
        labels.append(label)
        sequences.append(torch.as_tensor(token_ids))
        lengths.append(n_tokens)

    # as_tensor avoids a copy where possible
    label_tensor = torch.as_tensor(labels)
    length_tensor = torch.as_tensor(lengths)

    # Pad every sequence to the longest one in the batch
    padded_ids = nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=tokenizer.pad_token_id)

    # Sort the batch by length, longest first
    sorted_lengths, order = torch.sort(length_tensor, dim=0, descending=True)
    padded_ids = padded_ids.index_select(0, order)
    label_tensor = label_tensor.index_select(0, order)

    return padded_ids.to(device), sorted_lengths.to(device), label_tensor.to(device)

# for One hot encoding?
def collate_batch(batch):
    """Collate ``((token_ids, length), label)`` samples whose labels are tensors.

    Same as ``collate_batch_normal`` except that the labels are combined
    with ``torch.stack``, so each label must already be a tensor.

    Returns:
        Tuple ``(ids, lengths, labels)``, each moved to the global ``device``.
    """
    labels, sequences, lengths = [], [], []
    for (token_ids, n_tokens), label in batch:
        labels.append(label)
        sequences.append(torch.as_tensor(token_ids))
        lengths.append(n_tokens)

    # Labels are already tensors: stack them along a new batch dimension
    label_tensor = torch.stack(labels)
    length_tensor = torch.as_tensor(lengths)

    # Pad every sequence to the longest one in the batch
    padded_ids = nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=tokenizer.pad_token_id)

    # Sort the batch by length, longest first
    sorted_lengths, order = torch.sort(length_tensor, dim=0, descending=True)
    padded_ids = padded_ids.index_select(0, order)
    label_tensor = label_tensor.index_select(0, order)

    return padded_ids.to(device), sorted_lengths.to(device), label_tensor.to(device)

# 데이터셋 로드

In [None]:
# Load the corpus (skipped when a corpus is already held in the notebook globals)
if not C.dataset.use_korpora:
    print('C.dataset.use_korpora = False')
else:
    # Flip to True to reload even if a corpus is already in memory
    corpus_force_load = False
    if ('corpus' in globals()) and (corpus is not None) and (not corpus_force_load):
        print('Corpus already loaded')
    else:
        # Reset first: if loading raises, `corpus` stays None and the next run retries
        corpus = None
        corpus = ModuNewsKorpus(
            root_dir=str(C.gdrive.root / ('smallnews' if C.corpus.use_smallset else 'news')),
            force_download=False,
            load_light=False)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    모두의 말뭉치는 문화체육관광부 산하 국립국어원에서 제공하는 말뭉치로
    총 13 개의 말뭉치로 이뤄져 있습니다.

    해당 말뭉치를 이용하기 위해서는 국립국어원 홈페이지에 가셔서 "회원가입 > 말뭉치 신청 > 승인"의
    과정을 거치셔야 합니다.

    https://corpus.korean.go.kr/#none

    모두의 말뭉치는 승인 후 다운로드 가능 기간 및 횟수 (3회) 에 제한이 있습니다.

    로그인 기능 및 Korpora 패키지에서의 다운로드 기능을 제공하려 하였지만,
    국립국어원에서 위의 이유로 이에 대한 기능은 제공이 불가함을 확인하였습니다.

    Korpora==0.2.0 에서는 "개별 말뭉치 신청 > 승인"이 완료되었다고 가정,
    로컬에 다운로드 된 말뭉치를 손쉽게 로딩하는 기능만 제공합니다

    (Korpora 개발진 lovit@github, ratsgo@github)

    # License
    모두의 말뭉치의 모든 저작권은 `문화체육관광부 국립국어원
    (National Institute of Korean Language)` 에 귀속됩니다.

    소유권을 포함한 전문은 다음의 주소에서 확인하실 수 있습니다.

    https://corpus.korean.go.kr/boards/termsInfo.do

    제13조 (소유권)
    ① 누리집이 제공하는 서비스, 그에 필요한 소프트웨어, 이미지, 마크, 로고, 디자인

Loading ModuNews: 100%|██████████| 10/10 [00:26<00:00,  2.65s/it]


In [None]:
# Build the dataset from the configured source.
# Korpora path: drop the title, join the paragraph lines, strip parenthesised
# text and a few symbols, then tokenize (max length 512).
# CSV path: only tokenization is applied.
# In both cases the date is turned into a one-hot label.
if C.dataset.use_korpora:
    nds_dataset = NewsDateDatasetFromKorpora(corpus, news_transform=Compose([ExcludeTitle(),
                                                              SquashParagraph(),
                                                              RegexSubstitution(r'\([^()]+\)|[<>\'"△▲□■]'),
                                                              # PreprocessString(),
                                                              TokenizeString(512)
                                                              ]),
                              date_transform=OneHotEncoding() )
else:
    nds_dataset = NewsDateDatasetFromCsv(C.dataset.csv_file, news_transform=TokenizeString(512),
                              date_transform=OneHotEncoding() )
# Optionally wrap the dataset with the torchdatasets cache
if C.dataset.use_cache:
    nds_dataset = nds_dataset.cache()
print("Total news:", len(nds_dataset))

# Train/validation sizes; validation takes whatever is left after flooring
n_train = math.floor(C.dataset.split_train * len(nds_dataset))
n_val = len(nds_dataset) - n_train
print("Dataset split to (train, val) = (%d, %d)" % (n_train, n_val))

# NOTE(review): no explicit generator is passed, so the split presumably depends
# on the global torch RNG state at this point — confirm it is reproducible when
# resuming from a checkpoint.
train_dataset, valid_dataset = tud.random_split(nds_dataset, [n_train, n_val])

# Training batches are shuffled; validation batches keep a fixed order
train_loader = tud.DataLoader(train_dataset,
                              batch_size=C.dataset.batch_size,
                              shuffle=True,
                              drop_last=False,
                              collate_fn=collate_batch)
valid_loader = tud.DataLoader(valid_dataset,
                              batch_size=C.dataset.batch_size,
                              shuffle=False,
                              drop_last=False,
                              collate_fn=collate_batch)


Total news: 115080
Dataset split to (train, val) = (92064, 23016)


In [None]:
# Caching dataset: iterate over every sample once and discard the results
# (presumably this populates the cache enabled via nds_dataset.cache() — TODO confirm)
if C.dataset.use_cache:
    for batch_result in enumerate(tqdm(nds_dataset)):
        pass

  0%|          | 0/115080 [00:00<?, ?it/s]

# 모델

In [None]:
class SanctiMoly(nn.Module):
    """ Holy Moly News BERT

    Wraps the module-level ``bert_model`` encoder and adds an MLP head that
    maps the hidden state at sequence position 0 to 120 output logits.
    """

    def __init__(self, freeze_bert = True):
        """
        Args:
            freeze_bert (bool): When true, the encoder parameters are excluded
                from gradient updates; otherwise they are trainable.
        """
        super(SanctiMoly, self).__init__()
        # NOTE: this is the shared global encoder instance, not a copy.
        self.encoder = bert_model
        # FC-BN-Tanh
        self.linear = nn.Sequential(nn.Linear(768, 1024),
                                    nn.BatchNorm1d(1024),
                                    nn.Tanh(),
                                    nn.Dropout(),
                                    nn.Linear(1024, 768),
                                    nn.BatchNorm1d(768),
                                    nn.Tanh(),
                                    nn.Dropout(),
                                    nn.Linear(768, 120)
                                    )
        # self.softmax = nn.LogSoftmax(dim=-1)

        # One loop replaces the two duplicated branches.
        encoder_trainable = not freeze_bert
        for param in self.encoder.parameters():
            param.requires_grad = encoder_trainable

    def forward(self, input_ids, input_length):
        """Compute logits for a padded batch.

        Args:
            input_ids: Padded token ids, indexed as (batch, sequence).
            input_length: Per-sample count of real (unpadded) tokens.

        Returns:
            Output of the MLP head applied to the encoder's hidden state at
            sequence position 0.
        """
        # calculate attention mask: position j is attended iff j < length.
        # Build it on the input's own device rather than the notebook-global
        # `device`, so the module works wherever its inputs live.
        positions = torch.arange(input_ids.size(1), device=input_ids.device)
        attn_mask = positions[None, :] < input_length.to(input_ids.device)[:, None]

        enc_o = self.encoder(input_ids, attention_mask=attn_mask)

        output = self.linear(enc_o.last_hidden_state[:, 0, :])
        return output

# 학습 모듈

In [None]:
def i2ym(fl):
    """Convert a month index (0 = January 2009) into a ``(year, month)`` tuple."""
    year = fl // 12 + 2009
    month = fl % 12 + 1
    return (year, month)

class ModelContainer(object):
    """ Util class for pytorch models

    Bundles a model with its criterion / optimizer / scheduler and provides
    epoch loops, checkpoint save/load and small evaluation helpers.

    Relies on notebook globals: ``C`` (config), ``train_loader``,
    ``valid_loader``, ``collate_batch``, ``i2ym`` and ``tqdm``.
    """

    # initialize with clean state
    def __init__(self, model, device):
        """Move ``model`` to ``device`` and reset all training bookkeeping."""
        self.model = model.to(device)
        self.device = device
        self.train_epochs_done = 0
        self.best_valid_accuracy = -math.inf
        self.criterion = None
        self.optimizer = None
        self.scheduler = None

    @staticmethod
    def _timestamp():
        """Return the current local time formatted for log lines."""
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # set cos
    def set_cos(self, criterion=None, optimizer=None, scheduler=None):
        """Set the criterion, optimizer and scheduler (any may be None)."""
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler

    def save_to_file(self, path):
        """Write a checkpoint (epoch count, model/optimizer/scheduler state, best accuracy) to ``path``."""
        optimizer_state = self.optimizer.state_dict() if self.optimizer is not None else None
        scheduler_state = self.scheduler.state_dict() if self.scheduler is not None else None
        torch.save({
            'train_epochs_done': self.train_epochs_done,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': optimizer_state,
            'scheduler_state_dict': scheduler_state,
            'best_valid_accuracy': self.best_valid_accuracy
            }, path)
        print(f'{self._timestamp()} || Model saved as {path}')

    def load_from_file(self, path):
        """Restore a checkpoint written by ``save_to_file``.

        Optimizer / scheduler state is restored only when both the checkpoint
        holds it and the corresponding object has been set on this container.
        """
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only
        # runtime (and vice versa) instead of failing during deserialization.
        checkpoint = torch.load(path, map_location=self.device)
        self.train_epochs_done = checkpoint['train_epochs_done']
        self.model.load_state_dict(checkpoint['model_state_dict'])
        osd = checkpoint['optimizer_state_dict']
        if (osd is not None) and (self.optimizer is not None):
            self.optimizer.load_state_dict(osd)
        ssd = checkpoint['scheduler_state_dict']
        if (ssd is not None) and (self.scheduler is not None):
            self.scheduler.load_state_dict(ssd)
        self.best_valid_accuracy = checkpoint['best_valid_accuracy']
        print(f'{self._timestamp()} || Model loaded from {path}')
        print(f'{self._timestamp()} ||  ... with Train Epochs = {self.train_epochs_done}, Best Valid Accuracy = {self.best_valid_accuracy:.2%}')

    def __run_batch(self, is_train, data_loader):
        """Run every batch of ``data_loader`` once.

        Labels are treated as one-hot / score vectors: accuracy compares the
        argmax of the prediction with the argmax of the label.

        Returns:
            Tuple ``(total_loss, n_total, n_correct)`` where ``total_loss`` is
            the sum of the per-batch loss values.
        """
        total_loss = 0
        n_total = 0
        n_correct = 0
        for i_batch, sample_batched in enumerate(tqdm(data_loader)):
            ids, ids_len, labels = sample_batched
            # Move labels once so the loss and the accuracy comparison both
            # see them on the model's device.
            labels = labels.to(self.device)

            pred = self.model(ids.to(self.device), ids_len.to(self.device))

            loss = self.criterion(pred, labels)
            if is_train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            batch_loss = float(loss)
            n_targets = labels.size(0)
            n_cur_correct = (pred.argmax(-1) == labels.argmax(-1)).long().sum().item()
            n_total += n_targets
            # NOTE(review): if the criterion averages over the batch, this sum
            # (later divided by n_total) and the per-batch `loss / n_targets`
            # below are scaled down by roughly the batch size. Kept as-is so
            # numbers stay comparable with earlier runs — confirm the intent.
            total_loss += batch_loss
            n_correct += n_cur_correct

            if (C.train.log_batch_interval is not None) and (i_batch % C.train.log_batch_interval == 0):
                if C.train.log_batch_sample:
                    pred_print = i2ym(pred.argmax(-1)[0].item())
                    label_print = i2ym(labels.argmax(-1)[0].item())
                    print(f"{self._timestamp()} ||   Batch {i_batch:>7} "
                        f"Sample Label: ({label_print[0]:4}, {label_print[1]:>2}) <> Pred: ({pred_print[0]:4}, {pred_print[1]:>2})")
                print(f"{self._timestamp()} ||   Batch {i_batch:>7} "
                    f"Loss {(batch_loss/n_targets):.6f} Accuracy {(n_cur_correct/n_targets):.2%}")
        return total_loss, n_total, n_correct

    def run_epoch(self, is_train, data_loader=None):
        """Run one training or evaluation epoch.

        Args:
            is_train (bool): Train (with optimizer steps) when true, otherwise
                evaluate under ``torch.no_grad()``.
            data_loader: Loader to iterate; defaults to the global
                ``train_loader`` / ``valid_loader`` depending on ``is_train``.

        Returns:
            Tuple ``(loss, accuracy)`` for the epoch.
        """
        if data_loader is None:
            data_loader = train_loader if is_train else valid_loader

        if is_train:
            self.model.train()
            total_loss, n_total, n_correct = self.__run_batch(is_train, data_loader)
        else:
            self.model.eval()
            with torch.no_grad():
                total_loss, n_total, n_correct = self.__run_batch(is_train, data_loader)
        total_loss /= n_total
        accu = n_correct / n_total

        if is_train:
            self.train_epochs_done = self.train_epochs_done + 1
        print(f"{self._timestamp()} || {'Train' if is_train else 'Valid'} Epoch {self.train_epochs_done:>2} "
            f"Loss {total_loss:.6f} Accuracy {accu:.2%}")
        return total_loss, accu

    def train(self, until_epochs=C.train.epochs, scheduler_call=None):
        """Train until ``train_epochs_done`` reaches ``until_epochs``.

        After each training epoch the model is validated, the scheduler is
        stepped with the validation loss, ``best.pt`` is written on a new best
        validation accuracy, and ``model.pt`` is written when autosave is on.

        Args:
            until_epochs (int): Target total number of epochs.
            scheduler_call (callable, optional): Called as
                ``scheduler_call(scheduler, val_loss)`` instead of
                ``scheduler.step(val_loss)``.
        """
        with tqdm(total=until_epochs, initial=self.train_epochs_done) as pbar:
            while self.train_epochs_done < until_epochs:
                self.run_epoch(is_train=True, data_loader=train_loader)
                val_loss, valid_accuracy = self.run_epoch(is_train=False, data_loader=valid_loader)
                if self.scheduler is not None:
                    if scheduler_call is None:
                        self.scheduler.step(val_loss)
                    else:
                        scheduler_call(self.scheduler, val_loss)
                if self.best_valid_accuracy < valid_accuracy:
                    # The closing parenthesis was missing from this message.
                    print(f"{self._timestamp()} || New best valid accuracy ({self.best_valid_accuracy:.2%} -> {valid_accuracy:.2%})")
                    self.best_valid_accuracy = valid_accuracy
                    self.save_to_file(C.train.checkpoint_path / 'best.pt')
                if C.train.autosave:
                    self.save_to_file(C.train.checkpoint_path / 'model.pt')
                pbar.update(1)

    def test(self, data_loader=None):
        """Evaluate on ``data_loader`` (default: global ``valid_loader``).

        Returns:
            Tuple ``(loss, accuracy)``; previously the metrics were computed
            and then dropped.
        """
        if data_loader is None:
            data_loader = valid_loader
        val_loss, valid_accuracy = self.run_epoch(is_train=False, data_loader=data_loader)
        return val_loss, valid_accuracy

    def test_single(self, data):
        """Print the label and the prediction for one dataset sample."""
        self.model.eval()
        with torch.no_grad():
            sample_batched = collate_batch([data])
            ids, ids_len, labels = sample_batched
            pred = self.model(ids.to(self.device), ids_len.to(self.device))
            pred_print = i2ym(pred.argmax(-1)[0].item())
            label_print = i2ym(labels.argmax(-1)[0].item())

            print(f"Label: ({label_print[0]:4}, {label_print[1]:>2}) <> Pred: ({pred_print[0]:4}, {pred_print[1]:>2})")


# 실제 학습

In [None]:
model_container = ModelContainer(model=SanctiMoly(freeze_bert=False),
                                 device=device)

# Put the criterion on the configured device; a hard-coded .cuda() fails on a
# CPU-only runtime even though `device` already falls back to CPU.
criterion = nn.CrossEntropyLoss().to(device)

# Only hand trainable parameters to the optimizer (matters when BERT is frozen).
# Original note on lr: "0.001, 0.1, 3e-5 is SOTA" — presumably 3e-5 performed
# best among the values tried.
optimizer = optim.AdamW(
    (p for p in model_container.model.parameters() if p.requires_grad),
    lr=3e-5)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
        factor=0.25, patience=1, threshold=0.0001, threshold_mode='rel',
        cooldown=0, min_lr=0, eps=1e-08, verbose=False)

model_container.set_cos(criterion, optimizer, scheduler)

# To start from scratch, comment out both load calls below before running.
# Otherwise, load the checkpoint you want to resume from and go.
# Adjust C.train.checkpoint_path so that new saves do not clash with old ones.
# model_container.load_from_file(C.train.checkpoint_path / 'model.pt')
model_container.load_from_file(C.train.checkpoint_path / 'best.pt')

2021-12-12 12:26:37 || Model loaded from /gdrive/My Drive/cs492i/nBTT_25_ckpts/best.pt
2021-12-12 12:26:37 ||  ... with Train Epochs = 20, Best Valid Accuracy = 36.46%


In [None]:
# Train until the configured total number of epochs is reached
# (epochs already recorded in the container count toward that total)
model_container.train(C.train.epochs)

 80%|########  | 20/25 [00:00<?, ?it/s]

  0%|          | 0/1439 [00:00<?, ?it/s]

2021-12-06 09:01:08 ||   Batch       0 Sample Label: (2012, 10) <> Pred: (2013, 10)
2021-12-06 09:01:08 ||   Batch       0 Loss 0.045572 Accuracy 21.88%
2021-12-06 09:02:33 ||   Batch     100 Sample Label: (2014,  9) <> Pred: (2011,  5)
2021-12-06 09:02:33 ||   Batch     100 Loss 0.037702 Accuracy 34.38%
2021-12-06 09:03:57 ||   Batch     200 Sample Label: (2016,  6) <> Pred: (2017,  8)
2021-12-06 09:03:57 ||   Batch     200 Loss 0.042941 Accuracy 29.69%
2021-12-06 09:05:22 ||   Batch     300 Sample Label: (2018,  5) <> Pred: (2018,  3)
2021-12-06 09:05:22 ||   Batch     300 Loss 0.038144 Accuracy 39.06%
2021-12-06 09:06:47 ||   Batch     400 Sample Label: (2013, 12) <> Pred: (2012, 11)
2021-12-06 09:06:47 ||   Batch     400 Loss 0.040291 Accuracy 26.56%
2021-12-06 09:08:11 ||   Batch     500 Sample Label: (2012,  3) <> Pred: (2012,  3)
2021-12-06 09:08:11 ||   Batch     500 Loss 0.042251 Accuracy 29.69%
2021-12-06 09:09:36 ||   Batch     600 Sample Label: (2013, 11) <> Pred: (2014,  2

  0%|          | 0/360 [00:00<?, ?it/s]

2021-12-06 09:21:25 ||   Batch       0 Sample Label: (2015,  7) <> Pred: (2012,  7)
2021-12-06 09:21:25 ||   Batch       0 Loss 0.036218 Accuracy 45.31%
2021-12-06 09:21:53 ||   Batch     100 Sample Label: (2009,  5) <> Pred: (2009,  6)
2021-12-06 09:21:53 ||   Batch     100 Loss 0.035182 Accuracy 34.38%
2021-12-06 09:22:22 ||   Batch     200 Sample Label: (2017,  3) <> Pred: (2015,  3)
2021-12-06 09:22:22 ||   Batch     200 Loss 0.039653 Accuracy 34.38%
2021-12-06 09:22:51 ||   Batch     300 Sample Label: (2013,  2) <> Pred: (2013,  2)
2021-12-06 09:22:51 ||   Batch     300 Loss 0.038174 Accuracy 31.25%
2021-12-06 09:23:08 || Valid Epoch 21 Loss 0.036580 Accuracy 36.17%
2021-12-06 09:23:09 || Model saved as /gdrive/My Drive/cs492i/nBTT_20_ckpts/model.pt


  0%|          | 0/1439 [00:00<?, ?it/s]

2021-12-06 09:23:10 ||   Batch       0 Sample Label: (2014,  5) <> Pred: (2010, 11)
2021-12-06 09:23:10 ||   Batch       0 Loss 0.041201 Accuracy 21.88%
2021-12-06 09:24:35 ||   Batch     100 Sample Label: (2009,  4) <> Pred: (2009,  4)
2021-12-06 09:24:35 ||   Batch     100 Loss 0.041215 Accuracy 26.56%
2021-12-06 09:26:00 ||   Batch     200 Sample Label: (2016,  5) <> Pred: (2018,  4)
2021-12-06 09:26:00 ||   Batch     200 Loss 0.033774 Accuracy 42.19%
2021-12-06 09:27:24 ||   Batch     300 Sample Label: (2009,  1) <> Pred: (2009,  1)
2021-12-06 09:27:24 ||   Batch     300 Loss 0.042510 Accuracy 28.12%
2021-12-06 09:28:49 ||   Batch     400 Sample Label: (2014,  4) <> Pred: (2014,  4)
2021-12-06 09:28:49 ||   Batch     400 Loss 0.038488 Accuracy 40.62%
2021-12-06 09:30:13 ||   Batch     500 Sample Label: (2015,  7) <> Pred: (2011,  8)
2021-12-06 09:30:13 ||   Batch     500 Loss 0.040129 Accuracy 34.38%
2021-12-06 09:31:38 ||   Batch     600 Sample Label: (2009,  2) <> Pred: (2009,  2

  0%|          | 0/360 [00:00<?, ?it/s]

2021-12-06 09:43:27 ||   Batch       0 Sample Label: (2015,  7) <> Pred: (2012,  7)
2021-12-06 09:43:27 ||   Batch       0 Loss 0.036284 Accuracy 45.31%
2021-12-06 09:43:56 ||   Batch     100 Sample Label: (2009,  5) <> Pred: (2009,  6)
2021-12-06 09:43:56 ||   Batch     100 Loss 0.035216 Accuracy 34.38%
2021-12-06 09:44:24 ||   Batch     200 Sample Label: (2017,  3) <> Pred: (2015,  3)
2021-12-06 09:44:24 ||   Batch     200 Loss 0.039805 Accuracy 34.38%
2021-12-06 09:44:53 ||   Batch     300 Sample Label: (2013,  2) <> Pred: (2013,  2)
2021-12-06 09:44:53 ||   Batch     300 Loss 0.038380 Accuracy 32.81%
2021-12-06 09:45:10 || Valid Epoch 22 Loss 0.036697 Accuracy 36.08%
2021-12-06 09:45:12 || Model saved as /gdrive/My Drive/cs492i/nBTT_20_ckpts/model.pt


  0%|          | 0/1439 [00:00<?, ?it/s]

2021-12-06 09:45:13 ||   Batch       0 Sample Label: (2011,  3) <> Pred: (2012, 10)
2021-12-06 09:45:13 ||   Batch       0 Loss 0.043369 Accuracy 26.56%
2021-12-06 09:46:37 ||   Batch     100 Sample Label: (2015, 11) <> Pred: (2016, 11)
2021-12-06 09:46:37 ||   Batch     100 Loss 0.038100 Accuracy 23.44%
2021-12-06 09:48:02 ||   Batch     200 Sample Label: (2016,  5) <> Pred: (2014, 11)
2021-12-06 09:48:02 ||   Batch     200 Loss 0.041567 Accuracy 32.81%
2021-12-06 09:49:26 ||   Batch     300 Sample Label: (2012,  5) <> Pred: (2009,  8)
2021-12-06 09:49:26 ||   Batch     300 Loss 0.041649 Accuracy 37.50%
2021-12-06 09:50:51 ||   Batch     400 Sample Label: (2009, 10) <> Pred: (2009,  1)
2021-12-06 09:50:51 ||   Batch     400 Loss 0.046975 Accuracy 25.00%
2021-12-06 09:52:15 ||   Batch     500 Sample Label: (2015,  6) <> Pred: (2015,  6)
2021-12-06 09:52:15 ||   Batch     500 Loss 0.036048 Accuracy 32.81%
2021-12-06 09:53:40 ||   Batch     600 Sample Label: (2012,  3) <> Pred: (2012,  3

  0%|          | 0/360 [00:00<?, ?it/s]

2021-12-06 10:05:29 ||   Batch       0 Sample Label: (2015,  7) <> Pred: (2012,  7)
2021-12-06 10:05:29 ||   Batch       0 Loss 0.036321 Accuracy 46.88%
2021-12-06 10:05:57 ||   Batch     100 Sample Label: (2009,  5) <> Pred: (2009,  6)
2021-12-06 10:05:57 ||   Batch     100 Loss 0.035305 Accuracy 34.38%
2021-12-06 10:06:26 ||   Batch     200 Sample Label: (2017,  3) <> Pred: (2015,  3)
2021-12-06 10:06:26 ||   Batch     200 Loss 0.039895 Accuracy 35.94%
2021-12-06 10:06:55 ||   Batch     300 Sample Label: (2013,  2) <> Pred: (2013,  2)
2021-12-06 10:06:55 ||   Batch     300 Loss 0.038584 Accuracy 32.81%
2021-12-06 10:07:12 || Valid Epoch 23 Loss 0.036775 Accuracy 36.03%
2021-12-06 10:07:14 || Model saved as /gdrive/My Drive/cs492i/nBTT_20_ckpts/model.pt


  0%|          | 0/1439 [00:00<?, ?it/s]

2021-12-06 10:07:14 ||   Batch       0 Sample Label: (2018,  6) <> Pred: (2018,  1)
2021-12-06 10:07:14 ||   Batch       0 Loss 0.043557 Accuracy 26.56%
2021-12-06 10:08:39 ||   Batch     100 Sample Label: (2015, 12) <> Pred: (2014, 10)
2021-12-06 10:08:39 ||   Batch     100 Loss 0.044775 Accuracy 32.81%
2021-12-06 10:10:04 ||   Batch     200 Sample Label: (2017,  3) <> Pred: (2010,  3)
2021-12-06 10:10:04 ||   Batch     200 Loss 0.046387 Accuracy 28.12%
2021-12-06 10:11:28 ||   Batch     300 Sample Label: (2012,  2) <> Pred: (2012,  2)
2021-12-06 10:11:28 ||   Batch     300 Loss 0.042189 Accuracy 23.44%
2021-12-06 10:12:53 ||   Batch     400 Sample Label: (2016,  8) <> Pred: (2016,  8)
2021-12-06 10:12:53 ||   Batch     400 Loss 0.042938 Accuracy 35.94%
2021-12-06 10:14:17 ||   Batch     500 Sample Label: (2017,  6) <> Pred: (2017,  6)
2021-12-06 10:14:17 ||   Batch     500 Loss 0.039923 Accuracy 29.69%
2021-12-06 10:15:42 ||   Batch     600 Sample Label: (2016,  3) <> Pred: (2016,  3

  0%|          | 0/360 [00:00<?, ?it/s]

2021-12-06 10:27:30 ||   Batch       0 Sample Label: (2015,  7) <> Pred: (2012,  7)
2021-12-06 10:27:30 ||   Batch       0 Loss 0.036191 Accuracy 45.31%
2021-12-06 10:27:59 ||   Batch     100 Sample Label: (2009,  5) <> Pred: (2009,  6)
2021-12-06 10:27:59 ||   Batch     100 Loss 0.035047 Accuracy 34.38%
2021-12-06 10:28:28 ||   Batch     200 Sample Label: (2017,  3) <> Pred: (2015,  3)
2021-12-06 10:28:28 ||   Batch     200 Loss 0.039760 Accuracy 35.94%
2021-12-06 10:28:57 ||   Batch     300 Sample Label: (2013,  2) <> Pred: (2013,  2)
2021-12-06 10:28:57 ||   Batch     300 Loss 0.038278 Accuracy 34.38%
2021-12-06 10:29:14 || Valid Epoch 24 Loss 0.036581 Accuracy 36.17%
2021-12-06 10:29:15 || Model saved as /gdrive/My Drive/cs492i/nBTT_20_ckpts/model.pt


  0%|          | 0/1439 [00:00<?, ?it/s]

2021-12-06 10:29:16 ||   Batch       0 Sample Label: (2014,  5) <> Pred: (2009, 10)
2021-12-06 10:29:16 ||   Batch       0 Loss 0.040583 Accuracy 32.81%
2021-12-06 10:30:41 ||   Batch     100 Sample Label: (2013,  2) <> Pred: (2009,  1)
2021-12-06 10:30:41 ||   Batch     100 Loss 0.041219 Accuracy 29.69%
2021-12-06 10:32:05 ||   Batch     200 Sample Label: (2011,  7) <> Pred: (2011,  7)
2021-12-06 10:32:05 ||   Batch     200 Loss 0.042739 Accuracy 34.38%
2021-12-06 10:33:30 ||   Batch     300 Sample Label: (2017,  3) <> Pred: (2017,  4)
2021-12-06 10:33:30 ||   Batch     300 Loss 0.043544 Accuracy 29.69%
2021-12-06 10:34:54 ||   Batch     400 Sample Label: (2012,  5) <> Pred: (2012,  6)
2021-12-06 10:34:54 ||   Batch     400 Loss 0.039018 Accuracy 34.38%
2021-12-06 10:36:19 ||   Batch     500 Sample Label: (2009,  9) <> Pred: (2009,  9)
2021-12-06 10:36:19 ||   Batch     500 Loss 0.038467 Accuracy 39.06%
2021-12-06 10:37:43 ||   Batch     600 Sample Label: (2018, 12) <> Pred: (2018,  7

  0%|          | 0/360 [00:00<?, ?it/s]

2021-12-06 10:49:32 ||   Batch       0 Sample Label: (2015,  7) <> Pred: (2012,  7)
2021-12-06 10:49:32 ||   Batch       0 Loss 0.036145 Accuracy 46.88%
2021-12-06 10:50:01 ||   Batch     100 Sample Label: (2009,  5) <> Pred: (2009,  6)
2021-12-06 10:50:01 ||   Batch     100 Loss 0.035079 Accuracy 35.94%
2021-12-06 10:50:30 ||   Batch     200 Sample Label: (2017,  3) <> Pred: (2015,  3)
2021-12-06 10:50:30 ||   Batch     200 Loss 0.039680 Accuracy 34.38%
2021-12-06 10:50:59 ||   Batch     300 Sample Label: (2013,  2) <> Pred: (2013,  2)
2021-12-06 10:50:59 ||   Batch     300 Loss 0.038163 Accuracy 32.81%
2021-12-06 10:51:16 || Valid Epoch 25 Loss 0.036599 Accuracy 36.28%
2021-12-06 10:51:17 || Model saved as /gdrive/My Drive/cs492i/nBTT_20_ckpts/model.pt


# Playground

In [None]:
test_corpus_idx = 0
if C.dataset.use_korpora:
    # Run the model on a single preprocessed dataset item. The recorded cell
    # output suggests test_single prints a "Label <> Pred" line.
    model_container.test_single(nds_dataset[test_corpus_idx])

    # Show the raw corpus entry at the same index for comparison: its date
    # first, then its paragraphs, one per line.
    article = corpus.train[test_corpus_idx]
    print(article.date)
    article_body = '\n'.join(article.paragraph)
    print(article_body)

Label: (2009,  1) <> Pred: (2009,  1)
20090101
"대통령, 시장 방문만 하지 말고 실천해달라"
2008년의 마지막 새벽, 언론의 카메라는 서울 여의도를 향했다. 방송법 등 주요쟁점 법안이 상정될 국회 본회의장을 두고 여야 의원들의 전쟁을 기다리고 있었던 것.
같은 시각, 국회 밖 세상에서 서민들은 경제 위기와 강추위 속에서 삶의 고단함과 정치에 대한 절망감에 맞선 채 팍팍한 삶을 이어가고 있었다. 이들의 목소리를 듣기 위해 이날 새벽 3시 대한민국의 아침을 여는 서울 가락동 농수산물종합도매시장으로 향했다.
가락시장으로 가는 택시 안에서 2008년의 마지막 새벽을 맞는 서민들의 얘기를 엿들을 수 있었다. 택시기사 서인철(가명·63)씨는 말한다.
"LPG값이 휘발유보다 비싸졌고, 9만9000원 하는 사납금 내기도 벅차다. (한 달에) 100만원도 벌기 힘들다. 40년 동안 택시기사를 했는데, 요즘이 제일 힘들다. 택시 손님 열에 아홉은 힘들다고 한다. 더 암울한 건 내년엔 더 어렵다는 거다. 정치인들? 멱살 잡고 싸우는 거 말고 뭘 하나? 이젠 짜증내기도 귀찮다."
치열한 삶터, 가락시장에 드리워진 불황의 그림자
새벽 3시 30분, 가락시장의 첫인상은 그야말로 치열한 삶터였다. 가락시장 인근 도로엔 농수산물을 싣고 전국에서 올라온 트럭들이 '차산차해'를 이뤘다. 가락시장 안에는 트럭과 함께 짐을 옮기는 지게차·오토바이·손수레로 가득 차 더욱 복잡했다.
가락시장 수산시장에서는 방어 경매가 한창이었다. 중개인이 일반인들이 알아듣기 힘든 말로 몇 차례 중얼거리자 경매는 이내 끝났다. 방어는 어찌나 힘이 좋은지 한 번 몸을 뒤집자 바구니 밖 3~4m까지 물이 튈 정도였다.
상인들도 방어만큼이나 바삐 움직였다. 사람들은 분주히 수산물을 날랐다. 경매로 수산물을 산 중간도매상들은 좌판을 펼쳐놓고 손님들을 기다렸다. 그들은 머리엔 모자·스카프 등을 두르고, 신발은 털장화를 싣고는 난로 옆에 바싹 붙어 생선 손질을 하고 있었다.
쪼그리고 앉아

In [None]:
if C.dataset.use_korpora:
    # Score the model on the first `num_eval_samples` items of nds_dataset,
    # one item per forward pass, and report two percentages:
    #   * Top3 Acc: the true class is among the 3 highest-scoring classes.
    #   * In3 Acc:  the arg-max class index is within 3 of the true index
    #     (presumably 3 months, given the year-month labels -- TODO confirm).
    #
    # The sample count lives in a single variable and both percentages are
    # derived from it, so changing the count cannot leave a stale divisor.
    # NOTE(review): assumes nds_dataset holds at least this many items.
    num_eval_samples = 1000

    model_container.model.eval()
    with torch.no_grad():
        top3_count = 0
        in3_count = 0
        for test_corpus_idx in range(num_eval_samples):
            # Collate a one-item batch so the model sees its usual input form.
            sample_batched = collate_batch([nds_dataset[test_corpus_idx]])
            ids, ids_len, labels = sample_batched
            pred = model_container.model(ids.to(device), ids_len.to(device))

            # Labels are compared by arg-max index; plain Python ints avoid
            # moving the label tensor to the model device just for a lookup.
            label_idx = labels.argmax(-1)[0].item()
            pred_idx = pred.argmax(-1)[0].item()

            _, indices = torch.topk(pred, 3)
            if (indices == label_idx).any().item():
                top3_count += 1
            if abs(pred_idx - label_idx) <= 3:
                in3_count += 1

        print("Top3 Acc: ", 100 * top3_count / num_eval_samples)
        print("In3 Acc: ", 100 * in3_count / num_eval_samples)
            


Top3 Acc:  81.3
In3 Acc:  79.5


In [None]:
# Interactive check: read one article from stdin, clean it, tokenize it and
# print the model's three highest-scoring classes plus its arg-max prediction.
Text = input()

# Drop parenthesised spans and a handful of markup/bullet characters.
strip_markup = RegexSubstitution(r'\([^()]+\)|[<>\'"△▲□■]')
CleanText = strip_markup(Text)
print(CleanText)

# The tokenizer is called with a one-element batch and returns two values,
# which are converted to tensors before being fed to the model.
tokenize = TokenizeString(512)
btr_a, btr_b = tokenize([CleanText])
btr_a, btr_b = torch.as_tensor(btr_a), torch.as_tensor(btr_b)

# NOTE(review): 99 looks like the expected class index for the pasted sample
# text -- confirm, or replace with the real label when trying other inputs.
expected_class_idx = 99

model_container.model.eval()
with torch.no_grad():
    pred = model_container.model(btr_a.to(device), btr_b.to(device))
    _, indices = torch.topk(pred, 3)
    top3_indices = indices.squeeze(0)

    # True/False: is the expected class among the top-3 predictions?
    expected_class = torch.tensor(expected_class_idx).to(device)
    print(expected_class in top3_indices)

    # Top-3 classes rendered through i2ym, best first.
    for i in top3_indices:
        print(i2ym(i.item()))

    best_idx = pred.argmax(-1)[0].item()
    print("got", i2ym(best_idx))

더불어민주당 문재인 전대표가 국민의당과 비박계의 연대는 호남에 대한 배신이라며 다시 야권 통합론을 꺼냈습니다.  국민의당은 문 전대표는 분당의 주역으로 야권 통합을 말할 자격이 없다고 반발했습니다.  송수진 기자의 보도입니다.  <리포트>  문재인 전 민주당 대표가 민주당과 국민의당이 힘을 모아야 한다며 야권 통합론 띄우기에 나섰습니다.  모두 김대중, 노무현 정부의 후예라면서 대선 과정에서 다시 뭉쳐야 한다고 말했습니다.  <인터뷰> 문재인(더불어민주당 전 대표) : "함께 힘을 모아서 제3기 민주 정부를 만들어내라는 것이 호남 민심이 요구하는 것이라고..."  국민의당과 비박 세력의 연대는 호남 민심에 어긋나는 것이라며 제3지대 연대론을 비판했습니다.  문 전 대표의 이런 발언은 현재의 대선구도를 바꿀 수 있는 제3지대 연대론을 조기에 차단하고, 호남 민심을 결속시키기 위한 포석으로 풀이됩니다.  국민의당 지도부는 문재인 전 대표야말로 패권주의 청산을 거부해 분당을 만들어낸 인물이라며 반발했습니다.  <녹취> 주승용(국민의당 원내대표) : "친박과 친문을 제외한 모든 세력이 국민의당을 중심으로 플랫폼 정당으로 만들어서 이번 대선에서 반드시 승리할 수 있도록."  박지원 전 원내대표도 문 전 대표가 통합을 말할 자격이 있는지 의문이라며 문 전 대표와의 통합은 없을 것이라고 선을 그었습니다.  야권이 호남 민심을 놓고 양보 없는 경쟁을 벌이는 가운데 야권 통합론을 내세우는 민주당과 제3지대론을 내건 국민의당 간 대립은 더욱 치열해질 것으로 보입니다.
더불어민주당 문재인 전대표가 국민의당과 비박계의 연대는 호남에 대한 배신이라며 다시 야권 통합론을 꺼냈습니다.  국민의당은 문 전대표는 분당의 주역으로 야권 통합을 말할 자격이 없다고 반발했습니다.  송수진 기자의 보도입니다.  리포트  문재인 전 민주당 대표가 민주당과 국민의당이 힘을 모아야 한다며 야권 통합론 띄우기에 나섰습니다.  모두 김대중, 노무현 정부의 후예라면서 대선 과정에서 다시 뭉쳐야 한다고 말했습