## Package 설치


> 먼저 본 실습에 필요한 transformers와 pytorch-lightning을 설치합니다.

In [1]:
!pip install transformers==4.6.0
!pip install pytorch-lightning==1.3.1

Collecting transformers==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 25.7MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 46.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 47.1MB/s 
Inst

## 데이터 다운로드


> 다음으로 본 실습에 필요한 데이터셋을 다운로드합니다.



In [2]:
!git clone https://github.com/e9t/nsmc

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Total 14763 (delta 0), reused 0 (delta 0), pack-reused 14763[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 22.23 MiB/s, done.
Resolving deltas: 100% (1749/1749), done.
Checking out files: 100% (14737/14737), done.


In [3]:
# List the contents of the cloned nsmc/ directory.
!ls -l nsmc/

total 38624
drwxr-xr-x 2 root root     4096 May 17 08:32 code
-rw-r--r-- 1 root root  4893335 May 17 08:32 ratings_test.txt
-rw-r--r-- 1 root root 14628807 May 17 08:32 ratings_train.txt
-rw-r--r-- 1 root root 19515078 May 17 08:32 ratings.txt
drwxr-xr-x 2 root root   458752 May 17 08:32 raw
-rw-r--r-- 1 root root     2596 May 17 08:32 README.md
-rw-r--r-- 1 root root    36746 May 17 08:32 synopses.json


## Torch Dataset

> 다운로드한 nsmc 데이터로부터 학습 데이터를 만들기 위해 PyTorch Dataset을 만듭니다. 전처리는 가장 기본적인 과정(결측치 제거, 중복 제거)만 수행하였습니다.



In [4]:
import pandas as pd
import torch

from torch.utils.data import Dataset


class NSMCDataset(Dataset):
    """Tokenized view of an NSMC ratings file.

    The file is read as tab-separated text and must provide a ``document``
    column (the text) and a ``label`` column (the integer class). Rows with
    any missing value and rows whose ``document`` repeats an earlier one are
    dropped before tokenization.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the
            tab-separated ratings file.
        tokenizer: Object exposing ``batch_encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.

    Attributes:
        input_ids: Tensor returned under ``'input_ids'`` by the tokenizer;
            every row is padded to the longest encoded document in the file.
        labels: ``torch.long`` tensor holding one label per kept row.
    """

    def __init__(self, csv_file, tokenizer):
        # Build the cleaned frame as one chain instead of mutating in place,
        # so no intermediate frame is left half-processed.
        frame = (
            pd.read_csv(csv_file, sep='\t')
            .dropna(axis=0)                        # drop rows with NaN values
            .drop_duplicates(subset=['document'])  # drop repeated documents
        )

        # Coerce to str defensively: a value that pandas parsed as a number
        # would not be a valid tokenizer input.
        documents = frame['document'].astype(str).to_list()

        self.input_ids = tokenizer.batch_encode_plus(
            documents,
            return_tensors='pt',
            truncation=True,
            padding='longest',
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=False,
        )['input_ids']

        # Go through NumPy rather than handing torch a pandas Series: after
        # the filtering above the Series index has gaps, and the conversion
        # should depend on row position only, never on index labels.
        self.labels = torch.tensor(frame['label'].to_numpy(), dtype=torch.long)

    def __len__(self):
        """Return the number of examples kept after cleaning."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, label)`` pair stored at position ``idx``."""
        return self.input_ids[idx], self.labels[idx]


## Pytorch Lightning Model


> KoBERT를 기반으로 분류 모델을 만듭니다. KoBERT는 Hugging Face에 공개된 모델을 사용하였습니다.



In [5]:
import torch

from pytorch_lightning.core.lightning import LightningModule
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from sklearn.metrics import accuracy_score


class KoBERT(LightningModule):
    """BERT sequence classifier for NSMC, wrapped as a LightningModule.

    All configuration arrives as keyword arguments and is stored on
    ``self.hparams`` by ``save_hyperparameters``. The keys this class reads
    are ``model_path``, ``train_data_path``, ``val_data_path``, ``save_path``,
    ``max_epochs``, ``batch_size``, ``num_workers``, ``lr`` and
    ``warmup_ratio``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()
        self.bert = BertForSequenceClassification.from_pretrained(self.hparams.model_path)
        self.tokenizer = BertTokenizerFast.from_pretrained(self.hparams.model_path)
        # Tokenized datasets keyed by file path, so each file is read and
        # tokenized once no matter how often a dataloader is requested.
        self._datasets = {}

    def forward(self, **kwargs):
        """Forward every keyword argument to the wrapped BERT classifier."""
        return self.bert(**kwargs)

    def step(self, batch, batch_idx):
        """Run one batch and return its loss together with labels and predictions.

        Args:
            batch: ``(input_ids, labels)`` pair as produced by ``NSMCDataset``.
            batch_idx: Index of the batch; unused.

        Returns:
            Dict with ``'loss'`` (tensor), ``'y_true'`` and ``'y_pred'``
            (plain Python lists of class indices).
        """
        data, labels = batch

        # The dataset pads every sequence to the longest one but does not
        # ship an attention mask; without one the model attends to the pad
        # positions as if they were real tokens. Rebuild the mask from the
        # tokenizer's pad id.
        pad_id = self.tokenizer.pad_token_id
        attention_mask = None if pad_id is None else (data != pad_id).long()

        output = self(input_ids=data, attention_mask=attention_mask, labels=labels)

        loss = output.loss

        y_true = labels.tolist()
        y_pred = output.logits.argmax(dim=-1).tolist()

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def training_step(self, batch, batch_idx):
        """Training hook; delegates to ``step``."""
        return self.step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        """Validation hook; delegates to ``step``."""
        return self.step(batch, batch_idx)

    def epoch_end(self, outputs, state):
        """Aggregate per-batch results and log ``<state>_loss`` / ``<state>_acc``.

        Args:
            outputs: List of the dicts returned by ``step`` during the epoch.
            state: Prefix for the logged metric names (``'train'`` or ``'val'``).
        """
        loss = torch.tensor(0, dtype=torch.float)
        y_true = []
        y_pred = []
        for output in outputs:
            # Accumulate on CPU, detached from the graph.
            loss += output['loss'].cpu().detach()
            y_true.extend(output['y_true'])
            y_pred.extend(output['y_pred'])
        # Unweighted mean of the per-batch losses.
        loss = loss / len(outputs)

        self.log(state + '_loss', float(loss), on_epoch=True, prog_bar=True)
        self.log(state + '_acc', accuracy_score(y_true, y_pred), on_epoch=True, prog_bar=True)

    def training_epoch_end(self, outputs):
        """Log the epoch-level training loss and accuracy."""
        return self.epoch_end(outputs, state='train')

    def validation_epoch_end(self, outputs):
        """Log the epoch-level validation loss and accuracy."""
        return self.epoch_end(outputs, state='val')

    def configure_optimizers(self):
        """Create AdamW with a per-step cosine schedule and linear warmup.

        Returns:
            ``([optimizer], [lr_scheduler_config])`` in the form Lightning expects.
        """
        optimizer = AdamW(self.bert.parameters(), lr=self.hparams.lr)

        # Batches per epoch times epochs. The dataloader call is cheap here
        # because the tokenized dataset is cached by ``dataloader``.
        # NOTE(review): this assumes one optimizer step per batch on a single
        # device; confirm before using gradient accumulation or several GPUs.
        num_train_steps = len(self.train_dataloader()) * self.hparams.max_epochs
        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps
        )

        lr_scheduler = {
            'scheduler': scheduler,
            'interval': 'step',  # advance the schedule every optimizer step
            'frequency': 1
        }

        return [optimizer], [lr_scheduler]

    def dataloader(self, file_path, shuffle) -> DataLoader:
        """Build a DataLoader over the NSMC file at ``file_path``.

        The tokenized dataset is built on first use and reused afterwards, so
        repeated calls for the same path do not read and tokenize the file again.

        Args:
            file_path: Path of the tab-separated ratings file.
            shuffle: Whether the loader shuffles examples.
        """
        dataset = self._datasets.get(file_path)
        if dataset is None:
            dataset = NSMCDataset(file_path, self.tokenizer)
            self._datasets[file_path] = dataset
        return DataLoader(
            dataset,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            shuffle=shuffle,
            pin_memory=True,
        )

    def train_dataloader(self) -> DataLoader:
        """Shuffled loader over ``hparams.train_data_path``."""
        return self.dataloader(self.hparams.train_data_path, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Unshuffled loader over ``hparams.val_data_path``."""
        return self.dataloader(self.hparams.val_data_path, shuffle=False)

    def save_hugginface(self):
        """Save the wrapped BERT model to ``hparams.save_path`` via ``save_pretrained``."""
        self.bert.save_pretrained(self.hparams.save_path)

    # Correctly spelled alias; the original name is kept for existing callers.
    save_huggingface = save_hugginface

## 학습


> 모델을 학습합니다. 학습에 사용할 파라미터는 `args`에 정의하였습니다.



In [6]:
# Paths and hyperparameters passed to KoBERT(**args); max_epochs is also
# read when the Trainer is built.
args = dict(
    # Input data and output location.
    train_data_path='./nsmc/ratings_train.txt',
    val_data_path='./nsmc/ratings_test.txt',
    save_path='./huggingface_model',
    # Training length.
    max_epochs=3,
    # Pretrained checkpoint identifier given to from_pretrained.
    model_path='kykim/bert-kor-base',
    # DataLoader settings.
    batch_size=16,
    num_workers=2,
    # Optimizer learning rate and warmup fraction of the total steps.
    lr=5e-5,
    warmup_ratio=0.1,
)

In [7]:
# Build the Lightning module from the settings in `args`; its constructor
# loads the model and tokenizer from args['model_path'] via from_pretrained.
model = KoBERT(**args)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=725.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=475782997.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=344259.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=80.0, style=ProgressStyle(description_w…




In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

# Keep the three checkpoints with the highest validation accuracy. The file
# name is fully spelled out in the template, so automatic metric-name
# insertion is switched off.
checkpoint_callback = ModelCheckpoint(
    monitor='val_acc',
    mode='max',
    save_top_k=3,
    filename='epoch{epoch}-val_acc{val_acc:.4f}',
    auto_insert_metric_name=False,
)

# Query CUDA once; the answer drives both device selection and determinism.
cuda_available = torch.cuda.is_available()

trainer = Trainer(
    max_epochs=args['max_epochs'],
    callbacks=[checkpoint_callback],
    # -1 selects every visible GPU; None falls back to CPU.
    gpus=-1 if cuda_available else None,
    deterministic=cuda_available,
    # Skip the validation sanity check that normally runs before training.
    num_sanity_val_steps=0,
)

# The model provides its own train/val dataloaders, so none are passed here.
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type                          | Params
-------------------------------------------------------
0 | bert | BertForSequenceClassification | 118 M 
-------------------------------------------------------
118 M     Trainable params
0         Non-trainable params
118 M     Total params
473.196   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…