In [2]:
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

args = ClassificationTrainArguments(
    pretrained_model_name="beomi/kcbert-base", 
    downstream_corpus_name="nsmc", 
    downstream_corpus_root_dir="/home/key2317/main/BERT와GPT로 배우는 자연어처리/root/Korpora",
    downstream_model_dir="/home/key2317/main/BERT와GPT로 배우는 자연어처리/nlpbook/checkpoint-doccls",
    learning_rate=5e-5,
    batch_size=32,
)

데이터 내려받기

- 프리트레인을 이미 마친 모델을 다운스트림 데이터로 파인튜닝
- 파인튜닝을 위한 다운스트림 태스크용 데이터를 미리 내려받습니다.
- nsmc : 네이버 영화 리뷰 말뭉치


In [3]:
from Korpora import Korpora

Korpora.fetch(
    corpus_name=args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=True,
)

[nsmc] download ratings_train.txt: 14.6MB [00:00, 15.1MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 11.8MB/s]                            


Pre-trained 모델 준비

In [4]:
from transformers import BertConfig,BertForSequenceClassification

pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=2,
)
model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config=pretrained_model_config,
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

토크나이저 준비

- 문장을 토큰 시퀀스로 분석하는 과정을 토큰화라 합니다.
- 그리고 그 토큰화를 수행하는 프로그램을 토크나이저라고 합니다.
- kcbert-base 모델이 사용하는 토크나이저를 준비하는 코드입니다. 

In [5]:
from transformers import BertTokenizer 

tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case=False,
)

데이터 로더 준비하기

- Pytorch는 딥러닝 모델 학습을 지원하는 파이썬 라이브러리입니다.
- Pytorch에는 데이터 로더라는게 포함되어 있습니다. 파이토치로 딥러닝 모델을 만들려면 이 데이터로더를 반드시 정의해야 합니다.


- 데이터 로더

  - 데이터를 배치 단위로 모델에 밀어 넣어주는 역할
  - 전체 데이터 중 일부를 뽑아(sample) 배치를 구성합니다.
  - 데이터셋은 데이터 로더의 구성 요소 중 하나입니다. 데이터셋은 여러 인스턴스(문서+레이블)을 포함하고 있습니다. 
  - 데이터 로더가 배치를 만들 때 인스턴스를 뽑는 방식은, 파이토치 사용자가 자유롭게 정할 수 있습니다.

- 배치의 모양은 고정적인 경우가 많습니다. 
- 다시 말해, 동일한 배치에 있는 문장들의 토큰 개수는 서로 같아야 합니다.
- 따라서 배치가 3인 인풋에서 각각 토큰의 길이가 5,3,4라면, 3,4의 길이를 5와 맞추기 위해 0을 삽입하게 됩니다.
- 이 때 이 삽입된 0 token을 padding token이라고 합니다.

문서 분류 데이터 로더 선언

In [7]:
from ratsnlp import nlpbook

In [8]:
from torch.utils.data import DataLoader, RandomSampler
from ratsnlp.nlpbook.classification import NsmcCorpus,ClassificationDataset 

corpus = NsmcCorpus()

train_dataset = ClassificationDataset(
    args=args,
    corpus=corpus,
    tokenizer=tokenizer,
    mode="train",
)

train_dataloader = DataLoader(
    train_dataset, 
    batch_size = args.batch_size,
    sampler = RandomSampler(train_dataset,replacement=False), 
    collate_fn=nlpbook.data_collator,
    drop_last=False,
    num_workers=args.cpu_workers,
)

모델 학습하기

- Trainer : 파이토치 라이트닝에서 제공하는 객체입니다. 이 객체가 실제 학습을 수행합니다.
- 다음 코드는 문서 분류 모델을 학습하는 예시입니다. 태스크, 트레이너를 정의한 다음, 
- 앞서 준비한 데이터 로더를 가지고 fit()함수를 호출하면 학습을 시작합니다.

In [9]:
from ratsnlp.nlpbook.classification import ClassificationTask 

task = ClassificationTask(model,args)

trainer = nlpbook.get_trainer(args)

trainer.fit(
    task,
    train_dataloader=train_dataloader,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
2021-12-29 17:09:49.095283: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-29 17:09:49.095316: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/8
2021-12-29 17:09:55.663192: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-29 17:09:55.663225: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/8
202

Epoch 0:   0%|          | 0/586 [00:00<?, ?it/s] 

2021-12-29 17:10:50.563652: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-29 17:10:50.563654: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-29 17:10:50.563687: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-12-29 17:10:50.563687: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-12-29 17:10:50.594714: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or dire

ProcessRaisedException: 

-- Process 6 terminated with the following error:
Traceback (most recent call last):
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 172, in new_process
    results = trainer.run_stage()
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
    return self.run_train()
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 869, in run_train
    self.train_loop.run_training_epoch()
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 481, in run_training_epoch
    for batch_idx, (batch, is_last_batch) in train_dataloader:
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/profiler/profilers.py", line 112, in profile_iterable
    value = next(iterator)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 530, in prefetch_iterator
    last = next(it)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 464, in __next__
    return self.request_next_batch(self.loader_iters)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 449, in loader_iters
    self._loader_iters = self.create_loader_iters(self.loaders)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 495, in create_loader_iters
    return apply_to_collection(loaders, Iterable, iter, wrong_dtype=(Sequence, Mapping))
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/pytorch_lightning/utilities/apply_func.py", line 84, in apply_to_collection
    return function(data, *args, **kwargs)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 359, in __iter__
    return self._get_iterator()
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 305, in _get_iterator
    return _MultiProcessingDataLoaderIter(self)
  File "/home/key2317/anaconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 900, in __init__
    index_queue = multiprocessing_context.Queue()  # type: ignore[var-annotated]
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/context.py", line 103, in Queue
    return Queue(maxsize, ctx=self.get_context())
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/queues.py", line 42, in __init__
    self._rlock = ctx.Lock()
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/context.py", line 68, in Lock
    return Lock(ctx=self.get_context())
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 162, in __init__
    SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 57, in __init__
    sl = self._semlock = _multiprocessing.SemLock(
OSError: [Errno 28] No space left on device


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/key2317/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, 