<a href="https://colab.research.google.com/github/bcmin1018/NLP/blob/main/MRC/notebooks/QA_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive/ (Colab-only) so the files referenced by
# later cells under that path can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Work from the project folder on the mounted Drive so that relative references
# used later (the local `modules` package, ./train.zip, ./test.zip) resolve.
%cd "/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT"

/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT


In [3]:
from torch.utils.data import Dataset
from modules.utils import load_json
import numpy as np
import os
import torch

class QADataset(Dataset):
    """SQuAD-style extractive QA dataset that is fully tokenized at construction.

    The JSON file is expected to look like
    ``{'data': [{'paragraphs': [{'context': str, 'qas': [...]}]}]}`` where each
    ``qa`` has ``'question'`` and, depending on the mode, either
    ``'question_id'`` (test) or ``'answers'`` / ``'is_impossible'`` (train/val).

    Args:
        data_dir: Path handed to ``load_json``.
        tokenizer: Tokenizer called as ``tokenizer(contexts, questions, ...)``.
            Its output must provide ``char_to_token`` (fast tokenizers do).
        max_seq_len: ``max_length`` passed to the tokenizer (with truncation).
        mode: ``'train'`` (first ~90% of groups), ``'val'`` (last ~10% of
            groups) or ``'test'`` (all groups, no labels).
        debug: If true, only the first 100 groups of the selected split are used.

    Attributes:
        encodings: Tokenizer output; for train/val it also carries
            ``start_positions`` / ``end_positions``.
        answers: Answer dicts (train/val only).
        question_ids: Question ids aligned with the items (test only).
    """

    def __init__ (self, data_dir: str, tokenizer, max_seq_len: int, mode = 'train', debug = False):
        self.mode = mode
        self.data = load_json(data_dir)

        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.debug = debug
        if mode == 'test':
            self.encodings, self.question_ids = self.preprocess()
        else:
            self.encodings, self.answers = self.preprocess()

    def __len__(self):
        """Number of (context, question) examples."""
        return len(self.encodings.input_ids)

    def __getitem__(self, index: int):
        """Return every encoding field of example ``index`` as a tensor."""
        return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}

    def preprocess(self):
        """Tokenize the examples of the current split.

        Returns:
            ``(encodings, question_ids)`` in test mode, otherwise
            ``(encodings, answers)`` with token-level answer positions added
            to ``encodings``.
        """
        contexts, questions, answers, question_ids = self.read_squad()
        if self.mode == 'test':
            encodings = self.tokenizer(contexts, questions, truncation=True, max_length = self.max_seq_len, padding=True)
            return encodings, question_ids

        # Character-level end offsets must exist before they can be mapped to tokens.
        self.add_end_idx(answers, contexts)
        encodings = self.tokenizer(contexts, questions, truncation=True, max_length = self.max_seq_len, padding=True)
        self.add_token_positions(encodings, answers)
        return encodings, answers

    def read_squad(self):
        """Flatten the SQuAD-style JSON into parallel lists.

        The train/val split is taken over the top-level groups: the last
        ``int(n * 0.1)`` groups are validation, the rest are training. The
        split is computed on a local view, so ``self.data`` is left untouched
        and repeated calls return the same result.

        Returns:
            ``(contexts, questions, answers, question_ids)``. ``answers`` is
            empty in test mode; ``question_ids`` is empty in train/val mode.
        """
        contexts = []
        questions = []
        question_ids = []
        answers = []

        groups = self.data['data']

        # Use an explicit split index: slicing with ``-n`` breaks for n == 0
        # (``[:-0]`` is empty), which used to empty the training split for
        # files with fewer than 10 groups.
        n_val = int(len(groups) * 0.1)
        split_at = len(groups) - n_val
        if self.mode == 'train':
            groups = groups[:split_at]
        elif self.mode == 'val':
            groups = groups[split_at:]

        if self.debug:
            groups = groups[:100]

        for group in groups:
            for passage in group['paragraphs']:
                context = passage['context']
                for qa in passage['qas']:
                    question = qa['question']
                    if self.mode == 'test':
                        contexts.append(context)
                        questions.append(question)
                        question_ids.append(qa['question_id'])
                        continue

                    # train or val: one example per listed answer.
                    # NOTE(review): an unanswerable question whose 'answers'
                    # list is empty yields no example at all, because the
                    # sentinel below is only appended inside this loop.
                    for ans in qa['answers']:
                        contexts.append(context)
                        questions.append(question)
                        if qa['is_impossible']:
                            # Sentinel for "no answer"; see add_token_positions.
                            answers.append({'text':'','answer_start':-1})
                        else:
                            answers.append(ans)

        return contexts, questions, answers, question_ids

    def add_end_idx(self, answers, contexts):
        """Add the character end offset ``'answer_end'`` to each answer in place.

        If the annotated start is off by one or two characters, the nearest
        matching offset is used and ``'answer_start'`` is corrected as well.
        Answers that cannot be located are left without ``'answer_end'``.
        """
        for answer, context in zip(answers, contexts):
            gold_text = answer['text']
            start_idx = answer['answer_start']
            end_idx = start_idx + len(gold_text)

            if context[start_idx:end_idx] == gold_text:
                answer['answer_end'] = end_idx
                continue

            # The annotation may be off by 1-2 characters in either direction.
            for n in [1, 2]:
                # Guard the left shift: a negative start would wrap around.
                if start_idx - n >= 0 and context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
                    break
                if context[start_idx+n:end_idx+n] == gold_text:
                    answer['answer_start'] = start_idx + n
                    answer['answer_end'] = end_idx + n
                    break

    def add_token_positions(self, encodings, answers):
        """Convert character answer offsets to token positions.

        Adds ``'start_positions'`` and ``'end_positions'`` to ``encodings``.
        Requires ``encodings.char_to_token`` (fast tokenizer output).

        Raises:
            AssertionError: If an answerable example has no ``'answer_end'``.
        """
        # Position used when the answer was cut off by truncation.
        truncated_position = self.tokenizer.model_max_length

        start_positions = []
        end_positions = []
        for i, answer in enumerate(answers):
            if answer['answer_start'] == -1:
                # Unanswerable: label the span [0, 1), i.e. the first token.
                start_positions.append(0)
                end_positions.append(1)
                continue

            assert 'answer_end' in answer.keys(), f'no answer_end at {i}'

            start = encodings.char_to_token(i, answer['answer_start'])
            if start is None:
                # Answer passage was truncated away.
                start = truncated_position

            # The end offset may fall on a character with no token; step
            # backwards until one maps. Stop at offset 0 so the search
            # cannot run past the beginning of the context.
            end = encodings.char_to_token(i, answer['answer_end'])
            shift = 1
            while end is None and answer['answer_end'] - shift >= 0:
                end = encodings.char_to_token(i, answer['answer_end'] - shift)
                shift += 1
            if end is None:
                end = truncated_position

            start_positions.append(start)
            end_positions.append(end)

        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [4]:
from time import time
from tqdm import tqdm

class Trainer():
    """Runs one epoch of training or evaluation for an extractive QA model.

    The model is called with ``start_positions`` / ``end_positions`` and is
    expected to return an object exposing ``loss``, ``start_logits`` and
    ``end_logits``.

    Attributes:
        loss_sum: Sum of per-batch losses of the last ``train`` call.
        loss_mean: ``loss_sum`` divided by the number of batches.
        y: Decoded gold answer strings of the last ``train`` call.
        y_preds: Decoded predicted answer strings of the last ``train`` call.
        score_dict: Metrics of the last ``train`` call (key ``'accuracy'``).
        elapsed_time: Wall-clock seconds of the last ``train`` call.
    """

    def __init__(self,
                 model,
                 optimizer,
                 loss,
                 device,
                 tokenizer):

        self.model = model
        self.optimizer = optimizer
        # Stored for callers; the loss actually optimized is ``outputs.loss``.
        self.loss = loss
        self.device = device
        self.tokenizer = tokenizer

        # History
        self.loss_sum = 0  # Epoch loss sum
        self.loss_mean = 0 # Epoch loss mean
        self.y = list()
        self.y_preds = list()
        self.score_dict = dict()  # metric score
        self.elapsed_time = 0


    def train(self, mode, dataloader, tokenizer, epoch_index=0):
        """Run one pass over ``dataloader``.

        Args:
            mode: ``'train'`` updates the weights; any other value (e.g.
                ``'val'``) only evaluates.
            dataloader: Yields dicts with ``input_ids``, ``attention_mask``,
                ``start_positions`` and ``end_positions`` tensors.
            tokenizer: Unused; kept for backward compatibility. Decoding uses
                the tokenizer passed to the constructor.
            epoch_index: Unused; kept for backward compatibility.

        Side effects:
            Resets the history, then fills ``loss_sum``, ``loss_mean``, ``y``,
            ``y_preds``, ``score_dict['accuracy']`` and ``elapsed_time``.
            Prints the loss every 100 batches and the exact-match accuracy.
        """
        start_timestamp = time()

        # Start from a clean history so a previous pass (e.g. validation of
        # the previous epoch) cannot leak into this pass's loss and accuracy.
        self.clear_history()

        is_training = mode == 'train'
        if is_training:
            self.model.train()
        else:
            self.model.eval()

        for batch_index, batch in enumerate(tqdm(dataloader, leave=True)):
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            start_positions = batch['start_positions'].to(self.device)
            end_positions = batch['end_positions'].to(self.device)

            # No autograd graph is needed (or wanted) when only evaluating.
            with torch.set_grad_enabled(is_training):
                outputs = self.model(input_ids, attention_mask=attention_mask,
                                     start_positions=start_positions,
                                     end_positions=end_positions)

            loss = outputs.loss

            if is_training:
                # Use the optimizer this trainer was given, not a global one.
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            self.loss_sum += loss.item()

            # Most likely start/end token per example.
            start_idx = torch.argmax(outputs.start_logits, dim=1).cpu().tolist()
            end_idx = torch.argmax(outputs.end_logits, dim=1).cpu().tolist()
            gold_start = start_positions.cpu().tolist()
            gold_end = end_positions.cpu().tolist()

            # Decode predicted and gold spans to strings. A span whose start
            # lies after its end is an empty slice and decodes to ''.
            for i in range(len(input_ids)):
                self.y_preds.append(self.tokenizer.decode(input_ids[i][start_idx[i]:end_idx[i]]))
                self.y.append(self.tokenizer.decode(input_ids[i][gold_start[i]:gold_end[i]]))

            # Logging
            if batch_index % 100 == 0:
                msg = f"batch: {batch_index}/{len(dataloader)} loss: {loss.item()}"
                print(msg)

        # Epoch history
        self.loss_mean = self.loss_sum / len(dataloader)  # Epoch loss mean

        # Metric: fraction of examples whose decoded prediction equals the
        # decoded gold answer exactly.
        from sklearn.metrics import accuracy_score
        exact_match = accuracy_score(self.y, self.y_preds)
        self.score_dict['accuracy'] = exact_match
        print(exact_match)

        # Elapsed time
        end_timestamp = time()
        self.elapsed_time = end_timestamp - start_timestamp

    def clear_history(self):
        """Reset all per-pass history fields to their initial values."""
        self.loss_sum = 0
        self.loss_mean = 0
        self.y_preds = list()
        self.y = list()
        self.score_dict = dict()
        self.elapsed_time = 0

In [5]:
import numpy as np
import logging

class EarlyStopper():
    """Tracks a monitored score and signals when it stops improving.

    Args:
        patience: Number of consecutive non-improving checks after which
            ``stop`` becomes True.
        mode: ``'max'`` if larger scores are better; any other value treats
            smaller scores as better.

    Attributes:
        patience_counter: Consecutive non-improving checks so far.
        stop: True once ``patience_counter`` has reached ``patience``.
        best_loss: Best score seen so far (stored negated in ``'max'`` mode).
    """

    def __init__(self, patience: int, mode:str):
        self.patience = patience
        self.mode = mode

        # Initiate
        self.patience_counter = 0
        self.stop = False
        self.best_loss = np.inf

        msg = f"Initiated early stopper, mode: {self.mode}, best score: {self.best_loss}, patience: {self.patience}"
        print(msg)

    def check_early_stopping(self, loss: float)-> None:
        """Record one score and update ``patience_counter`` / ``stop``.

        A score equal to the best counts as an improvement. A NaN score is
        treated as non-improving, so a diverged run still runs out of patience.
        """
        # Negate in 'max' mode so that "smaller is better" holds below.
        loss = -loss if self.mode == 'max' else loss

        if loss <= self.best_loss:
            # got better (or equal) score
            self.patience_counter = 0
            print(f"Early stopper, counter {self.patience_counter}/{self.patience}, best:{abs(self.best_loss)} -> now:{abs(loss)}")
            print(f"Set counter as {self.patience_counter}")
            print(f"Update best score as {abs(loss)}")
            self.best_loss = loss
            return

        # got worse score (NaN also lands here: every comparison with it is False)
        self.patience_counter += 1

        msg = f"Early stopper, counter {self.patience_counter}/{self.patience}, best:{abs(self.best_loss)} -> now:{abs(loss)}"
        print(msg)

        # ">=" rather than "==" so the stopper still fires if the counter
        # ever starts above the threshold (e.g. patience of 0).
        if self.patience_counter >= self.patience:
            msg = "Early stopper, stop"
            print(msg)
            self.stop = True  # end

In [6]:
# config.py
# Run configuration: DataLoader keyword arguments and training hyperparameters.
config = {
    'DATALOADER': {
      'batch_size': 32,
      'num_workers': 0,
      'shuffle': True,
      'pin_memory': True,
      'drop_last': False
    },
    'TRAINER': {
        'optimizer' : 'adamw',
        # Stored as a number (not the string '5.0e-4') so it can be passed
        # straight to an optimizer.
        'learning_rate' : 5.0e-4,
        'loss': 'crossentropy',
        'seed': 2022,
        'n_epochs': 1,
        'early_stopping_target': 'val_loss',
        'early_stopping_patience': 10,
        'early_stopping_mode': 'min'
    }
}

In [7]:
from datetime import datetime, timezone, timedelta
# Project root on the mounted Drive.
PROJECT_DIR = "/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT"

# Each run gets its own results folder named after the start time at UTC+9.
kst = timezone(timedelta(hours=9))
train_serial = format(datetime.now(tz=kst), "%Y%m%d_%H%M%S")
RECORDER_DIR = os.path.join(PROJECT_DIR, 'results', 'train', train_serial)

# Create the folder up front; an already existing folder is not an error.
os.makedirs(RECORDER_DIR, exist_ok=True)

In [8]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.7 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [9]:
# Seed every random number generator used in the notebook from one config value.
import random
_seed = config['TRAINER']['seed']
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(_seed)

# Ask cuDNN for deterministic kernels and turn off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# get tokenizer
from transformers import ElectraTokenizerFast
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-small-v3-discriminator")

# load dataset
# !gdown --id 1n74_kfEjrjkHYsUr1CugGgzxL7ns28gV
# !gdown --id 1lMszENg5tEyeTnm2XR0X0876HKltWlLk
# !gdown --id 14FUap4xX_XMfq1T4RS3nJXeomQiDm4Nu
# !gdown --id 1ygUD3YA9goXs20h6gvmrxkBjMzJlCQjR
!unzip -qq "./train.zip"
!unzip -qq "./test.zip"

train_data = "/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/train.json"
valid_data = "/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/train.json"
test_data = "/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/test.json"
submission = "/content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/sample_submission.csv"

# data loader
train_dataset = QADataset(data_dir=train_data, tokenizer = tokenizer, max_seq_len = 512, mode = 'train')
valid_dataset = QADataset(data_dir=valid_data, tokenizer = tokenizer, max_seq_len = 512, mode = 'train')

from torch.utils.data import DataLoader
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=config['DATALOADER']['batch_size'],
                              num_workers=config['DATALOADER']['num_workers'], 
                              shuffle=config['DATALOADER']['shuffle'],
                              pin_memory=config['DATALOADER']['pin_memory'],
                              drop_last=config['DATALOADER']['drop_last'])
val_dataloader = DataLoader(dataset=valid_dataset,
                            batch_size=config['DATALOADER']['batch_size'],
                            num_workers=config['DATALOADER']['num_workers'], 
                            shuffle=False,
                            pin_memory=config['DATALOADER']['pin_memory'],
                            drop_last=config['DATALOADER']['drop_last'])

# set model: pretrained KoELECTRA encoder with a span-prediction (QA) head
from transformers import ElectraForQuestionAnswering
model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

# set trainer
import torch.optim as optim
# Take the learning rate from the config instead of repeating the literal here;
# float() accepts both a numeric value and a string such as '5.0e-4'.
optimizer = optim.AdamW(params=model.parameters(),
                        lr=float(config['TRAINER']['learning_rate']))

from torch.nn import functional as F
loss = F.cross_entropy

# early stopper
early_stopper = EarlyStopper(patience=config['TRAINER']['early_stopping_patience'],
                             mode=config['TRAINER']['early_stopping_mode'])

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  loss=loss,
                  device=device,
                  tokenizer=tokenizer,)

Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/458 [00:00<?, ?B/s]

Downloading...
From: https://drive.google.com/uc?id=1n74_kfEjrjkHYsUr1CugGgzxL7ns28gV
To: /content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/train.zip
100% 17.4M/17.4M [00:00<00:00, 163MB/s]
Downloading...
From: https://drive.google.com/uc?id=1lMszENg5tEyeTnm2XR0X0876HKltWlLk
To: /content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/test.zip
100% 844k/844k [00:00<00:00, 109MB/s]
Downloading...
From: https://drive.google.com/uc?id=14FUap4xX_XMfq1T4RS3nJXeomQiDm4Nu
To: /content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/sample.json
100% 28.1k/28.1k [00:00<00:00, 46.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ygUD3YA9goXs20h6gvmrxkBjMzJlCQjR
To: /content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/sample_submission.csv
100% 27.7k/27.7k [00:00<00:00, 34.5MB/s]
replace train.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace test.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y


Downloading:   0%|          | 0.00/54.0M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['qa_outputs.bias'

Initiated early stopper, mode: min, best score: inf, patience: 10


In [10]:
# Train
n_epochs = config['TRAINER']['n_epochs']
weight_path = os.path.join(RECORDER_DIR, 'model.pt')

for epoch_index in range(n_epochs):
    print(f"Train {epoch_index}/{n_epochs}")
    trainer.train(dataloader=train_dataloader, epoch_index=epoch_index, tokenizer=tokenizer, mode='train')
    print("train loss_mean {} / epoch {}".format(trainer.loss_mean , epoch_index + 1))
    print("train elapsed_time : ", trainer.elapsed_time)
    trainer.clear_history()

    print(f"Val {epoch_index}/{n_epochs}")
    trainer.train(dataloader=val_dataloader, epoch_index=epoch_index, tokenizer=tokenizer, mode='val')
    val_loss_mean = trainer.loss_mean
    print("valid loss_mean {} / epoch {}".format(val_loss_mean , epoch_index + 1))
    print("valid elapsed_time : ", trainer.elapsed_time)
    # Reset here as well, otherwise the next epoch's training statistics
    # would start from the validation totals.
    trainer.clear_history()

    # Only the validation loss is monitored; the configured target name is
    # read but not otherwise used.
    early_stopping_target = config['TRAINER']['early_stopping_target']
    early_stopper.check_early_stopping(loss=val_loss_mean)

    # Save only when this epoch improved the monitored loss. state_dict()
    # refers to the live weights, so saving on every epoch would overwrite
    # the best checkpoint with whatever the model currently holds.
    if early_stopper.patience_counter == 0:
        check_point = {
            'epoch': epoch_index + 1,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(check_point, weight_path)
        print("Recorder, epoch {} Model saved: {}".format(epoch_index + 1, weight_path))

    # Leave the loop only when the early stopper asks for it.
    if early_stopper.stop == True:
        print(f"Early stopped, counter {early_stopper.patience_counter}/{config['TRAINER']['early_stopping_patience']}")
        break

Train 0/1


  0%|          | 1/790 [00:07<1:41:22,  7.71s/it]

batch: 0/790 loss: 5.960369110107422


 13%|█▎        | 101/790 [00:38<03:25,  3.36it/s]

batch: 100/790 loss: 2.648710250854492


 25%|██▌       | 201/790 [01:09<03:12,  3.06it/s]

batch: 200/790 loss: 1.4212837219238281


 38%|███▊      | 301/790 [01:39<02:26,  3.34it/s]

batch: 300/790 loss: 0.9766261577606201


 51%|█████     | 401/790 [02:09<01:55,  3.37it/s]

batch: 400/790 loss: 1.3423991203308105


 63%|██████▎   | 501/790 [02:39<01:25,  3.36it/s]

batch: 500/790 loss: 1.2833640575408936


 76%|███████▌  | 601/790 [03:09<00:56,  3.35it/s]

batch: 600/790 loss: 1.1602027416229248


 89%|████████▊ | 701/790 [03:39<00:26,  3.38it/s]

batch: 700/790 loss: 1.214773178100586


100%|██████████| 790/790 [04:05<00:00,  3.22it/s]


0.5465917187871111
train loss_mean 1.47125456499148 / epoch 1 : 
train elapsed_time :  246.375235080719
Val 0/1


  0%|          | 1/790 [00:00<01:33,  8.40it/s]

batch: 0/790 loss: 0.8122074604034424


 13%|█▎        | 102/790 [00:11<01:20,  8.50it/s]

batch: 100/790 loss: 0.5505495071411133


 26%|██▌       | 202/790 [00:23<01:09,  8.50it/s]

batch: 200/790 loss: 0.3543239235877991


 38%|███▊      | 302/790 [00:35<00:57,  8.51it/s]

batch: 300/790 loss: 0.9879108667373657


 51%|█████     | 402/790 [00:47<00:45,  8.54it/s]

batch: 400/790 loss: 0.3176060616970062


 64%|██████▎   | 502/790 [00:59<00:33,  8.49it/s]

batch: 500/790 loss: 0.3544155955314636


 76%|███████▌  | 602/790 [01:11<00:22,  8.53it/s]

batch: 600/790 loss: 0.5128646492958069


 89%|████████▉ | 702/790 [01:22<00:10,  8.51it/s]

batch: 700/790 loss: 0.9465636610984802


100%|██████████| 790/790 [01:33<00:00,  8.49it/s]


0.7672393318027076
valid loss_mean 0.7027491528022138 / epoch 1 : 
valid elapsed_time :  93.32425284385681
Early stopper, counter 0/10, best:inf -> now:0.7027491528022138
Set counter as 0
Update best score as 0.7027491528022138
Recorder, epoch 1 Model saved: /content/drive/MyDrive/Colab Notebooks/QA/QA_KOBERT/results/train/20220703_152819/model.pt


In [12]:
# Predict
# Build the test dataset. In 'test' mode QADataset keeps the question ids
# (test_dataset.question_ids) instead of answer labels.
test_dataset = QADataset(data_dir=test_data, tokenizer = tokenizer, max_seq_len = 512, mode = 'test')

In [13]:
# Question ids collected alongside the test examples; used below to match each
# prediction to its row in the submission table.
question_ids = test_dataset.question_ids

In [14]:
# Same loader settings as the configuration, except that the test set is never
# shuffled: predictions are matched to question ids by position.
_test_loader_keys = ('batch_size', 'num_workers', 'pin_memory', 'drop_last')
_test_loader_kwargs = {key: config['DATALOADER'][key] for key in _test_loader_keys}
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, **_test_loader_kwargs)

In [15]:
# Rebuild the architecture, then restore the fine-tuned weights saved by the
# training loop.
model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)
# map_location moves the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only runtime.
checkpoint = torch.load(os.path.join(RECORDER_DIR, 'model.pt'), map_location=device)
model.load_state_dict(checkpoint['model'])

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['qa_outputs.bias'

<All keys matched successfully>

In [16]:
import pandas as pd

# Put the model in evaluation mode before predicting.
model.eval()

# The sample submission is the table that the predictions are written into.
pred_df = pd.read_csv(submission)

In [19]:
# Predict an answer string for every test example, in dataset order.
y_pred = []
# Inference only: no autograd graph is needed.
with torch.no_grad():
    for batch in tqdm(test_dataloader, leave=True):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Inference
        outputs = model(input_ids, attention_mask=attention_mask)

        # Most likely start/end token per example.
        start_idx = torch.argmax(outputs.start_logits, dim=1).cpu().tolist()
        end_idx = torch.argmax(outputs.end_logits, dim=1).cpu().tolist()

        for i in range(len(input_ids)):
            if start_idx[i] > end_idx[i]:
                # Start after end is not a valid span: predict "no answer".
                y_pred.append('')
                continue

            ans_txt = tokenizer.decode(input_ids[i][start_idx[i]:end_idx[i]]).replace('#','')

            # The first-token span is the training label for unanswerable
            # questions, so it maps to an empty answer.
            if ans_txt == '[CLS]':
                ans_txt = ''

            y_pred.append(ans_txt)

# The loader is not shuffled, so predictions line up with question_ids by position.
for q_id, pred in zip(question_ids, y_pred):
    pred_df.loc[pred_df['question_id'] == q_id, 'answer_text'] = pred

# Write the submission once, after all predictions are in place.
pred_df.to_csv(os.path.join(RECORDER_DIR, 'prediction.csv'), index=False)

100%|██████████| 51/51 [00:06<00:00,  7.81it/s]
