필요한 라이브러리 설치와 임포트

In [1]:
# !pip install sentencepiece
# !pip install mxnet
# !pip install gluonnlp==0.8.0
# !pip install tqdm pandas
# !pip install torch
# !pip install sentencepiece
# !pip install transformers



In [2]:
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-fdeswrqf/kobert-tokenizer_93f7dd0a3ec7402a89e374cbefd5a3f3
  Running command git clone -q https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-fdeswrqf/kobert-tokenizer_93f7dd0a3ec7402a89e374cbefd5a3f3
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3


In [3]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [5]:
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import glob
import os



In [6]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device_type = 'cuda'
else:
    device_type = 'cpu'
device = torch.device(device_type)

In [7]:
# Display the current working directory of the notebook kernel.
os.getcwd()

'/home2/jh981017/myubai/machinelearning'

각 폴더에서 텍스트 저장한 엑셀파일 위치 가져오기

In [8]:
# One path per entry found directly under the NaverNews folder (one folder per news section).
section_folders = glob.glob('/home2/jh981017/myubai/NaverNews/*')
section_folders

['/home2/jh981017/myubai/NaverNews/economy',
 '/home2/jh981017/myubai/NaverNews/life',
 '/home2/jh981017/myubai/NaverNews/politics',
 '/home2/jh981017/myubai/NaverNews/science',
 '/home2/jh981017/myubai/NaverNews/society',
 '/home2/jh981017/myubai/NaverNews/world']

In [9]:
# Section names are the entry names found directly under the data root.
root = '/home2/jh981017/myubai/NaverNews'
sections = os.listdir(root)
sections

['economy', 'life', 'politics', 'science', 'society', 'world']

In [10]:
# Excel file of every section: <root>/<section>/<section>text1.xlsx.
# Each path is derived from the section name itself, so text_paths[i] always
# belongs to sections[i]; the previous zip(section_folders, sections) silently
# relied on glob.glob and os.listdir returning entries in the same order.
text_paths = [
    os.path.join(root, section, section + 'text1' + '.xlsx')
    for section in sections
]

text_paths

['/home2/jh981017/myubai/NaverNews/economy/economytext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/life/lifetext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/politics/politicstext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/science/sciencetext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/society/societytext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/world/worldtext1.xlsx']

각 섹션별 인덱스로 분석에 사용할 텍스트 가져오기, label - y 페어 만들기

In [11]:
# Map each section name to the list of values in the 'idx' column of its Excel sheet.
idx_dictionary = {
    section: list(pd.read_excel(text_path)['idx'])
    for section, text_path in zip(sections, text_paths)
}

In [12]:
# Number of article indices available for the 'economy' section.
len(idx_dictionary['economy'])

1888

In [13]:
# Seed NumPy's global RNG so the per-section samples below are reproducible.
np.random.seed(602)

# For every section draw 1200 distinct indices (no replacement) for
# cross-validation and keep them in ascending order.
cv_idx_dictionary = {
    section: sorted(np.random.choice(idx_dictionary[section], size=1200, replace=False))
    for section in sections
}

In [14]:
# Number of indices sampled for cross-validation in the 'economy' section.
len(cv_idx_dictionary['economy'])

1200

In [15]:
# Indices that were NOT sampled for cross-validation become the held-out
# "new" data of each section (original order of idx_dictionary is kept).
new_idx_dictionary = {}

for section in sections:
    # Build the lookup set once per section: membership tests against the
    # sampled list itself would be O(len(list)) for every candidate index.
    sampled = set(cv_idx_dictionary[section])
    new_idx_dictionary[section] = [i for i in idx_dictionary[section] if i not in sampled]

In [16]:
# Number of held-out (non cross-validation) indices in the 'economy' section.
len(new_idx_dictionary['economy'])

688

In [17]:
# label_to_y = {section : idx for idx, section in enumerate(sections)}
# label_to_y

In [18]:
# Fixed section-name -> class-id mapping; the class id is the position of the
# section name in the tuple below.
label_to_y = {
    name: class_id
    for class_id, name in enumerate(
        ('politics', 'society', 'science', 'life', 'world', 'economy')
    )
}
label_to_y

{'politics': 0,
 'society': 1,
 'science': 2,
 'life': 3,
 'world': 4,
 'economy': 5}

전체 데이터 : 제목 + 본문 텍스트 (X), 라벨과 일대일대응된 숫자(y)

In [19]:
# Build [idx, "title body", y] records for the cross-validation set (cv_data)
# and for the held-out set (new_data), section by section.
cv_data = []
new_data = []

for section_folder in section_folders:
    section = os.path.basename(section_folder)
    section_textpath = section_folder + '/' + section + 'text1' + '.xlsx'  # the cleaned text file is used

    section_texts = pd.read_excel(section_textpath)

    # X: title and body joined by a single space, one entry per sheet row
    articles = section_texts['title'] + ' ' + section_texts['body']

    # y
    y = label_to_y[section]

    for indices, target in ((cv_idx_dictionary[section], cv_data),
                            (new_idx_dictionary[section], new_data)):
        condition = section_texts['idx'].isin(indices)
        # Take idx and text from the SAME filtered rows. Zipping the index list
        # with the filtered articles (as before) pairs them correctly only if
        # the sheet rows happen to be in the same order as the index list.
        for idx, article in zip(section_texts.loc[condition, 'idx'], articles[condition]):
            target.append([idx, article, y])

In [20]:
# Total number of cross-validation records over all sections.
len(cv_data)

7200

In [21]:
# Total number of held-out records over all sections.
len(new_data)

5720

In [22]:
# Tabular view of cv_data; columns 0, 1, 2 hold idx, article text and label id.
df_cv_data = pd.DataFrame(cv_data)
df_cv_data

Unnamed: 0,0,1,2
0,3,5대은행 가계대출 한달새 3조5000억 늘어...올 들어 최대 폭 5...,5
1,6,"빗썸 부리또 월렛, GBIC 해커톤 대회 성료 [서울=뉴시스]이지영 기자 = 빗썸 ...",5
2,9,"농협은행, 연합 플로깅 캠페인으로 ‘ESG’ 실천 NH농협은행(은행장 이석용)은 ...",5
3,13,"하나금융, 전 임직원 '모두하나데이'로 ESG 실천 하나금융그룹은 1일 하나금융그...",5
4,16,윤 대통령이 질타한 '은행 성과급 잔치'는 없었다 최근 수년간 대출 금리 인상 등으...,5
...,...,...,...
7195,1910,"미국서 2주 이상 실업수당 청구한 사람, 2년 만에 최대 16일(현지...",4
7196,1912,"이스라엘, 알시파 병원에 외신 불러… ""버려진 노트북에 인질 영상"" 주장 팔레스타인...",4
7197,1913,"러 매체, “‘푸틴 신뢰한다’는 한국 청년, 러시아군 자원 입대” 러시아 매체 ‘A...",4
7198,1914,"클린스만호, 싱가포르에 5-0 대승...손흥민 이강인 황희찬 등 공격진 모두 골골골...",4


In [23]:
# Tabular view of new_data; columns 0, 1, 2 hold idx, article text and label id.
df_new_data = pd.DataFrame(new_data)
df_new_data

Unnamed: 0,0,1,2
0,10,'살아난 비트코인' 코인 시장 자금 수급 이끌자…디파이·NFT장도 '꿈틀' (서울=...,5
1,15,부산디지털자산거래소 사업자 선정 돌입 부산시가 ‘부산디지털자산거래소’(BDX)의 설...,5
2,19,“비트코인 2년 내 2억 원 도달…현재 가격의 4배 폭등” 최근 상승세를 보이는 가...,5
3,21,"국민연금, 카카오 보유목적 ‘일반투자’로…주주권 행사 나서나 [이코노미스트 마켓in...",5
4,27,배터리 수장 '권영수부회장·지동섭대표'에 쏠린 시선…발언보니 국내 배터리 업계 최고...,5
...,...,...,...
5715,1909,"러시아, 미국 수교 90주년에 ""언제든 관계 끊길 수도"" 경고 러시아가 미국과 수교...",4
5716,1911,"동료가 준 생일선물, '폭탄'이었다…우크라군 총사령관 참모 사망 [서울경제] 발레리...",4
5717,1915,"""뉴스? '틱톡'으로 봐요""…미국인들 이용률 3년 새 '22％→43％' [서울경제]...",4
5718,1916,"한 알 먹고 ""몸이 왜 이러지?"" 병원행…日 '발칵' 뒤집은 이 젤리 16일 일본 ...",4


KoBERT 불러오기

In [24]:
class BERTSentenceTransform:
    r"""BERT style data transformation.

    Parameters
    ----------
    tokenizer : object
        Tokenizer for the sentences. It must provide ``tokenize(text)`` and
        ``convert_tokens_to_ids(tokens)``.
    max_seq_length : int.
        Maximum sequence length of the sentences.
    vocab : object
        Vocabulary providing the ``cls_token``, ``sep_token`` and
        ``padding_token`` attributes and ``vocab[token]`` id lookup.
    pad : bool, default True
        Whether to pad the sentences to maximum length.
    pair : bool, default True
        Whether to transform sentences or sentence pairs.
    """

    def __init__(self, tokenizer, max_seq_length,vocab, pad=True, pair=True):
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pair = pair
        self._vocab = vocab

    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
        sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 2 strings:
        text_a, text_b.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14

        For single sequences, the input is a tuple of single string:
        text_a.

        Inputs:
            text_a: 'the dog is hairy .'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 2 strings:
            (text_a, text_b). For single sequences, the input is a tuple of single
            string: (text_a,).

        Returns
        -------
        np.array: input token ids in 'int32', one entry per token
            (padded up to max_seq_length when ``pad`` is True)
        np.array: valid length in 'int32', a 0-d array holding the number of
            non-padding tokens
        np.array: input token type ids in 'int32', same length as the token ids

        """

        text_a = line[0]
        text_b = None
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            # Use .tokenize() exactly like for text_a so that both halves are
            # plain token lists (calling the tokenizer object directly is not
            # the same operation).
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        vocab = self._vocab
        tokens = []
        tokens.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append(vocab.sep_token)
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The valid length of sentences. Only real  tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Zero-pad up to the sequence length.
            padding_length = self._max_seq_length - valid_length
            # use padding tokens for the rest
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a token-list pair in place until it has at most `max_length` tokens.

        One token at a time is removed from the end of the currently longer
        list (from `tokens_b` on ties), so the longer sequence is shortened first.
        """
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

In [25]:
class BERTDataset(Dataset):
    """Dataset that pre-tokenizes every record once, at construction time.

    Each item is the tuple produced by ``BERTSentenceTransform`` for the
    record's sentence with the record's label (as ``np.int32``) appended.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        # `sent_idx` / `label_idx` select the text and the label inside a record.
        to_features = BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )
        self.sentences = [to_features([record[sent_idx]]) for record in dataset]
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, i):
        features = self.sentences[i]
        return (*features, self.labels[i])

    def __len__(self):
        return len(self.labels)

In [26]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [27]:
# Load the KoBERT tokenizer from the 'skt/kobert-base-v1' checkpoint.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [28]:
# get_kobert_model as declared in the official KoBERT git repository
def get_kobert_model(model_path, vocab_file, ctx="cpu"):
    """Load a pretrained BERT model in eval mode together with its vocabulary.

    Parameters
    ----------
    model_path : passed to ``BertModel.from_pretrained``.
    vocab_file : SentencePiece file passed to ``BERTVocab.from_sentencepiece``.
    ctx : device string given to ``torch.device``; the model is moved there.

    Returns
    -------
    tuple of (BertModel, BERTVocab); the vocabulary uses '[PAD]' as padding token.
    """
    bertmodel = BertModel.from_pretrained(model_path)
    # Module.to() and Module.eval() both return the module itself.
    bertmodel.to(torch.device(ctx)).eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(
        vocab_file, padding_token='[PAD]'
    )
    return bertmodel, vocab_b_obj


In [29]:
from transformers import BertModel
# Load the pretrained KoBERT encoder (get_kobert_model's default ctx is "cpu") and its vocabulary.
bertmodel, vocab = get_kobert_model('skt/kobert-base-v1', tokenizer.vocab_file)
# gluonnlp SentencePiece tokenizer wrapper; `tok` is not referenced again in the visible notebook.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [30]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Parameters
    ----------
    bert : module called as ``bert(input_ids=..., token_type_ids=...,
        attention_mask=..., return_dict=False)`` and returning a pair whose
        second element is the pooled output.
    hidden_size : size of the pooled output fed to the linear layer.
    num_classes : number of output classes (6 news sections by default).
    dr_rate : dropout probability applied to the pooled output; no dropout
        layer is created or applied when it is None or 0.
    params : unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 6,   # 6 classes
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like `token_ids`: 1 for the first
        `valid_length[i]` positions of row i, 0 for the remaining positions."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits, shape (rows of `token_ids`, num_classes)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict = False)
        # Without dropout the pooled output goes straight to the classifier.
        # Previously `out` was only assigned inside the dropout branch, so the
        # default dr_rate=None raised UnboundLocalError on the return line.
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [31]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals Y.

    X holds one row of class scores per sample and Y the matching class ids.
    """
    predicted = torch.max(X, 1).indices
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / predicted.size()[0]

Stratified K-fold CV

In [32]:
import torch, gc

In [33]:
# Inspect the first cross-validation record: [idx, title + body text, label id].
cv_data[0]

[3,
 '5대은행 가계대출 한달새 3조5000억 늘어...올 들어 최대 폭         5대 시중은행 가계대출이 10월 한 달간 3조5000억원가량 늘었다. 대출금리가 오르고 있음에도 주택담보대출(주담대)을 중심으로 가계대출 증가폭이 더 커진 모습이다.1일 금융권에 따르면 국민·신한·하나·우리·NH농협 등 국내 5대 은행의 10월 말 가계대출 잔액은 685조7820억원으로 전월(682조3294억원)대비 3조4526억원 늘었다. 가계대출이 증가세로 돌아선 지난 5월 이후 가장 큰 폭이다. 가계대출은 5월 1431억원 6월 6332억원 7월 9754억원 8월 1조5912억워 9월 1조5174억원 증가해왔다.주담대가 크게 늘어난 점이 영향을 미쳤다. 5대 은행의 주담대는 지난달 말 520조9861억원으로 전월(517조858억원)대비 3조1273억원 늘었다. 이날 기준 5대 은행의 주담대 변동금리는 연 4.55~7.18%로 상단이 7%를 넘어섰다. 고정금리도 연 4.39~6.72%로 상단이 7%에 근접한 상태다.2021년 11월 이후 1년 10개월째 감소하던 신용대출도 증가 전환했다. 지난달 말 신용대출 잔액은 107조9490억원으로 전월(107조3409억원)대비 6081억원 늘었다. 전세자금 대출은 10월 말 121조6992억원으로 전월보다 4764억원 줄었다.',
 5]

In [34]:
# Display the cross-validation table again before splitting it into folds.
df_cv_data

Unnamed: 0,0,1,2
0,3,5대은행 가계대출 한달새 3조5000억 늘어...올 들어 최대 폭 5...,5
1,6,"빗썸 부리또 월렛, GBIC 해커톤 대회 성료 [서울=뉴시스]이지영 기자 = 빗썸 ...",5
2,9,"농협은행, 연합 플로깅 캠페인으로 ‘ESG’ 실천 NH농협은행(은행장 이석용)은 ...",5
3,13,"하나금융, 전 임직원 '모두하나데이'로 ESG 실천 하나금융그룹은 1일 하나금융그...",5
4,16,윤 대통령이 질타한 '은행 성과급 잔치'는 없었다 최근 수년간 대출 금리 인상 등으...,5
...,...,...,...
7195,1910,"미국서 2주 이상 실업수당 청구한 사람, 2년 만에 최대 16일(현지...",4
7196,1912,"이스라엘, 알시파 병원에 외신 불러… ""버려진 노트북에 인질 영상"" 주장 팔레스타인...",4
7197,1913,"러 매체, “‘푸틴 신뢰한다’는 한국 청년, 러시아군 자원 입대” 러시아 매체 ‘A...",4
7198,1914,"클린스만호, 싱가포르에 5-0 대승...손흥민 이강인 황희찬 등 공격진 모두 골골골...",4


In [35]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 602)

In [36]:
# Rebind `device` (already set in an earlier cell), naming the first GPU explicitly when CUDA is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [40]:
# Maximum token sequence length given to BERTDataset and mini-batch size given to DataLoader.
max_len = 140
batch_size = 8
# NOTE: 130 gave roughly 0.7 (presumably accuracy with max_len = 130 — TODO confirm)

In [41]:
# Materialise the (train, test) row-position arrays of every fold once,
# stratified on the label column (2) of df_cv_data.
fold_splits = list(cv.split(df_cv_data[1], df_cv_data[2]))

list_train_idx = [train_part for train_part, _ in fold_splits]
list_test_idx = [test_part for _, test_part in fold_splits]

In [42]:
# Stratified K-fold cross-validation: fine-tune a fresh KoBERT classifier on
# every fold, record the per-epoch test accuracy and save the final weights.
list_test_history = []
fold = 0

# Hyper-parameters are the same for every fold, so they are set once.
warmup_ratio = 0.1
num_epochs = 10    ### adjust the number of epochs here ###
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

for train_idx, test_idx in zip(list_train_idx, list_test_idx):
    fold += 1

    # Start each fold from the pretrained weights so folds stay independent.
    bertmodel, vocab = get_kobert_model('skt/kobert-base-v1', tokenizer.vocab_file)
    model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

    train_data = [cv_data[i] for i in train_idx]
    test_data = [cv_data[i] for i in test_idx]

    train_dataset = BERTDataset(dataset=train_data, sent_idx=1, label_idx=2, bert_tokenizer=tokenizer, vocab=vocab, max_len=max_len, pad=True, pair=False)
    test_dataset = BERTDataset(dataset=test_data, sent_idx=1, label_idx=2, bert_tokenizer=tokenizer, vocab=vocab, max_len=max_len, pad=True, pair=False)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation does not depend on sample order, so no shuffling is needed.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # No weight decay for biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()  # loss function for multi-class classification

    # The scheduler is stepped once per batch, so the horizon is batches * epochs.
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

    train_history = []
    test_history = []
    loss_history = []

    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            # valid_length is passed as delivered by the DataLoader; the model
            # only iterates over it to build the attention mask.
            out = model(token_ids, valid_length, segment_ids)

            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                batch_loss = loss.item()
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, batch_loss, train_acc / (batch_id+1)))
                train_history.append(train_acc / (batch_id+1))
                loss_history.append(batch_loss)
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

        # .eval() switches layers such as Dropout / BatchNorm to inference
        # behaviour; torch.no_grad() additionally stops autograd from recording
        # the forward pass, which evaluation never back-propagates through.
        model.eval()
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                out = model(token_ids, valid_length, segment_ids)
                test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
        test_history.append(test_acc / (batch_id+1))

    # Save the model weights of this fold (state after the last epoch).
    torch.save(model.state_dict(), f'/home2/a301v/myubai/Machine Learning/Model Weights/ArticleWeight{fold}.pth')

    list_test_history.append(test_history)

    # Release this fold's model and optimizer state before the next fold is
    # built, so two full models are never resident at the same time.
    del model, bertmodel, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()



  0%|          | 0/720 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.6039785146713257 train acc 0.375
epoch 1 batch id 201 loss 1.578283667564392 train acc 0.2599502487562189
epoch 1 batch id 401 loss 0.9938846230506897 train acc 0.432356608478803
epoch 1 batch id 601 loss 1.2086772918701172 train acc 0.5185108153078203
epoch 1 train acc 0.5460069444444444


  0%|          | 0/180 [00:00<?, ?it/s]

epoch 1 test acc 0.6638888888888889


  0%|          | 0/720 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.3626064956188202 train acc 0.875
epoch 2 batch id 201 loss 1.4289010763168335 train acc 0.7232587064676617
epoch 2 batch id 401 loss 1.004494547843933 train acc 0.7175810473815462
epoch 2 batch id 601 loss 1.1433840990066528 train acc 0.7196339434276207
epoch 2 train acc 0.7201388888888889


  0%|          | 0/180 [00:00<?, ?it/s]

epoch 2 test acc 0.7222222222222222


  0%|          | 0/720 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.9529440402984619 train acc 0.625
epoch 3 batch id 201 loss 0.4082198739051819 train acc 0.7842039800995025
epoch 3 batch id 401 loss 0.14625976979732513 train acc 0.7849127182044888
epoch 3 batch id 601 loss 0.670361340045929 train acc 0.7816139767054908
epoch 3 train acc 0.7789930555555555


  0%|          | 0/180 [00:00<?, ?it/s]

epoch 3 test acc 0.7277777777777777


  0%|          | 0/720 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.0844910740852356 train acc 1.0
epoch 4 batch id 201 loss 0.2319452464580536 train acc 0.8345771144278606
epoch 4 batch id 401 loss 0.09534298628568649 train acc 0.831359102244389
epoch 4 batch id 601 loss 0.10785331577062607 train acc 0.8361064891846922
epoch 4 train acc 0.8342013888888888


  0%|          | 0/180 [00:00<?, ?it/s]

epoch 4 test acc 0.7270833333333333


  0%|          | 0/720 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.7432633638381958 train acc 0.625
epoch 5 batch id 201 loss 0.383849173784256 train acc 0.8905472636815921
epoch 5 batch id 401 loss 0.45352527499198914 train acc 0.8765586034912718
epoch 5 batch id 601 loss 0.050256021320819855 train acc 0.8731281198003328
epoch 5 train acc 0.8690972222222222


  0%|          | 0/180 [00:00<?, ?it/s]

epoch 5 test acc 0.7381944444444445


  0%|          | 0/720 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.3511575162410736 train acc 0.875
epoch 6 batch id 201 loss 0.035872478038072586 train acc 0.9011194029850746
epoch 6 batch id 401 loss 0.2143760472536087 train acc 0.8993142144638404
epoch 6 batch id 601 loss 0.028612270951271057 train acc 0.8999584026622296
epoch 6 train acc 0.8980902777777777


  0%|          | 0/180 [00:00<?, ?it/s]

epoch 6 test acc 0.7159722222222222


  0%|          | 0/720 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.05592764914035797 train acc 1.0


KeyboardInterrupt: 

In [None]:
# One row per fold, one column per epoch: test accuracy recorded during cross-validation.
df_test_history = pd.DataFrame(list_test_history)
# NOTE(review): this is a '/content/drive/...' path while the data is read from '/home2/...' — confirm the target directory exists on this machine.
df_test_history.to_csv('/content/drive/MyDrive/Colab Notebooks/Machine Learning - Graduate/Stratified K-fold CVs/Article Baseline.csv')

In [None]:
# Display the per-fold, per-epoch test accuracies.
df_test_history

New Data Prediction

In [None]:
# Inspect the second held-out record: [idx, title + body text, label id].
new_data[1]

In [None]:
# Final run: train on the full cross-validation set (cv_data) and measure the
# accuracy on the held-out articles (new_data) after every epoch.
warmup_ratio = 0.1
num_epochs = 3    ### adjust the number of epochs here ###
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

bertmodel, vocab = get_kobert_model('skt/kobert-base-v1', tokenizer.vocab_file)
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

train_data = cv_data
test_data = new_data

train_dataset = BERTDataset(dataset=train_data, sent_idx=1, label_idx=2, bert_tokenizer=tokenizer, vocab=vocab, max_len=max_len, pad=True, pair=False)
test_dataset = BERTDataset(dataset=test_data, sent_idx=1, label_idx=2, bert_tokenizer=tokenizer, vocab=vocab, max_len=max_len, pad=True, pair=False)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so no shuffling is needed.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# No weight decay for biases and LayerNorm weights.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()  # loss function for multi-class classification

# The scheduler is stepped once per batch, so the horizon is batches * epochs.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

train_history = []
test_history = []
loss_history = []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed as delivered by the DataLoader; the model
        # only iterates over it to build the attention mask.
        out = model(token_ids, valid_length, segment_ids)

        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            batch_loss = loss.item()
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, batch_loss, train_acc / (batch_id+1)))
            train_history.append(train_acc / (batch_id+1))
            loss_history.append(batch_loss)
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # .eval() switches layers such as Dropout / BatchNorm to inference
    # behaviour; torch.no_grad() additionally stops autograd from recording the
    # forward pass, which evaluation never back-propagates through.
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    test_history.append(test_acc / (batch_id+1))