# 0. 전에 쓰던 함수, 모델 불러오기

In [3]:
# 필요한 패키지 import
import torch
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertModel, AdamW
from transformers import BertForMaskedLM,BertForSequenceClassification

import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random
import time
import datetime

In [4]:
# Read a corpus file
def read_file(filename):
    """Return the text that follows the ``</B>`` tag on every non-blank line.

    Lines are stripped and split on ``"</B>"``; the second piece (the text
    after the first ``</B>``, up to a further ``</B>`` if there is one) is
    kept. Any other markup in that text is left untouched.

    Args:
        filename: Path of the UTF-8 encoded corpus file.

    Returns:
        list[str]: One entry per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line contains no ``</B>`` tag.
    """
    sentences = []
    # The corpus is Chinese text: decode explicitly instead of relying on the
    # platform's default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line (e.g. at the end of the file) carries no sentence.
                continue
            sentences.append(line.split("</B>")[1])
    return sentences
verb_hao = read_file("../data/verb_hao.txt")
verb_dao = read_file("../data/verb_dao.txt")
verb_xialai = read_file("../data/verb_xialai.txt")
verb_xiaqu = read_file("../data/verb_xiaqu.txt")

In [5]:
!head ../data/verb_hao.txt

﻿<B> 科技文献</B>是约４２ｍｍ的遮光罩镜头外型尺寸为以说一尘不染，可见其密封性<U>恨好</U>。如此笔者发现独立镜头厂家适配０７２５ｘ１２２ｍｍ光学构造为１
<B> 科技文献</B>很爱听，真像换了个脑筋，明年养多少头牛、羊都一家一户地帮我们<U>算好</U>了。”东湾乡大泉村二组村民马建雄忙着拉干部到自家去住，“干部住
<B> 科技文献</B>有关重大问题。第三，企业档案处置具体工作是收集、整理、统计、<U>保管好</U>企业已经形成的档案，清点库存，按有关规定做好档案留存与销毁的鉴
<B> 科技文献</B>策与财政政策的协调配合还体现为在经济发展的不同时期，两者都要<U>搭配好</U>。１９９８年下半年，财政为扩大内需，向国有商业银行增加发行１０
<B> 科技文献</B>政府贯彻〈中国教育改革和发展纲要〉的意见》。发展教育事业注意<U>处理好</U>三个关系：在数量和质量的关系上，更加重视质量；在德育和智育的关
<B> 科技文献</B>学管理的示范。一是要通过学习，增强创新创业的自觉性。要创业、<U>干好</U>事业，就必须抓重点、带全局。所有的人力、智力、财力、物力、科技
<B> 科技文献</B>解放思想，寻找经济快速发展的新路子。只有人的思想解放了，才能<U>运用好</U>中央制定的各项方针和政策，才能敢“闯”敢“新”，走出一条独特的
<B> 科技文献</B>到手术室，手术室大空间，手术床小范围，手术室到ＩＣＵ等各环节<U>做好</U>保温，避免外介温度的大波动引起婴儿不良反应。３．３减少患婴术中
<B> 科技文献</B>例如，在学习“如何做小主人和小客人”这一社会知识时，我们事先<U>联系好</U>一个家庭，并向主人详细讲解此次活动的目的、方法、步骤，以取得默
<B> 科技文献</B>切实加强农村信用社金融监管与行业管理的初步设想（一）必须正确<U>处理好</U>三大关系回、正确处理好信用社与联社的关系。基本原则是既要尊重信


In [6]:
# Device setup: use the CPU unless CUDA is available
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
We will use the GPU: TITAN RTX


In [7]:
# Define the model checkpoint name, batch size and maximum length
MODEL_TYPE = 'bert-base-chinese'
MAX_SIZE = 150
BATCH_SIZE = 200

In [8]:
# Combined accuracy function
def get_accuracy(data, top_n = 1):
    """Measure masked-LM accuracy on the last character of each ``<U>…</U>`` span.

    For every sentence, the last character inside the first ``<U>…</U>`` span
    is replaced by ``[MASK]`` and the module-level ``model`` / ``tokenizer``
    are asked to fill it in. A sentence counts as correct when the
    highest-scoring token equals the removed character.

    Args:
        data: Sequence of sentences, each containing a ``<U>…</U>`` span.
        top_n: When greater than 1, also print the ``top_n`` candidates with
            their softmax probabilities and the expected answer per sentence.

    Returns:
        float: Fraction of sentences predicted correctly (0.0 for empty data).
    """
    # Ask the tokenizer for the [MASK] id instead of hard-coding 103.
    mask_id = tokenizer.mask_token_id

    def preprocessing(sentences):
        """Return (masked sentences, expected characters)."""
        masked, answers = [], []
        for sentence in sentences:
            pieces = sentence.split("<U>")
            span_pieces = pieces[1].split("</U>")
            target = span_pieces[0]
            answers.append(target[-1])
            masked.append(pieces[0] + target[:-1] + "[MASK]" + span_pieces[1])
        return masked, answers

    def mask_logits(sentence):
        """Return the vocabulary logits at the [MASK] position of *sentence*."""
        input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
        mask_idx = input_ids[0].tolist().index(mask_id)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(input_ids)
        return outputs[0][0, mask_idx]

    def inference(sentences):
        """Return the top-1 predicted token for each masked sentence."""
        return [
            tokenizer.convert_ids_to_tokens(int(mask_logits(sentence).argmax()))
            for sentence in sentences
        ]

    def softmax_inference(sentences, top_n=5):
        """Return the top-n tokens and their probabilities, best first."""
        predicts = []
        probabilities = []
        for sentence in sentences:
            mask_predict = F.softmax(mask_logits(sentence), dim=0).numpy()
            predict_top_n = mask_predict.argsort()[-top_n:][::-1]
            predicts.append(tokenizer.convert_ids_to_tokens(predict_top_n.tolist()))
            probabilities.append(mask_predict[predict_top_n].tolist())
        return predicts, probabilities

    def evaluate(pred, label):
        """Return the share of predictions equal to their label."""
        if not pred:
            return 0.0
        return sum(p == l for p, l in zip(pred, label)) / len(pred)

    sent, label = preprocessing(data)
    if top_n > 1:
        pred, prob = softmax_inference(sent, top_n)
        for sentence, candidates, probabilities, answer in zip(sent, pred, prob, label):
            print(sentence)
            for c, p in zip(candidates,  probabilities):
                print("{} : {:.2f}%".format(c, p*100))
            print("Answer : {}".format(answer))
            print("")
        # The best candidate is the arg-max token, so reuse it rather than
        # running the model a second time.
        predict = [candidates[0] for candidates in pred]
    else:
        predict = inference(sent)
    return evaluate(predict, label)

# 1. Language model fine-tuning

## 1.1 데이터를 raw text로 변환

In [None]:
verb_hao[0]

In [10]:
line = verb_hao[0]
line.replace('<U>',"").replace('</U>',"")

'是约４２ｍｍ的遮光罩镜头外型尺寸为以说一尘不染，可见其密封性恨好。如此笔者发现独立镜头厂家适配０７２５ｘ１２２ｍｍ光学构造为１'

In [11]:
def save_as_raw_text(data, filename):
    """Write sentences as plain text, one per line, without ``<U>``/``</U>`` markers.

    Args:
        data: Iterable of sentence collections (e.g. ``[verb_hao, verb_dao]``).
        filename: Output path; the file is overwritten and written as UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for sentences in data:
            for line in sentences:
                text = line.replace('<U>', "").replace('</U>', "")
                # Sentences returned by read_file() have been stripped of
                # their newline; end each one with exactly one newline so
                # consecutive sentences do not run together.
                f.write(text.rstrip("\n") + "\n")
verb_hao_dao = [verb_hao, verb_dao]
save_as_raw_text(verb_hao_dao, "../data/verb_raw.txt")

In [12]:
# 만들어진 verb hao와 verb dao의 raw text를 사용하여 기존의 pretrained BERT(bert-base-chinese) 모델에 추가로 더 학습시켰다. 그 다음에 아래에서 새롭게 모델을 load함

In [13]:
tokenizer = BertTokenizer.from_pretrained('../src/models/hao_dao')
model = BertForMaskedLM.from_pretrained('../src/models/hao_dao')

In [14]:
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

### previous model accuracies
verb hao : 0.6897
verb dao : 0.7474

### pretrained model accuracies

In [15]:
verb_hao_acc = get_accuracy(verb_hao)
verb_dao_acc = get_accuracy(verb_dao)

In [16]:
print("Verb_hao accuracy : ",verb_hao_acc)
print("Verb_dao accuracy : ",verb_dao_acc)

Verb_hao accuracy :  0.915
Verb_dao accuracy :  0.9416


In [17]:
# 성능이 큰 폭으로 오르는 것으로 보아, 이 방법이 매우 효과가 있어보임.

# 2. 2음절 Tokenizing

In [18]:
# Register the two-character directional complements as single tokens.
# NOTE(review): no model.resize_token_embeddings(len(tokenizer)) call is
# visible in this file; presumably the model has no embedding rows for the
# new ids - TODO confirm.
tokenizer.add_tokens(['下来','下去'])

2

In [19]:
def predict():
    """Prompt for a sentence containing ``[MASK]`` and print the top-5 fill-ins.

    Reads one line from standard input, runs the module-level ``model`` on it
    and prints the sentence followed by the five most probable tokens for the
    first ``[MASK]`` position with their softmax probabilities.

    Raises:
        ValueError: If the tokenized input contains no ``[MASK]`` token.
    """
    sent = input('Input Sentence')
    input_ids = tokenizer(sent, return_tensors="pt")["input_ids"]
    # Ask the tokenizer for the [MASK] id instead of hard-coding 103.
    mask_idx = input_ids[0].tolist().index(tokenizer.mask_token_id)
    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids)
    mask_predict = F.softmax(outputs[0][0][mask_idx], dim=0).numpy()
    predict_top_n = mask_predict.argsort()[-5:][::-1]
    candidates = tokenizer.convert_ids_to_tokens(predict_top_n.tolist())
    probabilities = mask_predict[predict_top_n].tolist()
    # Print the whole sentence; zipping the input string with the result
    # lists iterated over its characters and showed only the first one.
    print(sent)
    for c, p in zip(candidates,  probabilities):
        print("{} : {:.2f}%".format(c, p*100))
    print("")

In [22]:
predict()

Input Sentence这几天天气突然冷[MASK]了
这
了 : 21.12%
透 : 10.32%
些 : 7.15%
下 : 4.80%
起 : 4.06%



In [23]:
predict()

Input Sentence这几天天气突然冷[MASK]来了
这
起 : 88.22%
下 : 11.24%
回 : 0.26%
出 : 0.09%
上 : 0.06%



실패 ： 기존의 토크나이징 방법으로 학습된 것을 바꿀 수 있는 방법은 없는 것으로 판단

# 3. Sequence classification with new data

## 3.1 Load new data

过来 / 过去 / 起来 / 上来 / 下来 / 下去  
총 6개의 클래스， 클래스 별로 레이블링 해서 리스트에 저장.

In [24]:
def load_data(data_dir, *filenames):
    """Collect the last tab-separated field of every line in the given files.

    Args:
        data_dir: Prefix prepended verbatim to each file name (so it should
            end with a path separator).
        *filenames: Names of the UTF-8 encoded files to read, in order.

    Returns:
        tuple[list[str], int]: The collected sentences and their count.
    """
    sentences = []
    for filename in filenames:
        # The corpus is Chinese text: decode explicitly instead of relying on
        # the platform's default encoding.
        with open(data_dir + filename, 'r', encoding='utf-8') as f:
            sentences.extend(line.strip().split('\t')[-1] for line in f)
    return sentences, len(sentences)

In [25]:
# Load the literature / media / textbook files of each directional complement
# and keep the sentence list together with its size.
data_dir = "../data/week3_data/"
guolai, total_guolai = load_data(data_dir, "verb_guolai_literature.txt", "verb_guolai_media.txt", "verb_guolai_textbook.txt")
guoqu, total_guoqu = load_data(data_dir, "verb_guoqu_literature.txt", "verb_guoqu_media.txt", "verb_guoqu_textbook.txt")
qilai, total_qilai = load_data(data_dir, "verb_qilai_literature.txt", "verb_qilai_media.txt", "verb_qilai_textbook.txt")
shanglai, total_shanglai = load_data(data_dir, "verb_shanglai_literature.txt", "verb_shanglai_media.txt", "verb_shanglai_textbook.txt")
xialai, total_xialai = load_data(data_dir, "verb_xialai_literature.txt", "verb_xialai_media.txt", "verb_xialai_textbook.txt")
xiaqu, total_xiaqu = load_data(data_dir, "verb_xiaqu_literature.txt", "verb_xiaqu_media.txt", "verb_xiaqu_textbook.txt")

# Report the number of sentences per class
print("过来 :", total_guolai)
print("过去 :", total_guoqu)
print("起来 :", total_qilai)
print("上来 :", total_shanglai)
print("下来 :", total_xialai)
print("下去 :", total_xiaqu)

过来 : 13408
过去 : 25502
起来 : 43744
上来 : 9107
下来 : 24948
下去 : 11901


In [None]:
raw_data

In [None]:
guolai[0]

In [26]:
guolai[0]

'”说话间，果然有好几个人从西房背后走过来，一转弯就向大门这边来了。（当代1，\u3000赵树理《三里湾》）'

In [27]:
# 데이터 가공이 필요. 일관적인 포맷팅이 되어있지 않음.
# 어떤 곳은 [tab]으로, 어떤 곳은 [space]로 구분이 되어있음.
guolai[-1]

'C:\\t\\textbook\\Xingonglve levelup2.txt(615): 你对别人好，别人也会对你好反过来，你对别人不好,别人也会对你不好。努力学习才能取得好成绩反过来，不努力的人，什么也学不好。'

In [28]:
# label 이 존재하는 곳을 마스킹
sent = guolai[0]
print(sent)
print(sent.replace('过来','[MASK]'))

”说话间，果然有好几个人从西房背后走过来，一转弯就向大门这边来了。（当代1，　赵树理《三里湾》）
”说话间，果然有好几个人从西房背后走[MASK]，一转弯就向大门这边来了。（当代1，　赵树理《三里湾》）


## 3.2 Preprocess

In [29]:
MAX_LEN = 510
def preprocess(data, label, max_len=None):
    """Mask the directional complement in each sentence and attach its label.

    Each sentence is cut to ``max_len`` characters and every occurrence of
    ``label`` in the kept part is replaced by ``[MASK]``. Sentences whose kept
    part does not contain ``label`` are printed, counted and skipped, so every
    returned sentence contains at least one ``[MASK]``.

    Args:
        data: Iterable of raw sentences.
        label: The directional complement to mask (e.g. ``'过来'``).
        max_len: Maximum number of characters kept per sentence; defaults to
            the module-level ``MAX_LEN`` at call time.

    Returns:
        tuple[list[str], list[str]]: Masked sentences and, for each of them,
        ``label``.
    """
    if max_len is None:
        max_len = MAX_LEN
    sentences = []
    labels = []
    error_cnt = 0
    for sent in data:
        # Truncate BEFORE looking for the label: if it only occurs past the
        # cut, the kept text would carry a label but no [MASK].
        clipped = sent[:max_len]
        if label not in clipped:
            print(f"Sententce : {sent}")
            error_cnt += 1
            continue
        sentences.append(clipped.replace(label, '[MASK]'))
        labels.append(label)
    print(f"{label} 방향보어 없는 문장 개수 : {error_cnt}")
    return sentences, labels

In [30]:
guolai_sent, guolai_label= preprocess(guolai, '过来')
guoqu_sent, guoqu_label = preprocess(guoqu, '过去')
qilai_sent, qilai_label = preprocess(qilai, '起来')
shanglai_sent, shanglai_label = preprocess(shanglai, '上来')
xialai_sent, xialai_label = preprocess(xialai, '下来')
xiaqu_sent, xiaqu_label = preprocess(xiaqu, '下去')

过来 방향보어 없는 문장 개수 : 0
过去 방향보어 없는 문장 개수 : 0
Sententce : C:\t\literature\作家94B.TXT(22687):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(13455):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(18769):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(23174):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(45729):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(51137):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(52305):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(52827):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(57763):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(58666):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(69868):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(75867):
Sententce : 
Sententce : C:\t\literature\作家95B.TXT(82416):
Sententce : 
Sententce : C:\t\literature\作家96B.TXT(12243): 在路上吃过一碗虾仁鳝丝面，中午饭就不吃了，关老爷要眯一小觉。
Sententce : C:\t\media\MLC_Corpus_2009_2.txt(49841):
Sententce : 
Sententce : C:\t\media\MLC_Corpus_2010_4.txt(205

In [31]:
all_sent = guolai_sent + guoqu_sent + qilai_sent + shanglai_sent + xialai_sent + xiaqu_sent

In [32]:
all_label = guolai_label + guoqu_label + qilai_label + shanglai_label + xialai_label + xiaqu_label

In [33]:
all_sent[:5]

['”说话间，果然有好几个人从西房背后走[MASK]，一转弯就向大门这边来了。（当代1，\u3000赵树理《三里湾》）',
 '快[MASK]给你贴上点膏药！（当代1，\u3000赵树理《三里湾》）',
 '”大胜还是不[MASK]，玉梅从窗台上取起个红皮笔记本来说：“你看我这红皮书！（当代1，\u3000赵树理《三里湾》）',
 '”大胜见是个新鲜东西，就跑[MASK]拿，金生媳妇向玉梅说：“可不敢玩人家那个！（当代1，\u3000赵树理《三里湾》）',
 '一会给你送[MASK]好了！（当代1，\u3000赵树理《三里湾》）']

In [34]:
all_label[:5]

['过来', '过来', '过来', '过来', '过来']

In [35]:
train_sent = all_sent
train_label = all_label

In [36]:
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

### Load train data

In [37]:
# Split every training sentence into tokens.
# NOTE(review): neither this step nor the id conversion that follows adds
# [CLS]/[SEP]; the padded ids printed further down start with 100 and end
# directly in padding. Confirm this is intended for sequence classification.
tokenized_sent = [tokenizer.tokenize(sent) for sent in train_sent]

In [38]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_sent]

In [39]:
# pad_sequences is also imported in the first cell of this file.
from keras.preprocessing.sequence import pad_sequences
# NOTE: this rebinds the module-level MAX_LEN (set to 510 before preprocess()),
# which preprocess() reads whenever it is called.
MAX_LEN = 128
# Truncate or zero-pad every id sequence at its end to MAX_LEN entries
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
input_ids[0]

array([ 100, 6432, 6413, 7313, 8024, 3362, 4197, 3300, 1962, 1126,  702,
        782,  794, 6205, 2791, 5520, 1400, 6624,  103, 8024,  671, 6760,
       2482, 2218, 1403, 1920, 7305, 6821, 6804, 3341,  749,  511, 8020,
       2496,  807,  122, 8024, 6627, 3409, 4415,  517,  676, 7027, 3968,
        518, 8021,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [40]:
# Encode each label string as its position in directional_list (class index)
directional_list = ["过来", "过去", "起来", "上来", "下来", "下去"]
labels = [directional_list.index(x) for x in train_label]
labels[0]

0

In [41]:
# Attention mask per sequence: 1.0 where the id is greater than 0, else 0.0
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [42]:
# Split inputs, labels and attention masks into training and validation sets.
# All three are passed to ONE call so they are partitioned with the same
# indices, instead of relying on two calls sharing a random_state.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(input_ids,
                                                   labels,
                                                   attention_masks,
                                                   random_state=1,
                                                   test_size=0.1)

# Convert the data to PyTorch tensors
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [43]:
# Batch size
batch_size = 32

# Bundle inputs, masks and labels into datasets served by PyTorch DataLoaders
# During training the data is fetched batch_size examples at a time
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

## 3.3 Train Model

In [44]:
# NOTE(review): TensorFlow is used in this cell only to check for a GPU; the
# model below is trained with PyTorch. Initialising TensorFlow's GPU device
# may reserve GPU memory - TODO confirm whether torch.cuda.is_available()
# would be sufficient here.
import tensorflow as tf
# Get the GPU device name
device_name = tf.test.gpu_device_name()
# Check the GPU device name; abort when the first GPU is not found
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [45]:
# Device setup (same selection as the earlier device cell): use CUDA when
# available, otherwise fall back to the CPU
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 2 GPU(s) available.
We will use the GPU: TITAN RTX


In [46]:
# Load the checkpoint with a 6-class sequence-classification head and move it
# to the selected device. Using `device` (rather than model.cuda()) keeps the
# CPU fallback chosen above working and matches the `.to(device)` calls used
# for the batches.
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=6)
model.to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [47]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW, BertConfig

# Optimizer setup
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # learning rate
                  eps = 1e-8 # epsilon value that prevents division by zero
                )

# Number of epochs
epochs = 4

# Total number of training steps: batches per epoch * epochs
total_steps = len(train_dataloader) * epochs

# Create a scheduler that gradually decreases the learning rate
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [48]:
# Accuracy helper
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max column equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of class indices; it is flattened before comparing.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / len(expected)

In [49]:
# Time formatting helper
def format_time(elapsed):
    """Format a duration given in seconds as ``h:mm:ss``.

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second.

    Returns:
        str: The string form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [50]:
# Fix the random seeds for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Start from clean gradients
model.zero_grad()

# One pass over the training data plus one validation pass per epoch
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch
    t0 = time.time()

    # Running sum of the batch losses
    total_loss = 0

    # Switch to training mode
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 500 batches (not at step 0)
        if step and step % 500 == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the selected device and unpack it
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; labels are passed in, so the first output is the loss
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        # Backward pass to compute the gradients
        loss.backward()

        # Clip the gradient norm to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Reset the gradients for the next batch
        model.zero_grad()

    # Mean loss over the batches of this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # Start time of the validation pass
    t0 = time.time()

    # Switch to evaluation mode
    model.eval()

    # Count correct predictions and examples over the WHOLE validation set.
    # Averaging per-batch accuracies would over-weight a smaller final batch.
    eval_correct, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        # Move the batch to the selected device and unpack it
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # No labels were passed, so the first output holds the logits
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Compare the arg-max class with the label of every example
        eval_correct += int(np.sum(np.argmax(logits, axis=1) == label_ids))
        nb_eval_examples += len(label_ids)

    # max(..., 1) avoids a division by zero on an empty validation set
    eval_accuracy = eval_correct / max(nb_eval_examples, 1)
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  3,617.    Elapsed: 0:01:52.
  Batch 1,000  of  3,617.    Elapsed: 0:03:46.
  Batch 1,500  of  3,617.    Elapsed: 0:05:41.
  Batch 2,000  of  3,617.    Elapsed: 0:07:37.
  Batch 2,500  of  3,617.    Elapsed: 0:09:33.
  Batch 3,000  of  3,617.    Elapsed: 0:11:29.
  Batch 3,500  of  3,617.    Elapsed: 0:13:24.

  Average training loss: 0.70
  Training epcoh took: 0:13:51

Running Validation...
  Accuracy: 0.80
  Validation took: 0:00:30

Training...
  Batch   500  of  3,617.    Elapsed: 0:01:56.
  Batch 1,000  of  3,617.    Elapsed: 0:03:52.
  Batch 1,500  of  3,617.    Elapsed: 0:05:47.
  Batch 2,000  of  3,617.    Elapsed: 0:07:43.
  Batch 2,500  of  3,617.    Elapsed: 0:09:39.
  Batch 3,000  of  3,617.    Elapsed: 0:11:34.
  Batch 3,500  of  3,617.    Elapsed: 0:13:30.

  Average training loss: 0.47
  Training epcoh took: 0:13:57

Running Validation...
  Accuracy: 0.81
  Validation took: 0:00:30

Training...
  Batch   500  of  3,617.    Elapsed: 0:01:56

## 3.4 Test with new data

In [53]:
# Convert input data
def convert_input_data(sentences, max_len=128, add_special_tokens=False):
    """Turn raw sentences into padded id and attention-mask tensors.

    Uses the module-level ``tokenizer``.

    Args:
        sentences: Iterable of input strings.
        max_len: Every id sequence is truncated or zero-padded at its end to
            this length.
        add_special_tokens: When True, wrap each token sequence in the
            tokenizer's ``cls_token`` / ``sep_token`` (shortening it first so
            both survive truncation). The default is False because the
            training cells in this file build their inputs without these
            tokens; inference inputs must be built the same way as the
            training inputs.

    Returns:
        tuple: ``(inputs, masks)``. ``inputs`` holds the padded token ids and
        ``masks`` is a float tensor of the same shape with 1.0 where the id
        is greater than 0 and 0.0 elsewhere (the padding).
    """
    # Split the sentences into tokens with the BERT tokenizer
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    if add_special_tokens:
        room = max(max_len - 2, 0)
        tokenized_texts = [
            [tokenizer.cls_token] + tokens[:room] + [tokenizer.sep_token]
            for tokens in tokenized_texts
        ]

    # Convert the tokens to vocabulary indices
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

    # Cut each sequence to max_len and fill the remainder with padding id 0
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    # Convert to PyTorch tensors; the mask is derived from the padded array
    # in one vectorised comparison
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(input_ids > 0, dtype=torch.float)

    return inputs, masks

In [61]:
# Sentence test
def test_sentences(sentences):
    """Run the module-level classifier on raw sentences and return its logits.

    Args:
        sentences: List of input strings, passed to ``convert_input_data``.

    Returns:
        The first model output (the logits), detached and moved to the CPU.
    """
    # Switch to evaluation mode
    model.eval()

    # Encode the sentences and place both tensors on the selected device
    input_ids, attention_mask = (t.to(device) for t in convert_input_data(sentences))

    # Forward pass without gradient tracking; no labels are passed
    with torch.no_grad():
        scores = model(input_ids,
                       token_type_ids=None,
                       attention_mask=attention_mask)[0]

    # Hand the result back on the CPU
    return scores.detach().cpu()

In [57]:
logits = test_sentences(['这几天天气突然冷[MASK]了'])
directional_list = ["过来", "过去", "起来", "上来", "下来", "下去"]
print(logits)
print(directional_list[np.argmax(logits)])

[[-2.349181   -2.8879583   3.836629   -2.720859    3.824633    0.51645184]]
起来


In [58]:
logits = test_sentences(['教室里突然安静[MASK]了'])
directional_list = ["过来", "过去", "起来", "上来", "下来", "下去"]
print(logits)
print(directional_list[np.argmax(logits)])

[[-2.164058   -2.7196388   0.5187319  -2.844923    6.5916567   0.50808865]]
下来


In [72]:
import torch.nn.functional as F

In [73]:
def predict_sentence():
    """Prompt for a sentence and print the softmax probability of each class.

    Reads one line from standard input, classifies it with ``test_sentences``
    and prints one ``label : probability%`` line per entry of the
    module-level ``directional_list``.
    """
    sentence = input('Input Sentence')
    scores = test_sentences([sentence])[0]
    probabilities = F.softmax(scores, dim=0).numpy()
    report = [f"{name} : {prob*100:.2f}%" for name, prob in zip(directional_list, probabilities)]
    for row in report:
        print(row)

In [71]:
predict_sentence()

Input Sentence这几天天气突然热[MASK]了
过来 : 0.01%
过去 : 0.02%
起来 : 99.94%
上来 : 0.02%
下来 : 0.01%
下去 : 0.01%


In [None]:
predict_sentence()