In [13]:
import torch
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
from transformers import BertForMaskedLM

import torch.nn.functional as F

In [5]:
def read_file(filename):
    """Read a corpus file and return the text that follows ``</B>`` on each line.

    Each non-blank line is stripped of surrounding whitespace and split on the
    ``</B>`` marker; the piece immediately after the first marker is kept.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of strings, one per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line does not contain ``</B>``.
    """
    texts = []
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the machine running the notebook.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            texts.append(stripped.split("</B>")[1])
    return texts
# Load the four sample corpora (one file per verb complement) with read_file.
verb_hao, verb_dao, verb_xialai, verb_xiaqu = map(
    read_file,
    (
        "sample-data/verb_hao.txt",
        "sample-data/verb_dao.txt",
        "sample-data/verb_xialai.txt",
        "sample-data/verb_xiaqu.txt",
    ),
)

In [2]:
# Device setup: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 2 GPU(s) available.
We will use the GPU: TITAN RTX


In [3]:
# Checkpoint name passed to from_pretrained() for both the tokenizer and the model.
MODEL_TYPE = 'bert-base-chinese'
# NOTE(review): MAX_SIZE and BATCH_SIZE are not referenced in the visible cells;
# presumably the maximum sequence length and batch size for a later section — confirm.
MAX_SIZE = 150
BATCH_SIZE = 200

In [6]:
# Load the tokenizer and the masked-language-model for the MODEL_TYPE checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)
# NOTE(review): `device` is not applied to the model in the visible cells, so the
# model stays wherever from_pretrained() places it — confirm this is intended.
model = BertForMaskedLM.from_pretrained(MODEL_TYPE)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 1. Softmax 

In [7]:
def preprocessing(sentences) :
    """Turn ``<U>...</U>``-annotated sentences into masked sentences and labels.

    For every sentence, the last character of the text between ``<U>`` and
    ``</U>`` becomes the label and is replaced by the literal ``[MASK]``;
    the two markers themselves are removed.

    Args:
        sentences: Iterable of strings, each containing ``<U>`` and ``</U>``.

    Returns:
        A tuple ``(masked_sentences, labels)`` of two parallel lists.

    Raises:
        IndexError: If a sentence lacks either marker or the marked span is empty.
    """
    masked_sentences, answers = [], []
    for raw in sentences:
        pieces = raw.split("<U>")
        prefix = pieces[0]
        tagged = pieces[1].split("</U>")
        span = tagged[0]
        # The final character of the marked span is the prediction target.
        answers.append(span[-1])
        masked_sentences.append(prefix + span[:-1] + "[MASK]" + tagged[1])
    return masked_sentences, answers

In [11]:
def softmax_inference(sentences, top_n=5):
    """Predict the ``[MASK]`` token of each sentence with the global masked LM.

    Uses the module-level ``tokenizer`` and ``model``. For every sentence the
    logits at the first ``[MASK]`` position are turned into a probability
    distribution with softmax and the ``top_n`` most likely tokens are kept.

    Args:
        sentences: Iterable of strings, each containing one ``[MASK]`` token.
        top_n: Number of candidates to keep per sentence (must be >= 1).

    Returns:
        A tuple ``(predicts, probabilities)`` of two parallel lists.
        ``probabilities[i]`` is always a list of ``top_n`` floats in descending
        order. ``predicts[i]`` is the list of the matching ``top_n`` tokens,
        except when ``top_n == 1``, where it is the single best token as a
        plain string.

    Raises:
        ValueError: If ``top_n`` is smaller than 1, or a sentence contains no
            ``[MASK]`` token.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    predicts = []
    probabilities = []
    for sentence in sentences:
        input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
        # Ask the tokenizer for the [MASK] id instead of hard-coding 103.
        mask_idx = input_ids[0].tolist().index(tokenizer.mask_token_id)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            logits = model(input_ids.to(model.device))[0]
        mask_probs = F.softmax(logits[0, mask_idx], dim=0).cpu()
        top_probs, top_ids = torch.topk(mask_probs, top_n)
        tokens = tokenizer.convert_ids_to_tokens(top_ids.tolist())
        # top_n == 1 yields a bare token so it can be compared with a label.
        predicts.append(tokens[0] if top_n == 1 else tokens)
        probabilities.append(top_probs.tolist())
    return predicts, probabilities

In [18]:
def evaluate(pred, label):
    """Return the fraction of predictions that equal their label.

    Predictions and labels are paired positionally and compared with ``==``;
    the number of matches is divided by ``len(pred)``.

    Args:
        pred: Sequence of predictions.
        label: Sequence of expected values, parallel to ``pred``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when ``pred`` is empty.
    """
    total = len(pred)
    if total == 0:
        # Nothing to score: avoid dividing by zero.
        return 0.0
    correct = sum(1 for p, l in zip(pred, label) if p == l)
    return correct / total

In [19]:
def get_accuracy(data):
    """Mask the annotated sentences in ``data``, predict them and score the result.

    Args:
        data: Sentences in the format accepted by ``preprocessing``.

    Returns:
        The value returned by ``evaluate`` for the predictions and labels.
    """
    masked, gold = preprocessing(data)
    # NOTE(review): `inference` is not defined in the visible part of this
    # notebook — confirm it exists before running this cell.
    return evaluate(inference(masked), gold)

In [37]:
def print_softmax(data, top_n = 5, file_name=None):
    """Print the top-n mask predictions for every sentence, optionally saving them.

    Each sentence is rendered as the masked sentence, one ``token : prob%``
    line per candidate, the expected answer and a blank separator line.

    Args:
        data: Annotated sentences in the format accepted by ``preprocessing``.
        top_n: Number of candidates requested from ``softmax_inference``.
        file_name: Optional path; when given, the same report that is printed
            is also written to this file as UTF-8.

    Returns:
        None.
    """
    sent, labels = preprocessing(data)
    pred, prob = softmax_inference(sent, top_n)

    report_lines = []
    for sentence, candidates, probabilities, answer in zip(sent, pred, prob, labels):
        report_lines.append(sentence)
        for c, p in zip(candidates, probabilities):
            report_lines.append("{} : {:.2f}%".format(c, p*100))
        report_lines.append("Answer : {}".format(answer))
        report_lines.append("")
    report = "".join(line + "\n" for line in report_lines)

    print(report, end="")
    if file_name:
        # Write the full report for every sentence, not only the last one.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(report)

In [38]:
print_softmax(verb_hao[:10], 5)

是约４２ｍｍ的遮光罩镜头外型尺寸为以说一尘不染，可见其密封性恨[MASK]。如此笔者发现独立镜头厂家适配０７２５ｘ１２２ｍｍ光学构造为１
好 : 42.19%
佳 : 19.26%
强 : 11.28%
高 : 6.79%
差 : 4.63%
Answer : 好

很爱听，真像换了个脑筋，明年养多少头牛、羊都一家一户地帮我们算[MASK]了。”东湾乡大泉村二组村民马建雄忙着拉干部到自家去住，“干部住
账 : 36.73%
算 : 22.00%
好 : 16.04%
上 : 3.53%
计 : 3.52%
Answer : 好

有关重大问题。第三，企业档案处置具体工作是收集、整理、统计、保管[MASK]企业已经形成的档案，清点库存，按有关规定做好档案留存与销毁的鉴
各 : 36.24%
本 : 35.96%
该 : 4.26%
好 : 3.60%
在 : 1.77%
Answer : 好

策与财政政策的协调配合还体现为在经济发展的不同时期，两者都要搭配[MASK]。１９９８年下半年，财政为扩大内需，向国有商业银行增加发行１０
好 : 76.31%
住 : 6.40%
上 : 3.82%
合 : 3.16%
着 : 2.46%
Answer : 好

政府贯彻〈中国教育改革和发展纲要〉的意见》。发展教育事业注意处理[MASK]三个关系：在数量和质量的关系上，更加重视质量；在德育和智育的关
好 : 98.63%
了 : 0.32%
这 : 0.31%
的 : 0.21%
在 : 0.13%
Answer : 好

学管理的示范。一是要通过学习，增强创新创业的自觉性。要创业、干[MASK]事业，就必须抓重点、带全局。所有的人力、智力、财力、物力、科技
好 : 93.29%
大 : 3.46%
新 : 1.15%
事 : 0.48%
实 : 0.42%
Answer : 好

解放思想，寻找经济快速发展的新路子。只有人的思想解放了，才能运用[MASK]中央制定的各项方针和政策，才能敢“闯”敢“新”，走出一条独特的
好 : 85.65%
党 : 11.48%
住 : 1.81%
新 : 0.28%
了 : 0.15%
Answer : 好

到手术室，手术室大空间，手术床小范围，手术室到ＩＣＵ等各环节做[MASK]保温，避免外介温度的大波动引起婴儿不良反应。３．３减少患婴

## 2. Language model fine-tuning