In [162]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /home/boychaboy/anaconda3/envs/cn_base/lib/python3.8/site-packages (0.23.1)


In [163]:
!pip install tensorflow
!pip install keras



In [1]:
import torch
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertModel, AdamW
from transformers import BertForMaskedLM,BertForSequenceClassification

import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import random
import time
import datetime

In [2]:
def read_file(filename):
    """Read a sample file and return the text following the ``</B>`` tag of each line.

    Args:
        filename: Path of a UTF-8 text file. Every non-blank line is expected
            to contain a ``</B>`` marker.

    Returns:
        list[str]: One entry per non-blank line, namely element 1 of
        ``line.strip().split("</B>")`` (the text right after the first
        ``</B>``, up to the next ``</B>`` if there is one).

    Raises:
        IndexError: If a non-blank line contains no ``</B>`` marker.
    """
    # Explicit UTF-8: the samples are Chinese text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(filename, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing empty line) are skipped instead of
        # crashing the whole load with an IndexError.
        return [line.split("</B>")[1] for line in stripped_lines if line]
# Load the four sample sets; each list holds, per line of the file, the text
# that follows the </B> tag (see read_file).
verb_hao = read_file("sample-data/verb_hao.txt")
verb_dao = read_file("sample-data/verb_dao.txt")
verb_xialai = read_file("sample-data/verb_xialai.txt")
verb_xiaqu = read_file("sample-data/verb_xiaqu.txt")

In [3]:
# Device setup: select CUDA when it is available, otherwise fall back to the CPU.
# NOTE(review): in the cells visible here `device` is never used to move the
# model or the inputs - confirm whether inference is meant to run on the CPU.
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 2 GPU(s) available.
We will use the GPU: TITAN RTX


In [4]:
# Name of the pretrained checkpoint handed to from_pretrained() below.
MODEL_TYPE = 'bert-base-chinese'
# NOTE(review): MAX_SIZE and BATCH_SIZE are not referenced in the cells visible
# here - presumably used elsewhere; confirm or remove.
MAX_SIZE = 150
BATCH_SIZE = 200

In [5]:
# Load the tokenizer and the BertForMaskedLM model for the MODEL_TYPE checkpoint.
# Both objects are used as notebook-level globals by the inference functions below.
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)
model = BertForMaskedLM.from_pretrained(MODEL_TYPE)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 1. Softmax 

In [6]:
def preprocessing(sentences):
    """Turn ``<U>...</U>``-annotated sentences into masked inputs plus gold labels.

    For every sentence the last character inside the first ``<U>...</U>`` span
    becomes the label, and that character is replaced by ``[MASK]`` in the
    returned text, which is additionally wrapped in ``[CLS]`` / ``[SEP]``.

    Args:
        sentences: Iterable of strings, each containing ``<U>`` and ``</U>``.

    Returns:
        tuple[list[str], list[str]]: (masked sentences, single-character labels),
        aligned by position.

    Raises:
        IndexError: If a sentence lacks ``<U>`` or ``</U>``, or the span is empty.
    """
    # NOTE(review): [CLS]/[SEP] are embedded here as plain text while the
    # downstream tokenizer calls use default options - confirm the encoded
    # sequence does not end up with duplicated special tokens.
    masked_inputs = []
    gold_chars = []
    for raw in sentences:
        around_open = raw.split("<U>")
        around_close = around_open[1].split("</U>")
        target = around_close[0]
        # The character to predict is the last one of the underlined span.
        gold_chars.append(target[-1])
        masked_inputs.append(
            "".join(["[CLS]", around_open[0], target[:-1], "[MASK]", around_close[1], "[SEP]"])
        )
    return masked_inputs, gold_chars

In [7]:
def softmax_inference(sentences, top_n=5):
    predicts = []
    probabilities = []
    for sentence in sentences:
        input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
        mask_idx = input_ids.tolist()[0].index(103) #103 is [MASK] id
        outputs = model(input_ids)
        if top_n == 1:
            logits = ouptuts[0].detach().numpy()
            pred = tokenizer.convert_ids_to_tokens(np.argmax(logits, axis=2)[0])
            predicts.append(pred[mask_idx])
        else :
            logits = outputs[0].detach()
            mask_predict = F.softmax(logits[0][mask_idx], dim=0).numpy()
            predict_top_n = mask_predict.argsort()[-top_n:][::-1]
            predicts.append(tokenizer.convert_ids_to_tokens(predict_top_n))
            probabilities.append(mask_predict[predict_top_n].tolist())
    return predicts, probabilities

In [8]:
# Smoke test on two hand-written sentences. Each sentence is wrapped in its own
# one-element list, so softmax_inference hands a list (not a str) to the tokenizer.
sents = []
sents.append(["这个东西[MASK]贵"])
sents.append(["教室里突然安静[MASK]来了"])
# NOTE(review): the second value returned by softmax_inference is the list of
# probabilities, not labels, despite the variable name used here.
pred, label = softmax_inference(sents)
# Show the top candidates and their probabilities for the second sentence.
print(pred[1], label[1])

['下', '起', '过', '出', '上'] [0.8841641545295715, 0.11500083655118942, 0.0002584183239378035, 9.911341476254165e-05, 7.786255446262658e-05]


In [9]:
def evaluate(pred, label):
    """Return the fraction of predictions that equal their label.

    Args:
        pred: Sequence of predicted values.
        label: Sequence of gold values, compared pairwise with ``pred`` via ``==``.

    Returns:
        float: ``matches / len(pred)``, or ``0.0`` when ``pred`` is empty.
    """
    total = len(pred)
    if total == 0:
        # Nothing to score: avoid ZeroDivisionError.
        return 0.0
    # Pairs come from zip(), so surplus items of the longer sequence are not
    # compared; the denominator is always len(pred).
    matches = sum(1 for p, l in zip(pred, label) if p == l)
    return matches / total

In [10]:
def inference(sentences):
    """Return the single most likely token for the [MASK] position of each sentence.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. Only the first
    sequence of each encoded input and its first [MASK] token are considered.

    Args:
        sentences: Iterable of inputs accepted by ``tokenizer``, each containing
            a [MASK] token.

    Returns:
        list[str]: The highest-scoring token for each sentence, in input order.

    Raises:
        ValueError: If a sentence contains no [MASK] token.
    """
    # Ask the tokenizer for the [MASK] id instead of hard-coding 103.
    mask_id = tokenizer.mask_token_id
    predicts = []
    for sentence in sentences:
        input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
        mask_idx = input_ids.tolist()[0].index(mask_id)
        # Inference only: do not record the autograd graph.
        with torch.no_grad():
            outputs = model(input_ids)
        # Only the mask position matters, so take the argmax of that single
        # row instead of decoding every position of the sequence.
        mask_logits = outputs[0][0, mask_idx].detach().cpu().numpy()
        best_id = int(np.argmax(mask_logits))
        predicts.append(tokenizer.convert_ids_to_tokens(best_id))
    return predicts

In [11]:
def get_accuracy(data):
    """Score the masked-token predictions for a set of annotated sentences.

    Args:
        data: Annotated sentences in the format expected by ``preprocessing``.

    Returns:
        Whatever ``evaluate`` returns for the predictions produced by
        ``inference`` and the labels extracted by ``preprocessing``.
    """
    masked_sentences, gold_labels = preprocessing(data)
    return evaluate(inference(masked_sentences), gold_labels)

In [12]:
def print_softmax(data, top_n = 5, file_name=None):
    """Print the top candidates for the masked token of every annotated sentence.

    For each sentence the masked text, one ``token : probability%`` line per
    candidate and the expected answer are printed, followed by a blank line.

    Args:
        data: Annotated sentences in the format expected by ``preprocessing``.
        top_n: Number of candidates requested from ``softmax_inference``.
        file_name: Optional path. When given, the same report that is printed
            is also written to this file (UTF-8, overwriting existing content).

    Returns:
        None
    """
    sent, labels = preprocessing(data)
    pred, prob = softmax_inference(sent, top_n)
    report = []
    # `answer` (not `label`) so the loop variable does not shadow the label list.
    for sentence, candidates, probabilities, answer in zip(sent, pred, prob, labels):
        if isinstance(candidates, str):
            # A bare token/probability pair is treated as a one-candidate list.
            candidates, probabilities = [candidates], [probabilities]
        lines = [sentence]
        lines.extend(
            "{} : {:.2f}%".format(c, p*100) for c, p in zip(candidates, probabilities)
        )
        lines.append("Answer : {}".format(answer))
        lines.append("")
        entry = "\n".join(lines)
        print(entry)
        report.append(entry + "\n")
    if file_name is not None:
        # Previously this argument was accepted but silently ignored.
        with open(file_name, "w", encoding="utf-8") as out:
            out.write("".join(report))

In [13]:
# Show the top-5 candidates for the first 10 verb_hao samples; "output.txt" is
# passed as the file_name argument of print_softmax.
print_softmax(verb_hao[:10], 5, "output.txt")

[CLS]是约４２ｍｍ的遮光罩镜头外型尺寸为以说一尘不染，可见其密封性恨[MASK]。如此笔者发现独立镜头厂家适配０７２５ｘ１２２ｍｍ光学构造为１[SEP]
好 : 43.59%
佳 : 20.00%
强 : 9.52%
高 : 8.08%
差 : 3.73%
Answer : 好

[CLS]很爱听，真像换了个脑筋，明年养多少头牛、羊都一家一户地帮我们算[MASK]了。”东湾乡大泉村二组村民马建雄忙着拉干部到自家去住，“干部住[SEP]
账 : 35.00%
算 : 20.17%
好 : 13.57%
计 : 4.53%
数 : 4.03%
Answer : 好

[CLS]有关重大问题。第三，企业档案处置具体工作是收集、整理、统计、保管[MASK]企业已经形成的档案，清点库存，按有关规定做好档案留存与销毁的鉴[SEP]
各 : 35.11%
本 : 31.27%
好 : 7.47%
该 : 5.38%
出 : 2.22%
Answer : 好

[CLS]策与财政政策的协调配合还体现为在经济发展的不同时期，两者都要搭配[MASK]。１９９８年下半年，财政为扩大内需，向国有商业银行增加发行１０[SEP]
好 : 72.64%
住 : 6.52%
上 : 5.45%
合 : 3.22%
着 : 2.45%
Answer : 好

[CLS]政府贯彻〈中国教育改革和发展纲要〉的意见》。发展教育事业注意处理[MASK]三个关系：在数量和质量的关系上，更加重视质量；在德育和智育的关[SEP]
好 : 97.80%
了 : 0.46%
这 : 0.42%
在 : 0.33%
的 : 0.32%
Answer : 好

[CLS]学管理的示范。一是要通过学习，增强创新创业的自觉性。要创业、干[MASK]事业，就必须抓重点、带全局。所有的人力、智力、财力、物力、科技[SEP]
好 : 93.50%
大 : 3.51%
新 : 1.03%
实 : 0.44%
事 : 0.36%
Answer : 好

[CLS]解放思想，寻找经济快速发展的新路子。只有人的思想解放了，才能运用[MASK]中央制定的各项方针和政策，才能敢“闯”敢“新”，走出一条独特的[SEP]
好 : 92.17%
党 : 5.79%
住 : 1.31%
新 : 0.18%
了 : 0.11%
Answer : 好

In [176]:
# Masked-token accuracy on each complete sample set; inference runs the model
# once per sentence, so this cell can take a while.
verb_hao_acc = get_accuracy(verb_hao)
verb_dao_acc = get_accuracy(verb_dao)
verb_xialai_acc = get_accuracy(verb_xialai)
verb_xiaqu_acc = get_accuracy(verb_xiaqu)

In [177]:
# Report the accuracy (fraction of exactly matching predictions) per sample set.
print("Verb_hao accuracy : ",verb_hao_acc)
print("Verb_dao accuracy : ",verb_dao_acc)
print("Verb_xialai accuracy : ",verb_xialai_acc)
print("Verb_xiaqu accuracy : ",verb_xiaqu_acc)

Verb_hao accuracy :  0.6963
Verb_dao accuracy :  0.7506
Verb_xialai accuracy :  0.8381
Verb_xiaqu accuracy :  0.8715
