## 1. Preprocessing

### 1.1 중국어 전처리 패키지

In [1]:
# 중국어 전처리 패키지
from snownlp import SnowNLP
# pd dataframe
import pandas as pd

In [2]:
# Wrap a sample sentence in a SnowNLP object; the following cells read its
# .words, .tags and .sentiments attributes.
s = SnowNLP(u'这个东西太贵')

In [3]:
# Word segmentation of the sample sentence.
s.words

['这个', '东西', '太', '贵']

In [4]:
# Print each (word, tag) pair from s.tags on its own line.
for a in s.tags:
    print(a)

('这个', 'r')
('东西', 'n')
('太', 'd')
('贵', 'a')


In [5]:
# Sentiment score for the sample sentence.
# NOTE(review): presumably a positivity score in [0, 1] — confirm against the SnowNLP docs.
s.sentiments

0.3091627951693915

### 1.2 Load data

In [6]:
def read_file(filename, encoding="utf-8"):
    """Read a corpus file and return the text that follows ``</B>`` on each line.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the Chinese text is decoded the same way on every platform
            instead of depending on the locale's default encoding.
            NOTE(review): assumes the sample files are UTF-8 — confirm.

    Returns:
        A list with one string per non-blank line: the stripped line is split
        on ``"</B>"`` and the second piece is kept.

    Raises:
        IndexError: If a non-blank line does not contain ``"</B>"``.
    """
    texts = []
    with open(filename, "r", encoding=encoding) as file:
        # Iterate lazily instead of materialising every line with readlines().
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            texts.append(line.split("</B>")[1])
    return texts

In [7]:
# Load each sample file into its own list of sentences via read_file.
verb_hao = read_file("sample-data/verb_hao.txt")
verb_dao = read_file("sample-data/verb_dao.txt")
verb_xialai = read_file("sample-data/verb_xialai.txt")
verb_xiaqu = read_file("sample-data/verb_xiaqu.txt")

# Sanity check: number of sentences loaded and the first sentence.
print(len(verb_xiaqu))
print(verb_xiaqu[0])

10000
俺听听，也为你分忧。”宋江道:“也没什么大事,只是弟兄们这样<U>闹下去</U>,不成体统,咱们得想个法子,给兄弟们排排座次。刘庆邦在《凭良心


## 2. Training

### 2.1 Load model

In [8]:
import torch
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
from transformers import BertForMaskedLM

In [9]:
# Device setup: pick CUDA when it is available, otherwise fall back to the CPU.
# NOTE(review): no visible cell moves the model or the inputs to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 2 GPU(s) available.
We will use the GPU: TITAN RTX


In [10]:
MODEL_TYPE = 'bert-base-chinese'  # checkpoint name passed to from_pretrained
MAX_SIZE = 150  # NOTE(review): not referenced in any visible cell — confirm it is still needed
BATCH_SIZE = 200  # NOTE(review): not referenced in any visible cell — confirm it is still needed

In [11]:
# Load the tokenizer and the masked-language-model from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)
model = BertForMaskedLM.from_pretrained(MODEL_TYPE)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2.2 Test model with example

In [12]:
# Tokenize a sample sentence containing one [MASK] and keep only the
# input-id tensor (return_tensors="pt" asks for PyTorch tensors).
input_ids = tokenizer("这个东西[MASK]贵", return_tensors="pt")["input_ids"]

In [24]:
# Show the token ids: a batch of one sequence.
input_ids

tensor([[ 101, 6821,  702,  691, 6205,  103, 6586,  102]])

In [14]:
# 101 : [CLS], 103 : [MASK], 102 : [SEP]

In [15]:
input_ids.tolist()[0].index(103) # position of the first [MASK] token (id 103) in the sequence

5

In [16]:
model.eval() # switch to eval mode (turns off dropout for inference)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [18]:
# Forward pass for inference only: no_grad() skips autograd bookkeeping,
# which is not needed because nothing is back-propagated here.
with torch.no_grad():
    outputs = model(input_ids) # do eval

In [19]:
logits = outputs[0]  # first element of the model output: the prediction scores

In [20]:
# Convert the scores to a NumPy array. Derived from `outputs` rather than
# from `logits` itself so that re-running this cell does not fail
# (a NumPy array has no .detach()).
logits = outputs[0].detach().numpy()

In [22]:
logits.shape # (batch, seq_len, vocabulary size)

(1, 8, 21128)

In [25]:
predict = np.argmax(logits, axis=2) # highest-scoring vocabulary id at every position of every sequence

In [131]:
# Predicted ids for the first sequence in the batch.
predict[0]

array([8024,  511,  702,  691, 6205, 2523, 8043,  511])

In [88]:
# Decode every row of predicted ids back into tokens. A comprehension is used
# so that no loop variable named `id` is left behind shadowing the builtin id().
sentence = [tokenizer.convert_ids_to_tokens(ids) for ids in predict]

In [89]:
# Show the decoded tokens, one list per sequence.
sentence

[['，', '。', '个', '东', '西', '很', '？', '。']]

### Get top 10 for the position of [MASK] token

In [29]:
# Look up the [MASK] position instead of hard-coding index 5, so this cell
# stays correct if the example sentence changes.
mask_idx = input_ids.tolist()[0].index(tokenizer.mask_token_id)
mask_predict = logits[0][mask_idx]

In [30]:
# argsort() orders ids by ascending score; keep the last 10 and reverse them
# so the best-scoring id comes first.
predict_top_10 = mask_predict.argsort()[-10:][::-1]

In [33]:
predict_top_10 # vocabulary ids of the 10 highest-scoring candidates for the [MASK] position, best first

array([2523, 1962,  679, 4696, 1922, 3297, 3291, 2923, 1914, 6631])

In [39]:
# Map the top-10 vocabulary ids back to their token strings.
tokenizer.convert_ids_to_tokens(predict_top_10)

['很', '好', '不', '真', '太', '最', '更', '挺', '多', '超']

### 2.3 Inference without fine-tuning

In [42]:
def preprocessing(sentences):
    """Turn ``<U>...</U>``-annotated sentences into masked inputs and labels.

    For each sentence, the last character inside the ``<U>...</U>`` span is
    replaced with ``[MASK]`` and returned as the label; the ``<U>`` and
    ``</U>`` markers themselves are dropped.

    Only the text up to a second ``<U>`` or a second ``</U>`` (if any) is
    kept, because the pieces are taken from plain ``str.split`` results.

    Args:
        sentences: Iterable of strings, each containing a ``<U>...</U>`` span.

    Returns:
        A tuple ``(masked_sentences, labels)`` of two parallel lists.

    Raises:
        IndexError: If a sentence lacks ``<U>`` or ``</U>``, or if the span
            between them is empty.
    """
    masked_sentences = []
    targets = []
    for text in sentences:
        pieces = text.split("<U>")
        span_pieces = pieces[1].split("</U>")
        span = span_pieces[0]
        # The final character of the marked span is the prediction target.
        targets.append(span[-1])
        masked_sentences.append(
            "".join((pieces[0], span[:-1], "[MASK]", span_pieces[1]))
        )
    return masked_sentences, targets

In [43]:
# Preview the masked sentences and labels for the first 10 samples.
preprocessing(verb_hao[:10])

(['是约４２ｍｍ的遮光罩镜头外型尺寸为以说一尘不染，可见其密封性恨[MASK]。如此笔者发现独立镜头厂家适配０７２５ｘ１２２ｍｍ光学构造为１',
  '很爱听，真像换了个脑筋，明年养多少头牛、羊都一家一户地帮我们算[MASK]了。”东湾乡大泉村二组村民马建雄忙着拉干部到自家去住，“干部住',
  '有关重大问题。第三，企业档案处置具体工作是收集、整理、统计、保管[MASK]企业已经形成的档案，清点库存，按有关规定做好档案留存与销毁的鉴',
  '策与财政政策的协调配合还体现为在经济发展的不同时期，两者都要搭配[MASK]。１９９８年下半年，财政为扩大内需，向国有商业银行增加发行１０',
  '政府贯彻〈中国教育改革和发展纲要〉的意见》。发展教育事业注意处理[MASK]三个关系：在数量和质量的关系上，更加重视质量；在德育和智育的关',
  '学管理的示范。一是要通过学习，增强创新创业的自觉性。要创业、干[MASK]事业，就必须抓重点、带全局。所有的人力、智力、财力、物力、科技',
  '解放思想，寻找经济快速发展的新路子。只有人的思想解放了，才能运用[MASK]中央制定的各项方针和政策，才能敢“闯”敢“新”，走出一条独特的',
  '到手术室，手术室大空间，手术床小范围，手术室到ＩＣＵ等各环节做[MASK]保温，避免外介温度的大波动引起婴儿不良反应。３．３减少患婴术中',
  '例如，在学习“如何做小主人和小客人”这一社会知识时，我们事先联系[MASK]一个家庭，并向主人详细讲解此次活动的目的、方法、步骤，以取得默',
  '切实加强农村信用社金融监管与行业管理的初步设想（一）必须正确处理[MASK]三大关系回、正确处理好信用社与联社的关系。基本原则是既要尊重信'],
 ['好', '好', '好', '好', '好', '好', '好', '好', '好', '好'])

In [75]:
def inference(sentences, top_n = 1):
    """Predict the token hidden behind ``[MASK]`` in each sentence.

    Uses the module-level ``tokenizer`` and ``model``. Each sentence is run
    through the model on its own (batch of one).

    Args:
        sentences: Iterable of strings, each containing a ``[MASK]`` token.
            If a sentence contains several, only the first one is predicted.
        top_n: Number of candidates to return per sentence. Expected to be a
            positive integer.

    Returns:
        A list with one entry per sentence. When ``top_n == 1`` each entry is
        the single best-scoring token (a string); otherwise each entry is a
        list of the ``top_n`` best-scoring tokens, highest score first.

    Raises:
        ValueError: If a sentence contains no ``[MASK]`` token.
    """
    # Ask the tokenizer for the [MASK] id instead of hard-coding 103.
    mask_id = tokenizer.mask_token_id
    predicts = []
    for sentence in sentences:
        input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
        mask_idx = input_ids.tolist()[0].index(mask_id)
        # Inference only: no_grad() avoids building an autograd graph for
        # every sentence.
        with torch.no_grad():
            outputs = model(input_ids)
        # Only the scores at the [MASK] position are needed, so slice them
        # out before converting to NumPy.
        mask_scores = outputs[0][0, mask_idx].detach().numpy()
        if top_n == 1:
            # int(): convert_ids_to_tokens returns a single token string only
            # for a plain Python int.
            best_id = int(mask_scores.argmax())
            predicts.append(tokenizer.convert_ids_to_tokens(best_id))
        else:
            # argsort() is ascending: take the last top_n ids, best first.
            predict_top_n = mask_scores.argsort()[-top_n:][::-1]
            predicts.append(tokenizer.convert_ids_to_tokens(predict_top_n))

    return predicts

### Test with toy example : n = 1

In [51]:
# Mask the first 10 samples and print the top-1 prediction for each.
toy_sent, toy_label = preprocessing(verb_hao[:10])
toy_infer = inference(toy_sent)
print(toy_infer)

['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']


In [54]:
# Run the same inference ten times to check that the predictions do not
# change between runs.
for _ in range(10):
    test_infer = inference(toy_sent)
    print(test_infer)

['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']
['好', '账', '各', '好', '好', '好', '好', '好', '了', '好']


In [56]:
def evaluate(pred, label):
    """Return the fraction of predictions that equal their label.

    Predictions and labels are paired by position with ``zip``, so items
    beyond the shorter of the two sequences are never compared, while the
    denominator is always ``len(pred)``.

    Args:
        pred: Sequence of predicted tokens.
        label: Sequence of gold tokens, aligned with ``pred``.

    Returns:
        Number of matching pairs divided by ``len(pred)``; ``0.0`` when
        ``pred`` is empty.
    """
    if len(pred) == 0:
        # Nothing to score; avoid a ZeroDivisionError.
        return 0.0
    correct = sum(1 for p, l in zip(pred, label) if p == l)
    return correct / len(pred)

In [58]:
# Top-1 accuracy on the 10 toy samples.
evaluate(toy_infer, toy_label)

0.7

In [67]:
# Top-1 accuracy on the first 100 samples. Spelled out with preprocessing /
# inference / evaluate because the get_accuracy helper is only defined in a
# later cell, so calling it here fails on a top-to-bottom run.
sent_100, label_100 = preprocessing(verb_hao[:100])
evaluate(inference(sent_100), label_100)

0.68

### Test with toy example : n = 5

In [84]:
# Show the masked input sentences, then the five best candidates for each.
print("Input Sentences")
toy5_sent,toy5_label = preprocessing(verb_hao[:10])
for sent in toy5_sent:
    print(sent)
inference(toy5_sent, top_n=5)

Input Sentences
是约４２ｍｍ的遮光罩镜头外型尺寸为以说一尘不染，可见其密封性恨[MASK]。如此笔者发现独立镜头厂家适配０７２５ｘ１２２ｍｍ光学构造为１
很爱听，真像换了个脑筋，明年养多少头牛、羊都一家一户地帮我们算[MASK]了。”东湾乡大泉村二组村民马建雄忙着拉干部到自家去住，“干部住
有关重大问题。第三，企业档案处置具体工作是收集、整理、统计、保管[MASK]企业已经形成的档案，清点库存，按有关规定做好档案留存与销毁的鉴
策与财政政策的协调配合还体现为在经济发展的不同时期，两者都要搭配[MASK]。１９９８年下半年，财政为扩大内需，向国有商业银行增加发行１０
政府贯彻〈中国教育改革和发展纲要〉的意见》。发展教育事业注意处理[MASK]三个关系：在数量和质量的关系上，更加重视质量；在德育和智育的关
学管理的示范。一是要通过学习，增强创新创业的自觉性。要创业、干[MASK]事业，就必须抓重点、带全局。所有的人力、智力、财力、物力、科技
解放思想，寻找经济快速发展的新路子。只有人的思想解放了，才能运用[MASK]中央制定的各项方针和政策，才能敢“闯”敢“新”，走出一条独特的
到手术室，手术室大空间，手术床小范围，手术室到ＩＣＵ等各环节做[MASK]保温，避免外介温度的大波动引起婴儿不良反应。３．３减少患婴术中
例如，在学习“如何做小主人和小客人”这一社会知识时，我们事先联系[MASK]一个家庭，并向主人详细讲解此次活动的目的、方法、步骤，以取得默
切实加强农村信用社金融监管与行业管理的初步设想（一）必须正确处理[MASK]三大关系回、正确处理好信用社与联社的关系。基本原则是既要尊重信


[['好', '佳', '强', '高', '差'],
 ['账', '算', '好', '上', '计'],
 ['各', '本', '该', '好', '在'],
 ['好', '住', '上', '合', '着'],
 ['好', '了', '这', '的', '在'],
 ['好', '大', '新', '事', '实'],
 ['好', '党', '住', '新', '了'],
 ['好', '到', '。', '足', '预'],
 ['了', '上', '好', '到', '每'],
 ['好', '这', '"', '的', '。']]

In [60]:
def get_accuracy(data):
    """Compute top-1 masked-token accuracy for annotated sentences.

    Args:
        data: Sentences to hand to ``preprocessing``.

    Returns:
        The result of ``evaluate`` on the predictions from ``inference``
        (default ``top_n``) against the labels from ``preprocessing``.
    """
    masked_sentences, gold_labels = preprocessing(data)
    return evaluate(inference(masked_sentences), gold_labels)

In [203]:
# Top-1 accuracy over each full dataset. inference does one forward pass per
# sentence, so every call here runs the model once for each sentence in the list.
verb_hao_acc = get_accuracy(verb_hao)
verb_dao_acc = get_accuracy(verb_dao)
verb_xialai_acc = get_accuracy(verb_xialai)
verb_xiaqu_acc = get_accuracy(verb_xiaqu)

### 2.4 Final result

In [204]:
# Report the accuracy computed for each dataset.
print("Verb_hao accuracy : ",verb_hao_acc)
print("Verb_dao accuracy : ",verb_dao_acc)
print("Verb_xialai accuracy : ",verb_xialai_acc)
print("Verb_xiaqu accuracy : ",verb_xiaqu_acc)

Verb_hao accuracy :  0.6897
Verb_dao accuracy :  0.7474
Verb_xialai accuracy :  0.8272
Verb_xiaqu accuracy :  0.872
