## 1. Preprocessing

### 1.1 Chinese preprocessing package (SnowNLP)

In [9]:
# 중국어 전처리 패키지
from snownlp import SnowNLP
# pd dataframe
import pandas as pd

In [10]:
# Wrap a sample sentence in a SnowNLP object so its words, tags and sentiments can be inspected.
s = SnowNLP(u'这个东西太贵')

In [11]:
# Word segmentation of the sample sentence.
s.words

['这个', '东西', '太', '贵']

In [12]:
# Print each (word, part-of-speech tag) pair of the sample sentence on its own line.
for word_tag in s.tags:
    print(word_tag)

('这个', 'r')
('东西', 'n')
('太', 'd')
('贵', 'a')


In [13]:
# Sentiment score SnowNLP assigns to the sample sentence.
s.sentiments

0.3091627951693915

### 1.2 Load data

In [14]:
def read_file(filename):
    """Read a corpus file and return the text that follows ``</B>`` on each line.

    Each non-blank line is stripped of surrounding whitespace and split on the
    literal ``</B>``; the piece right after the first ``</B>`` is kept.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        One entry per non-blank line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line does not contain ``</B>``.
    """
    # Decode explicitly as UTF-8: the corpus is Chinese text, and relying on the
    # platform default encoding makes the result machine-dependent.
    with open(filename, "r", encoding="utf-8") as file:
        return [
            stripped.split("</B>")[1]
            for stripped in (line.strip() for line in file)
            if stripped  # blank lines (e.g. a trailing newline) carry no example
        ]

In [15]:
# Load the four corpora (one file per construction) with read_file.
corpus_paths = (
    "../data/verb_hao.txt",
    "../data/verb_dao.txt",
    "../data/verb_xialai.txt",
    "../data/verb_xiaqu.txt",
)
verb_hao, verb_dao, verb_xialai, verb_xiaqu = map(read_file, corpus_paths)

# Sanity check: number of examples and the first example of one corpus.
print(len(verb_xiaqu), verb_xiaqu[0], sep="\n")

10000
俺听听，也为你分忧。”宋江道:“也没什么大事,只是弟兄们这样<U>闹下去</U>,不成体统,咱们得想个法子,给兄弟们排排座次。刘庆邦在《凭良心


## 2. Training

### 2.1 Load model

In [16]:
import torch
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
from transformers import BertForMaskedLM

In [17]:
# Device selection: prefer CUDA when it is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 2 GPU(s) available.
We will use the GPU: TITAN RTX


In [18]:
# Name of the pretrained checkpoint passed to from_pretrained for both tokenizer and model.
MODEL_TYPE = 'bert-base-chinese'
# NOTE(review): MAX_SIZE and BATCH_SIZE are not referenced in the visible cells;
# presumably a maximum sequence length and a batch size — confirm or remove.
MAX_SIZE = 150
BATCH_SIZE = 200

In [19]:
# Load the tokenizer and the masked-language-model head from the same pretrained checkpoint.
# NOTE(review): the model is not moved to `device` here, so it stays wherever
# from_pretrained puts it — confirm whether GPU inference was intended.
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)
model = BertForMaskedLM.from_pretrained(MODEL_TYPE)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Token ids of a sample sentence containing one [MASK]; return_tensors="pt" requests PyTorch tensors.
input_ids = tokenizer("这个东西[MASK]贵", return_tensors="pt")["input_ids"]

In [21]:
# Display the full tokenizer output (input_ids, token_type_ids, attention_mask) for the same sentence.
tokenizer("这个东西[MASK]贵", return_tensors="pt")

{'input_ids': tensor([[ 101, 6821,  702,  691, 6205,  103, 6586,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [22]:
# Inspect the id tensor; 103 is the id of the [MASK] token.
input_ids

tensor([[ 101, 6821,  702,  691, 6205,  103, 6586,  102]])

In [23]:
# Position of the [MASK] token in the sequence. The id is looked up on the
# tokenizer instead of hard-coding 103, so the cell stays correct for other vocabularies.
input_ids.tolist()[0].index(tokenizer.mask_token_id)

5

In [24]:
# Switch the model to evaluation mode; the module repr returned by eval() is shown as the cell output.
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [25]:
# Inference-only forward pass: no_grad avoids building the autograd graph,
# which saves memory and time since no backward pass is run.
with torch.no_grad():
    outputs = model(input_ids)

In [26]:
# The first element of the model output holds the prediction logits.
logits = outputs[0]

In [27]:
# Convert the logits to a NumPy array. Deriving from outputs[0] (rather than
# rebinding logits from itself) keeps this cell re-runnable: a second run would
# otherwise call .detach() on an ndarray and fail.
logits = outputs[0].detach().numpy()

In [28]:
# Shape check: (batch, sequence length, vocabulary size).
logits.shape

(1, 8, 21128)

In [29]:
# Highest-scoring vocabulary id at every position (argmax over the last axis).
predict = np.argmax(logits, axis=2)

In [30]:
# Predicted ids for the single sentence in the batch.
predict[0]

array([8024,  511,  702,  691, 6205, 2523, 8043,  511])

In [31]:
# Convert each row of predicted ids back to tokens. A comprehension is used so
# that no loop variable leaks into the notebook namespace; the original loop
# variable was named `id` and shadowed the builtin for every later cell.
sentence = [tokenizer.convert_ids_to_tokens(row_ids) for row_ids in predict]

In [32]:
# Display the predicted tokens; each inner list corresponds to one input sentence.
sentence

[['，', '。', '个', '东', '西', '很', '？', '。']]

### 2.2 Inference without fine-tuning

In [33]:
def preprocessing(sentences):
    """Turn ``<U>...</U>``-annotated sentences into masked inputs and gold labels.

    For every sentence the last character inside the ``<U>...</U>`` span is
    taken as the label and replaced by ``[MASK]``; the ``<U>``/``</U>`` markers
    themselves are dropped.

    Parameters
    ----------
    sentences : iterable of str
        Sentences that each contain a ``<U>...</U>`` span.

    Returns
    -------
    tuple of (list of str, list of str)
        The masked sentences and the corresponding one-character labels.

    Raises
    ------
    IndexError
        If a sentence lacks ``<U>`` or ``</U>``, or the marked span is empty.
    """
    masked_sentences, gold_chars = [], []
    for text in sentences:
        pieces = text.split("<U>")
        prefix = pieces[0]
        span_and_rest = pieces[1].split("</U>")
        target = span_and_rest[0]
        # The final character of the marked span is what the model must recover.
        gold_chars.append(target[-1])
        masked_sentences.append(
            "".join([prefix, target[:-1], "[MASK]", span_and_rest[1]])
        )
    return masked_sentences, gold_chars

In [34]:
def inference(sentences):
    """Predict the token at the ``[MASK]`` position of each sentence.

    Uses the module-level ``tokenizer`` and ``model``. Sentences are processed
    one at a time and no truncation is applied by the tokenizer call.

    Parameters
    ----------
    sentences : iterable of str
        Sentences that each contain a ``[MASK]`` token.

    Returns
    -------
    list of str
        The highest-scoring token for the first ``[MASK]`` of each sentence.

    Raises
    ------
    ValueError
        If a tokenized sentence contains no ``[MASK]`` id.
    """
    # Ask the tokenizer for the mask id instead of hard-coding 103.
    mask_id = tokenizer.mask_token_id
    # Run on whatever device the model currently lives on.
    model_device = next(model.parameters()).device

    predicts = []
    # Pure inference: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for sentence in sentences:
            input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]
            mask_idx = input_ids[0].tolist().index(mask_id)
            logits = model(input_ids.to(model_device))[0]
            # Only the mask position is needed, so argmax just that row.
            mask_scores = logits[0, mask_idx].cpu().numpy()
            best_id = int(np.argmax(mask_scores))
            predicts.append(tokenizer.convert_ids_to_tokens(best_id))
    return predicts

In [35]:
def evaluate(pred, label):
    """Return the fraction of predictions that equal their labels.

    Predictions and labels are paired positionally with ``zip``; the count of
    matches is divided by ``len(pred)``, so predictions without a paired label
    count as wrong.

    Parameters
    ----------
    pred : sequence
        Predicted values.
    label : sequence
        Gold values, aligned with ``pred``.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``; ``0.0`` when ``pred`` is empty.
    """
    # Guard the empty case instead of raising ZeroDivisionError.
    if len(pred) == 0:
        return 0.0
    correct = sum(1 for p, l in zip(pred, label) if p == l)
    return correct / len(pred)

In [36]:
def get_accuracy(data):
    """Run the full pipeline on annotated sentences and return the accuracy.

    The sentences are masked with ``preprocessing``, predicted with
    ``inference`` and scored against the gold labels with ``evaluate``.
    """
    masked_sentences, gold_labels = preprocessing(data)
    return evaluate(inference(masked_sentences), gold_labels)

In [38]:
# Quick check on the first 100 verb_hao examples before running the full corpora.
get_accuracy(verb_hao[:100])

0.68

In [203]:
# Accuracy of the un-fine-tuned model on each full corpus, computed in the same order as loaded.
verb_hao_acc, verb_dao_acc, verb_xialai_acc, verb_xiaqu_acc = (
    get_accuracy(corpus)
    for corpus in (verb_hao, verb_dao, verb_xialai, verb_xiaqu)
)

### 2.3 Final result

In [204]:
# Report the accuracy obtained on each corpus.
for title, accuracy in (
    ("Verb_hao accuracy : ", verb_hao_acc),
    ("Verb_dao accuracy : ", verb_dao_acc),
    ("Verb_xialai accuracy : ", verb_xialai_acc),
    ("Verb_xiaqu accuracy : ", verb_xiaqu_acc),
):
    print(title, accuracy)

Verb_hao accuracy :  0.6897
Verb_dao accuracy :  0.7474
Verb_xialai accuracy :  0.8272
Verb_xiaqu accuracy :  0.872
