In [206]:
torch.__version__

'1.8.0+cu111'

In [208]:
transformers.__version__

'4.8.1'

In [1]:
from transformers import (
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,)

import torch
from transformers import AutoModel

model_name = "bert-base-cased"
model = AutoModel.from_pretrained(model_name)

configuration = model.config

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# python
from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")


In [3]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
data_path = "../NER_test/multiple_tag_train.csv"

In [4]:


def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tags = df.groupby("Sentence #")["multiTag"].apply(list).values
    return sentences, tags

In [193]:
df.head(50)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Sentence #,Word,POS,Tag,multiTag
0,0,0,Sentence: 1,Thousands,NNS,O,O
1,1,1,,of,IN,O,O
2,2,2,,demonstrators,NNS,O,O
3,3,3,,have,VBP,O,O
4,4,4,,marched,VBN,O,O
5,5,5,,through,IN,O,O
6,6,6,,London,NNP,B-geo,B-geo|Location|Party
7,7,7,,to,TO,O,O
8,8,8,,protest,VB,O,O
9,9,9,,the,DT,O,O


In [5]:
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder
    
    


class NER_Dataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer, data_path, labels):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.sentences, self.tags = process_csv(data_path)
        self.len = len(self.sentences)
        

        if mode != "test":
            self.label_map = {}
            for i in range(len(labels)):
                self.label_map[labels[i]] = i
                
            possible_labels = np.array(range(len(labels))).reshape(-1, 1)
            self.oneHotEncoder = OneHotEncoder()
            self.oneHotEncoder.fit(possible_labels)
        else:
            self.label_map = None
        
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        self.O_label = self.label_map["O"]

    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            label_tensor = None
        else:
            label = ["O"] + self.tags[idx] + ["O"]

            label = np.array(label)
            label = label.reshape(-1,1)

            label = np.apply_along_axis(self.split_one_hot_multiTags, 1, label)
            label_tensor = torch.tensor(label, dtype = torch.float32)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ['[CLS]']
        word_pieces += self.sentences[idx]
        word_pieces += ['[SEP]']
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0
        segments_tensor = torch.zeros_like(tokens_tensor)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    def split_one_hot_multiTags(self, tags):
        # tags = ['B-org|Party|String']
        tags = tags[0]
        tags = tags.split("|")


        tags_num = list(map(lambda x: self.label_map[x], tags))
        #[5, 20, 23]

        tags_num = np.array(tags_num).reshape(-1,1)

        tags_one_hot = self.oneHotEncoder.transform(tags_num).toarray()

        tags_one_hot = tags_one_hot.sum(axis = 0)

        #return torch.tensor(tags_one_hot, dtype = torch.float32)

        return tags_one_hot
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞

df = pd.read_csv(data_path, encoding="latin-1")

labels = np.unique("|".join(list(df.multiTag)).split("|"))
print(f"labels: {labels}")

trainset = NER_Dataset("train", tokenizer=tokenizer, data_path=data_path, labels= labels)

labels: ['B-art' 'B-eve' 'B-geo' 'B-gpe' 'B-nat' 'B-org' 'B-per' 'B-tim' 'Event'
 'I-art' 'I-eve' 'I-geo' 'I-gpe' 'I-nat' 'I-org' 'I-per' 'I-tim'
 'Location' 'O' 'Object' 'Party' 'Race' 'SpecialTerm' 'String'
 'TemporalUnit']


In [6]:
trainset.label_map

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'Event': 8,
 'I-art': 9,
 'I-eve': 10,
 'I-geo': 11,
 'I-gpe': 12,
 'I-nat': 13,
 'I-org': 14,
 'I-per': 15,
 'I-tim': 16,
 'Location': 17,
 'O': 18,
 'Object': 19,
 'Party': 20,
 'Race': 21,
 'SpecialTerm': 22,
 'String': 23,
 'TemporalUnit': 24}

In [7]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = [s[2] for s in samples]
        label_ids = pad_sequence(label_ids, 
                                  batch_first=True)
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

BATCH_SIZE = 64: 10883MiB

In [8]:
BATCH_SIZE = 16*4
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [9]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 56]) 
tensor([[  101, 26159,  1104,  ...,     0,     0,     0],
        [  101,  7239,  3878,  ...,     0,     0,     0],
        [  101,   100,   100,  ...,     0,     0,     0],
        ...,
        [  101,  1130,   170,  ...,   119,   100,   102],
        [  101,  1697,  6096,  ...,     0,     0,     0],
        [  101,  1258,  6086,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([64, 56])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 56])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------


NUM_LABELS = len(labels)

model = BertForTokenClassification.from_pretrained(
    model_name, num_labels=NUM_LABELS)

In [10]:
from transformers import BertPreTrainedModel, BertModel
from torch import nn


In [11]:
import transformers

In [12]:
from transformers.modeling_outputs import TokenClassifierOutput

In [13]:

class BertForTokenMultiLabelClassification(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else True#self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            #inputs_embeds=inputs_embeds,
            #output_attentions=output_attentions,
            #output_hidden_states=output_hidden_states,
            #return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        logits = torch.sigmoid(logits)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCELoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                #active_logits = logits.view(-1, self.num_labels)
                #active_labels = torch.where(
                #    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                #)

                active_logits = logits.view(-1, self.num_labels)[attention_mask.view(-1)== 1]
                active_labels = labels.view(-1, self.num_labels)[attention_mask.view(-1)== 1]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



In [14]:
NUM_LABELS = len(labels)
model = BertForTokenMultiLabelClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenMultiLabelClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenMultiLabelClassification were not initialized fr

In [15]:
model

BertForTokenMultiLabelClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [16]:
# high-level show modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=25, bias=True)


In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [18]:
with torch.no_grad():
    # test_run
    for data in trainloader:
        data = [t.to("cuda:0") for t in data if t is not None]
        tokens_tensors, segments_tensors, masks_tensors = data[:3]
        outputs = model(input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors)
        break

In [None]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 16  # 
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # add to batch loss
        running_loss += loss.item()
        
    
    #_, acc = get_predictions(model, trainloader, compute_acc=True)
    acc = 0 # no accurate calculation to save training time.
    
    #print('[epoch %d] loss: %.3f, acc: %.3f' %
    #      (epoch + 1, running_loss, acc))
    
    print('[epoch %d] loss: %.3f' %
      (epoch + 1, running_loss))
    

[epoch 1] loss: 66.058, acc: 0.000
[epoch 2] loss: 20.139, acc: 0.000
[epoch 3] loss: 14.409, acc: 0.000
[epoch 4] loss: 11.926, acc: 0.000
[epoch 5] loss: 10.402, acc: 0.000
[epoch 6] loss: 9.266, acc: 0.000
[epoch 7] loss: 8.369, acc: 0.000
[epoch 8] loss: 7.601, acc: 0.000
[epoch 9] loss: 6.940, acc: 0.000
[epoch 10] loss: 6.412, acc: 0.000


In [169]:
print('[epoch %d] loss: %.3f' %
      (epoch + 1, running_loss))

[epoch 16] loss: 3.947


In [23]:
torch.save(model.state_dict(), "./test_models/0704_multi_label_16_epoch")

In [None]:
torch.save(model.state_dict(), "./test_models/0704_16_epoch")

In [68]:
label_id_mapping = trainset.label_map

id_label_mapping = dict()
for key in label_id_mapping.keys():
    id_label_mapping[label_id_mapping[key]] = key

def test_model(model, sentence, device = "cpu"):
    tokenized_sentence = torch.tensor([tokenizer.encode(sentence)])
    pos = torch.tensor([[0] * len(tokenized_sentence)])
    tags = torch.tensor([[1] * len(tokenized_sentence)])

    model = model.to(device)
    outputs = model(input_ids=tokenized_sentence.to(device), 
                    token_type_ids=pos.to(device), 
                    attention_mask=tags.to(device))

    logits = outputs[0]

    _, pred_labels = torch.max(logits, 2)

    out_labels = []
    for row in pred_labels:
        result = list(map(lambda x: id_label_mapping[int(x)], row))
        out_labels.append(result)
    #return tokenizer.tokenize(sentence), out_labels[0], logits
    return tokenizer.tokenize(sentence), out_labels[0][1:-1], logits[:, 1:-1]

In [170]:
model2 = BertForTokenMultiLabelClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenMultiLabelClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenMultiLabelClassification were not initialized fr

In [195]:
model2.load_state_dict(torch.load("./test_models/0704_multi_label_16_epoch"))

<All keys matched successfully>

In [196]:
sentence = "Dan will be deemed to have completed its delivery obligations before 2021-07-05 if in Niall's opinion, the Jeep Car satisfies the Acceptance Criteria, and Niall notifies Dan in writing that it is accepting the Jeep Car."
sen, pred, logits = test_model(model2, sentence, device = 'cpu')


In [197]:
np.array(sen)

array(['Dan', 'will', 'be', 'deemed', 'to', 'have', 'completed', 'its',
       'delivery', 'obligations', 'before', '202', '##1', '-', '07', '-',
       '05', 'if', 'in', 'Niall', "'", 's', 'opinion', ',', 'the', 'Jeep',
       'Car', 'sat', '##is', '##fies', 'the', 'A', '##cc', '##ept',
       '##ance', 'C', '##rite', '##ria', ',', 'and', 'Niall', 'not',
       '##ifies', 'Dan', 'in', 'writing', 'that', 'it', 'is', 'accepting',
       'the', 'Jeep', 'Car', '.'], dtype='<U11')

In [198]:
np.array(pred)

array(['Party', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'String', 'String', 'I-tim', 'String', 'String', 'String', 'O',
       'O', 'Party', 'O', 'O', 'O', 'O', 'O', 'String', 'String', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'Party', 'O', 'O', 'Party', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'String', 'String', 'O'], dtype='<U6')

In [199]:
logits.shape

torch.Size([1, 54, 25])

In [200]:
for i in range(len(sen)):
    print(f"{sen[i]}: {pred[i]}")

Dan: Party
will: O
be: O
deemed: O
to: O
have: O
completed: O
its: O
delivery: O
obligations: O
before: O
202: String
##1: String
-: I-tim
07: String
-: String
05: String
if: O
in: O
Niall: Party
': O
s: O
opinion: O
,: O
the: O
Jeep: String
Car: String
sat: O
##is: O
##fies: O
the: O
A: O
##cc: O
##ept: O
##ance: O
C: O
##rite: O
##ria: O
,: O
and: O
Niall: Party
not: O
##ifies: O
Dan: Party
in: O
writing: O
that: O
it: O
is: O
accepting: O
the: O
Jeep: String
Car: String
.: O


In [201]:
out = logits[0]

In [202]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


In [203]:
def interact_word(i):
    print(i)
    print(sen[i])
    target = out[i]

    for i in range(len(target)):
        print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

In [204]:
sen[12]

'##1'

In [205]:
interact(lambda x: interact_word(x), x=widgets.IntSlider(min=0, max=len(sen)-1, step=1, value=0))

interactive(children=(IntSlider(value=0, description='x', max=53), Output()), _dom_classes=('widget-interact',…

<function __main__.<lambda>(x)>

In [168]:

i = 3
print(sen[i])
target = out[i]

for i in range(len(target)):
    print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

deemed
0 B-art  	: 0.00001
1 B-eve  	: 0.00001
2 B-geo  	: 0.00003
3 B-gpe  	: 0.00003
4 B-nat  	: 0.00001
5 B-org  	: 0.00003
6 B-per  	: 0.00002
7 B-tim  	: 0.00002
8 Event  	: 0.00002
9 I-art  	: 0.00001
10 I-eve  	: 0.00001
11 I-geo  	: 0.00002
12 I-gpe  	: 0.00001
13 I-nat  	: 0.00001
14 I-org  	: 0.00002
15 I-per  	: 0.00002
16 I-tim  	: 0.00001
17 Location 	: 0.00003
18 O      	: 0.99992
19 Object 	: 0.00002
20 Party  	: 0.00004
21 Race   	: 0.00004
22 SpecialTerm 	: 0.00001
23 String 	: 0.00006
24 TemporalUnit 	: 0.00003


In [None]:
i = 5
print(sen[i-1])
target = out[i]

for i in range(len(target)):
    print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

In [None]:
i = 6
print(sen[i-1])
target = out[i]

for i in range(len(target)):
    print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")