In [1]:
from transformers import (
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,)

import torch
from transformers import AutoModel

model_name = "bert-base-cased"
model = AutoModel.from_pretrained(model_name)

configuration = model.config

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# python
from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")


In [3]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
data_path = "../../keras/ner_dataset.csv"

def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    enc_tag = preprocessing.LabelEncoder()
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, tag, enc_tag

In [4]:
df = pd.read_csv(data_path, encoding="latin-1")

In [5]:
labels = list(np.unique(df.Tag))

In [6]:
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder
    
class NER_Dataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer, data_path, labels):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.sentences, self.tags, self.tag_encoder = process_csv(data_path)
        self.len = len(self.sentences)
        

        if mode != "test":
            self.label_map = {}
            for i in range(len(labels)):
                self.label_map[labels[i]] = i
                
            possible_labels = np.array(range(len(labels))).reshape(-1, 1)
            self.oneHotEncoder = OneHotEncoder()
            self.oneHotEncoder.fit(possible_labels)
        else:
            self.label_map = None
        
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        self.O_label = self.label_map["O"]

    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            label_tensor = None
        else:
            label = [self.O_label] + self.tags[idx] + [self.O_label]
            label = np.array(label).reshape(-1,1)
            label = self.oneHotEncoder.transform(label).toarray()
            label_tensor = torch.tensor(label, dtype = torch.float32)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ['[CLS]']
        word_pieces += self.sentences[idx]
        word_pieces += ['[SEP]']
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0
        segments_tensor = torch.zeros_like(tokens_tensor)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞

df = pd.read_csv(data_path, encoding="latin-1")
labels = list(np.unique(df.Tag))

trainset = NER_Dataset("train", tokenizer=tokenizer, data_path=data_path, labels= labels)

In [7]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = [s[2] for s in samples]
        label_ids = pad_sequence(label_ids, 
                                  batch_first=True)
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [8]:
BATCH_SIZE = 16*4
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [9]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 56]) 
tensor([[  101, 26159,  1104,  ...,     0,     0,     0],
        [  101,  7239,  3878,  ...,     0,     0,     0],
        [  101,   100,   100,  ...,     0,     0,     0],
        ...,
        [  101,  1130,   170,  ...,   119,   100,   102],
        [  101,  1697,  6096,  ...,     0,     0,     0],
        [  101,  1258,  6086,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([64, 56])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 56])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------


NUM_LABELS = len(labels)

model = BertForTokenClassification.from_pretrained(
    model_name, num_labels=NUM_LABELS)

In [11]:
from transformers import BertPreTrainedModel, BertModel
from torch import nn


In [12]:
import transformers

In [13]:
from transformers.modeling_outputs import TokenClassifierOutput

In [14]:

class BertForTokenMultiLabelClassification(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else True#self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            #inputs_embeds=inputs_embeds,
            #output_attentions=output_attentions,
            #output_hidden_states=output_hidden_states,
            #return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        logits = torch.sigmoid(logits)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCELoss()
            loss_fct.ignore_index = -100
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                #active_logits = logits.view(-1, self.num_labels)
                #active_labels = torch.where(
                #    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                #)

                active_logits = logits.view(-1, self.num_labels)[attention_mask.view(-1)== 1]
                active_labels = labels.view(-1, self.num_labels)[attention_mask.view(-1)== 1]
                #print(f"{logits.shape}\n{active_logits.shape}, {active_labels.shape}")
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



In [15]:
model = BertForTokenMultiLabelClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenMultiLabelClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenMultiLabelClassification were not initialized fr

In [16]:
model

BertForTokenMultiLabelClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [17]:
# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=17, bias=True)


In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [19]:
with torch.no_grad():
    # 遍巡整個資料集
    for data in trainloader:
        data = [t.to("cuda:0") for t in data if t is not None]
        tokens_tensors, segments_tensors, masks_tensors = data[:3]
        outputs = model(input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors)
        break

In [21]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 16  # 幸運數字
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)
    acc = 0

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))
    

[epoch 1] loss: 9.127, acc: 0.000
[epoch 2] loss: 7.938, acc: 0.000
[epoch 3] loss: 7.204, acc: 0.000
[epoch 4] loss: 6.608, acc: 0.000
[epoch 5] loss: 6.094, acc: 0.000
[epoch 6] loss: 5.578, acc: 0.000
[epoch 7] loss: 5.118, acc: 0.000
[epoch 8] loss: 4.705, acc: 0.000
[epoch 9] loss: 4.334, acc: 0.000
[epoch 10] loss: 3.990, acc: 0.000
[epoch 11] loss: 3.676, acc: 0.000
[epoch 12] loss: 3.393, acc: 0.000
[epoch 13] loss: 3.144, acc: 0.000
[epoch 14] loss: 2.902, acc: 0.000
[epoch 15] loss: 2.675, acc: 0.000
[epoch 16] loss: 2.482, acc: 0.000
CPU times: user 32min 41s, sys: 1.74 s, total: 32min 42s
Wall time: 25min 13s


17

In [109]:
get_predictions(model, trainloader, compute_acc=True)

(tensor([16, 16, 16,  ..., 16, 16, 16], device='cuda:0'), 0.9898610126929566)

In [108]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors, labels = data
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]


            active_logits = logits.view(-1, model.num_labels)[masks_tensors.view(-1)== 1]
            active_labels = labels.view(-1, model.num_labels)[masks_tensors.view(-1)== 1]

            _, pred_label = torch.max(active_logits, 1)

            _, real_label = torch.max(active_labels, 1)

            (pred_label == real_label).sum()/len(real_label)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += real_label.size(0)
                correct += (pred_label == real_label).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [44]:
torch.save(model.state_dict(), "./test_models/0630_16_epoch")

In [373]:
label_id_mapping = trainset.label_map

id_label_mapping = dict()
for key in label_id_mapping.keys():
    id_label_mapping[label_id_mapping[key]] = key

def test_model(model, sentence, device = "cpu"):
    tokenized_sentence = torch.tensor([tokenizer.encode(sentence)])
    pos = torch.tensor([[0] * len(tokenized_sentence)])
    tags = torch.tensor([[0] * len(tokenized_sentence)])

    model = model.to(device)
    outputs = model(input_ids=tokenized_sentence.to(device), 
                    token_type_ids=pos.to(device), 
                    attention_mask=tags.to(device))

    logits = outputs[0]

    _, pred_labels = torch.max(logits, 2)

    out_labels = []
    for row in pred_labels:
        result = list(map(lambda x: id_label_mapping[int(x)], row))
        out_labels.append(result)
    return tokenizer.tokenize(sentence), out_labels[0], logits
    return tokenizer.tokenize(sentence), out_labels[0][1:-1], logits

In [374]:
model2 = BertForTokenMultiLabelClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenMultiLabelClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenMultiLabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenMultiLabelClassification were not initialized fr

In [375]:
model2.load_state_dict(torch.load("./test_models/0630_16_epoch"))

<All keys matched successfully>

In [376]:
sentence = "My name is Eason I live in Taiwan"
sen, pred, logits = test_model(model2, sentence, device = 'cpu')

In [377]:
for i in range(len(sen)):
    print(f"{sen[i]}: {pred[i]}")

My: O
name: O
is: O
E: O
##as: B-per
##on: B-per
I: I-per
live: O
in: O
Taiwan: O


In [378]:
out = logits[0]

In [379]:
target

tensor([6.4062e-04, 1.2314e-04, 6.5276e-03, 1.3721e-03, 1.1180e-04, 1.1220e-01,
        1.5720e-01, 2.4676e-04, 1.0463e-03, 1.3654e-04, 3.3562e-03, 8.4393e-04,
        1.8475e-04, 2.3194e-02, 2.6318e-02, 7.9404e-05, 1.6108e-02],
       grad_fn=<SelectBackward>)

In [381]:
i = 4
print(sen[i-1])
target = out[i]

for i in range(len(target)):
    print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

E
0 B-art  	: 0.00079
1 B-eve  	: 0.00007
2 B-geo  	: 0.00370
3 B-gpe  	: 0.00046
4 B-nat  	: 0.00008
5 B-org  	: 0.03405
6 B-per  	: 0.89344
7 B-tim  	: 0.00018
8 I-art  	: 0.00003
9 I-eve  	: 0.00002
10 I-geo  	: 0.00010
11 I-gpe  	: 0.00003
12 I-nat  	: 0.00001
13 I-org  	: 0.00012
14 I-per  	: 0.00009
15 I-tim  	: 0.00015
16 O      	: 0.01759


In [380]:
i = 5
print(sen[i-1])
target = out[i]

for i in range(len(target)):
    print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

##as
0 B-art  	: 0.00064
1 B-eve  	: 0.00012
2 B-geo  	: 0.00653
3 B-gpe  	: 0.00137
4 B-nat  	: 0.00011
5 B-org  	: 0.11220
6 B-per  	: 0.15720
7 B-tim  	: 0.00025
8 I-art  	: 0.00105
9 I-eve  	: 0.00014
10 I-geo  	: 0.00336
11 I-gpe  	: 0.00084
12 I-nat  	: 0.00018
13 I-org  	: 0.02319
14 I-per  	: 0.02632
15 I-tim  	: 0.00008
16 O      	: 0.01611


In [382]:
i = 6
print(sen[i-1])
target = out[i]

for i in range(len(target)):
    print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

##on
0 B-art  	: 0.00055
1 B-eve  	: 0.00007
2 B-geo  	: 0.00030
3 B-gpe  	: 0.00036
4 B-nat  	: 0.00007
5 B-org  	: 0.00077
6 B-per  	: 0.15957
7 B-tim  	: 0.00030
8 I-art  	: 0.00339
9 I-eve  	: 0.00024
10 I-geo  	: 0.00254
11 I-gpe  	: 0.00093
12 I-nat  	: 0.00028
13 I-org  	: 0.07247
14 I-per  	: 0.59209
15 I-tim  	: 0.00036
16 O      	: 0.00385
