In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
print(f"Torch Version: {torch.__version__}")

import transformers
print(f"transformers (Adapter) Version: {transformers.__version__}")

Torch Version: 1.8.1
transformers (Adapter) Version: 2.0.1


In [3]:
from transformers import RobertaTokenizer
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")

In [4]:
data_path = "./NER_multilabel_data_v2.csv"
df = pd.read_csv(data_path)

all_tags = df.newTag

all_tags = set(all_tags)

all_tags = "|".join(all_tags)
all_tags = all_tags.split("|")
all_tags = set(all_tags)
all_tags = list(all_tags)

In [5]:

def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tags = df.groupby("Sentence #")["newTag"].apply(list).values
    return sentences, tags

sentences, tags = process_csv(data_path)

In [6]:
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder
    
    


class NER_Dataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer, data_path, labels):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.sentences, self.tags = process_csv(data_path)
        self.len = len(self.sentences)
        

        if mode != "test":
            self.label_map = {}
            for i in range(len(labels)):
                self.label_map[labels[i]] = i
                
            possible_labels = np.array(range(len(labels))).reshape(-1, 1)
            self.oneHotEncoder = OneHotEncoder()
            self.oneHotEncoder.fit(possible_labels)
        else:
            self.label_map = None
        
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        self.O_label = self.label_map["O"]

    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            label_tensor = None
        else:
            label = ["O"] + self.tags[idx] + ["O"]

            label = np.array(label)
            label = label.reshape(-1,1)

            label = np.apply_along_axis(self.split_one_hot_multiTags, 1, label)
            label_tensor = torch.tensor(label, dtype = torch.float32)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ['[CLS]']
        word_pieces += self.sentences[idx]
        word_pieces += ['[SEP]']
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0
        segments_tensor = torch.zeros_like(tokens_tensor)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    def split_one_hot_multiTags(self, tags):
        # tags = ['B-org|Party|String']
        tags = tags[0]
        tags = tags.split("|")


        tags_num = list(map(lambda x: self.label_map[x], tags))
        #[5, 20, 23]

        tags_num = np.array(tags_num).reshape(-1,1)

        tags_one_hot = self.oneHotEncoder.transform(tags_num).toarray()

        tags_one_hot = tags_one_hot.sum(axis = 0)

        #return torch.tensor(tags_one_hot, dtype = torch.float32)

        return tags_one_hot
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞

df = pd.read_csv(data_path, encoding="latin-1")

labels = np.unique("|".join(list(df.newTag)).split("|"))
print(f"labels: {labels}")

trainset = NER_Dataset("train", tokenizer=tokenizer, data_path=data_path, labels= labels)

labels: ['B-art' 'B-eve' 'B-geo' 'B-gpe' 'B-nat' 'B-org' 'B-per' 'B-tim'
 'CountryCode' 'CryptoCurrencyCode' 'CurrencyCode' 'Event' 'Float' 'I-art'
 'I-eve' 'I-geo' 'I-gpe' 'I-nat' 'I-org' 'I-per' 'I-tim' 'Integer'
 'Location' 'Month' 'O' 'Object' 'Party' 'Race' 'SpecialTerm'
 'TemporalUnit' 'Time' 'Timezone' 'US_States']


In [7]:
from torch.utils.data import DataLoader, IterableDataset
from torch.nn.utils.rnn import pad_sequence
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = [s[2] for s in samples]
        label_ids = pad_sequence(label_ids, 
                                  batch_first=True)
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [8]:
trainset.id2label = {}
for key in trainset.label_map.keys():
    trainset.id2label[trainset.label_map[key]] = key

In [9]:
from transformers import RobertaConfig, RobertaModelWithHeads

config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=len(all_tags),
    label2id = trainset.label_map, 
    id2label = trainset.id2label
)
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

name = model.load_adapter("./save_adapters/ALL_tag_0730")
model.add_tagging_head(
        name,
        num_labels=len(trainset.label_map.keys()), overwrite_ok=True
      )
model.train_adapter(name)

In [10]:
BATCH_SIZE = 256
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [11]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

'''print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")'''

'print(f"""\ntokens_tensors.shape   = {tokens_tensors.shape} \n{tokens_tensors}\n------------------------\nsegments_tensors.shape = {segments_tensors.shape}\n{segments_tensors}\n------------------------\nmasks_tensors.shape    = {masks_tensors.shape}\n{masks_tensors}\n------------------------\nlabel_ids.shape        = {label_ids.shape}\n{label_ids}\n""")'


no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                    "weight_decay": 1e-5,
                },
                {
                    "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                    "weight_decay": 0.0,
                },
            ]
optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=1e-4)


In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [13]:
all_tags = ['Float','TemporalUnit','I-gpe','CountryCode','CurrencyCode','Timezone','CryptoCurrencyCode','Month','Party','B-tim','I-art','Time','B-per','B-gpe','B-geo','O','Location','Event','I-nat','Race','B-org','I-geo','I-tim','I-eve','SpecialTerm','B-art','US_States','B-eve','I-org','B-nat','Object','I-per','Integer']

In [1]:
from telegram_notifier import send_message as telegram_bot_sendtext

In [15]:
train_id = 0
device = torch.device(f"cuda:{train_id}" if torch.cuda.is_available() else "cpu")

model = model.to(device)

In [None]:
for index, tag in enumerate(all_tags):
    if index % 2 == train_id:
        print(f"\nSkip {tag}.\n")
        continue
    model.add_adapter(tag)
    model.add_tagging_head(
        tag,
        num_labels=2
      )
    model.train_adapter(tag)
    model = model.to(device)
    for epoch in range(2):
        print(f"\n{tag}: epoch {epoch}")
        for i, data in enumerate(trainloader):

            tokens_tensors, segments_tensors, \
            masks_tensors, labels = [t.to(device) for t in data]

            outputs = model(input_ids = tokens_tensors,
                attention_mask=masks_tensors,
                token_type_ids=segments_tensors)


            logits = outputs[0]

            current_label = labels.view(-1, labels.shape[-1])[:, trainset.label_map[tag]]
            current_label = current_label.view(-1)

            active_logits = logits.view(-1, logits.shape[-1])[masks_tensors.view(-1) == 1]
            active_labels = current_label[masks_tensors.view(-1)== 1]

            active_labels = active_labels.long()

            loss_fct = torch.nn.CrossEntropyLoss()

            loss = loss_fct(active_logits, active_labels)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if i % 10 == 0:
                print(f"\tLoss: {loss}")
        telegram_bot_sendtext(f"\n{tag}: epoch {epoch}, loss = {loss}")
    model.save_adapter(f"./save_adapters/{tag}_0730", model.active_adapters[0])
    model.save_head(f"./save_heads/{tag}_0730", model.active_head)


Skip Float.


TemporalUnit: epoch 0
	Loss: 0.5391535758972168
	Loss: 0.5389758348464966
	Loss: 0.5388613343238831
	Loss: 0.5377902388572693
	Loss: 0.5413982272148132
	Loss: 0.5388877987861633
	Loss: 0.5381402969360352
	Loss: 0.5406540036201477
	Loss: 0.5400776863098145
	Loss: 0.5401015281677246
	Loss: 0.5411549210548401
	Loss: 0.5415317416191101
	Loss: 0.5394170880317688
	Loss: 0.5425978899002075
	Loss: 0.5405617952346802
	Loss: 0.5384573936462402
	Loss: 0.5415876507759094
	Loss: 0.5379591584205627
	Loss: 0.5425953269004822

TemporalUnit: epoch 1
	Loss: 0.5402517914772034
	Loss: 0.5376653075218201
	Loss: 0.5391207337379456
	Loss: 0.5397608876228333
	Loss: 0.5389649271965027
	Loss: 0.5378719568252563
	Loss: 0.5390498042106628
	Loss: 0.5396788120269775
	Loss: 0.5404996871948242
	Loss: 0.5411071181297302
	Loss: 0.5388025641441345
	Loss: 0.5425803661346436
	Loss: 0.5401619076728821
	Loss: 0.5422891974449158
	Loss: 0.5396026372909546
	Loss: 0.5394212603569031
	Loss: 0.5414801239967346
	Los

In [None]:
label_id_mapping = trainset.label_map

id_label_mapping = dict()
for key in label_id_mapping.keys():
    id_label_mapping[label_id_mapping[key]] = key

def test_model(model, sentence, device = "cpu"):
    tokenized_sentence = torch.tensor([tokenizer.encode(sentence)])
    pos = torch.tensor([[0] * len(tokenized_sentence)])
    tags = torch.tensor([[1] * len(tokenized_sentence)])

    model = model.to(device)
    outputs = model(input_ids=tokenized_sentence.to(device), 
                    token_type_ids=pos.to(device), 
                    attention_mask=tags.to(device))

    logits = outputs[0]

    _, pred_labels = torch.max(logits, 2)

    out_labels = []
    for row in pred_labels:
        result = list(map(lambda x: id_label_mapping[int(x)], row))
        out_labels.append(result)
    #return tokenizer.tokenize(sentence), out_labels[0], logits
    return tokenizer.tokenize(sentence), out_labels[0][1:-1], logits[:, 1:-1]

In [None]:
sentence = "Dan Will be deemed to have completed its delivery obligations before 2021-7-5 if in Niall's opinion, the Jeep Car satisfies the Acceptance Criteria, and Niall notifies Dan in writing that it is accepting the Jeep Car."
sen, pred, logits = test_model(model, sentence, device = 'cpu')

In [None]:
a = tokenizer.tokenize(sentence)[1]

In [None]:
np.array(sen)

In [None]:
np.array(pred)

In [None]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
def interact_word(i):
    print(i)
    print(sen[i])
    target = out[i]

    for i in range(len(target)):
        print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

In [None]:
out = logits[0]
interact(lambda x: interact_word(x), x=widgets.IntSlider(min=0, max=len(sen)-1, step=1, value=0))

In [None]:
print("OK")