In [2]:
import pandas as pd
import numpy as np

In [3]:
import torch
print(f"Torch Version: {torch.__version__}")

import transformers
print(f"transformers (Adapter) Version: {transformers.__version__}")

Torch Version: 1.8.1
transformers (Adapter) Version: 2.0.1


In [4]:
from transformers import RobertaTokenizer
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")

In [5]:
data_path = "./NER_multilabel_data_v2.csv"
df = pd.read_csv(data_path)

In [6]:
all_tags = df.newTag

all_tags = set(all_tags)

all_tags = "|".join(all_tags)
all_tags = all_tags.split("|")
all_tags = set(all_tags)
all_tags = list(all_tags)

In [7]:

def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tags = df.groupby("Sentence #")["newTag"].apply(list).values
    return sentences, tags

In [8]:
sentences, tags = process_csv(data_path)

In [9]:
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder
    
    


class NER_Dataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer, data_path, labels):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.sentences, self.tags = process_csv(data_path)
        self.len = len(self.sentences)
        

        if mode != "test":
            self.label_map = {}
            for i in range(len(labels)):
                self.label_map[labels[i]] = i
                
            possible_labels = np.array(range(len(labels))).reshape(-1, 1)
            self.oneHotEncoder = OneHotEncoder()
            self.oneHotEncoder.fit(possible_labels)
        else:
            self.label_map = None
        
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        self.O_label = self.label_map["O"]

    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            label_tensor = None
        else:
            label = ["O"] + self.tags[idx] + ["O"]

            label = np.array(label)
            label = label.reshape(-1,1)

            label = np.apply_along_axis(self.split_one_hot_multiTags, 1, label)
            label_tensor = torch.tensor(label, dtype = torch.float32)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ['[CLS]']
        word_pieces += self.sentences[idx]
        word_pieces += ['[SEP]']
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0
        segments_tensor = torch.zeros_like(tokens_tensor)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    def split_one_hot_multiTags(self, tags):
        # tags = ['B-org|Party|String']
        tags = tags[0]
        tags = tags.split("|")


        tags_num = list(map(lambda x: self.label_map[x], tags))
        #[5, 20, 23]

        tags_num = np.array(tags_num).reshape(-1,1)

        tags_one_hot = self.oneHotEncoder.transform(tags_num).toarray()

        tags_one_hot = tags_one_hot.sum(axis = 0)

        #return torch.tensor(tags_one_hot, dtype = torch.float32)

        return tags_one_hot
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞

df = pd.read_csv(data_path, encoding="latin-1")

labels = np.unique("|".join(list(df.newTag)).split("|"))
print(f"labels: {labels}")

trainset = NER_Dataset("train", tokenizer=tokenizer, data_path=data_path, labels= labels)

labels: ['B-art' 'B-eve' 'B-geo' 'B-gpe' 'B-nat' 'B-org' 'B-per' 'B-tim'
 'CountryCode' 'CryptoCurrencyCode' 'CurrencyCode' 'Event' 'Float' 'I-art'
 'I-eve' 'I-geo' 'I-gpe' 'I-nat' 'I-org' 'I-per' 'I-tim' 'Integer'
 'Location' 'Month' 'O' 'Object' 'Party' 'Race' 'SpecialTerm'
 'TemporalUnit' 'Time' 'Timezone' 'US_States']


In [10]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = [s[2] for s in samples]
        label_ids = pad_sequence(label_ids, 
                                  batch_first=True)
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [11]:
from transformers import RobertaConfig, RobertaModelWithHeads

config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=2,
)
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [12]:
for tag in all_tags:
    model.add_adapter(tag)
    model.add_tagging_head(
        tag,
        num_labels=2
      )

In [12]:
model.train_adapter(all_tags)

In [13]:
model.active_adapters

Stack[I-tim, SpecialTerm, CryptoCurrencyCode, I-gpe, O, I-nat, Party, Month, Float, B-gpe, B-geo, B-per, I-eve, I-org, Location, Integer, Object, B-org, B-nat, CountryCode, B-art, US_States, Race, I-per, Time, I-geo, Event, B-tim, TemporalUnit, CurrencyCode, Timezone, I-art, B-eve]

In [14]:
model.active_head

'B-eve'

In [15]:
from transformers.adapters.composition import Parallel

In [16]:
parallel = eval("Parallel('" + "','".join(all_tags) + "')")

In [17]:
model.set_active_adapters(parallel)

In [18]:
np.array(model.active_head)

array(['I-tim', 'SpecialTerm', 'CryptoCurrencyCode', 'I-gpe', 'O',
       'I-nat', 'Party', 'Month', 'Float', 'B-gpe', 'B-geo', 'B-per',
       'I-eve', 'I-org', 'Location', 'Integer', 'Object', 'B-org',
       'B-nat', 'CountryCode', 'B-art', 'US_States', 'Race', 'I-per',
       'Time', 'I-geo', 'Event', 'B-tim', 'TemporalUnit', 'CurrencyCode',
       'Timezone', 'I-art', 'B-eve'], dtype='<U18')

In [19]:
BATCH_SIZE = 4
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [20]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

'''print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")'''

'print(f"""\ntokens_tensors.shape   = {tokens_tensors.shape} \n{tokens_tensors}\n------------------------\nsegments_tensors.shape = {segments_tensors.shape}\n{segments_tensors}\n------------------------\nmasks_tensors.shape    = {masks_tensors.shape}\n{masks_tensors}\n------------------------\nlabel_ids.shape        = {label_ids.shape}\n{label_ids}\n""")'

In [21]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

model = model.to(device)

In [22]:
%%time
"""times = 0
for data in trainloader:
    times += 1"""

"""CPU times: user 4min 19s, sys: 212 ms, total: 4min 19s
Wall time: 4min 19s"""

CPU times: user 13 µs, sys: 3 µs, total: 16 µs
Wall time: 28.1 µs


'CPU times: user 4min 19s, sys: 212 ms, total: 4min 19s\nWall time: 4min 19s'

In [23]:
times = 11990

In [24]:
class Loss_count:
    def __init__(self):
        self.all_loss = 0
        self.times = 0
    def add(self, i):
        self.all_loss += i
        self.times += 1

In [25]:
def forward(model, data, device, optimizer, loss_count):
    tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = [t.to(device) for t in data]

    out = model(input_ids = tokens_tensors, attention_mask = masks_tensors, segments_tensors = segments_tensors)

    optimizer.zero_grad()
    for i in range(len(all_tags)):
        adapter_name = all_tags[i]
        tag_index = trainset.label_map[adapter_name]

        current_tag = label_ids[:,:, tag_index]


        out[i][0].shape

        actual = torch.zeros(out[i][0].shape, device = device)

        actual[:,:,1] = current_tag
        actual[:,:,0] = (current_tag == 0).float()

        loss_fct = torch.nn.BCEWithLogitsLoss()

        loss = loss_fct(out[i][0], actual)
        loss_count.add(loss)
        if i <= len(all_tags) -1:
            loss.backward(retain_graph=True)
        else:
            loss.backward()
    optimizer.step()
    
    del tokens_tensors, segments_tensors, masks_tensors, label_ids, loss, actual, current_tag
    torch.cuda.empty_cache()

In [26]:

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 3
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 
        optimizer.zero_grad()
        
        # forward pass
        out = torch.zeros((tokens_tensors.shape[0], tokens_tensors.shape[1]), device = device).long()
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=out)

        loss = outputs[0][0]
        # backward
        loss.backward()
        optimizer.step()


        # add to batch loss
        running_loss += loss.item()

KeyboardInterrupt: 

# 目前看來外部再算 Loss 的方法行不通，記憶體會炸掉，因此改為試試看，在 Label 的部分做文章，然後選擇性地去 Loss 試試看

In [35]:
data = next(iter(trainloader))
tokens_tensors, segments_tensors, \
masks_tensors, label_ids = [t.to(device) for t in data]

out = model(input_ids = tokens_tensors, attention_mask = masks_tensors, segments_tensors = segments_tensors)

optimizer.zero_grad()
for i in range(len(all_tags)):
    adapter_name = all_tags[i]
    tag_index = trainset.label_map[adapter_name]

    current_tag = label_ids[:,:, tag_index]


    out[i][0].shape

    actual = torch.zeros(out[i][0].shape, device = device)

    actual[:,:,1] = current_tag
    actual[:,:,0] = (current_tag == 0).float()

    loss_fct = torch.nn.BCEWithLogitsLoss()

    loss = loss_fct(out[i][0], actual)

    loss.backward(retain_graph=True)
optimizer.step()

In [52]:
tmp_label = torch.zeros(tokens_tensors.shape, device = device).long()
out = model(input_ids = tokens_tensors,
            attention_mask = masks_tensors,
            segments_tensors = segments_tensors,
            labels = tmp_label)

In [63]:
i = 4

In [64]:
adapter_name = all_tags[i]
tag_index = trainset.label_map[adapter_name]

current_tag = label_ids[:,:, tag_index]



In [70]:
current_tag

tensor([[1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
         1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
         1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:1')

In [54]:
for i in range(len(all_tags)):
    adapter_name = all_tags[i]
    tag_index = trainset.label_map[adapter_name]

    current_tag = label_ids[:,:, tag_index]


    actual = torch.zeros(out[i][0].shape, device = device)

    actual[:,:,1] = current_tag
    actual[:,:,0] = (current_tag == 0).float()

    loss_fct = torch.nn.BCEWithLogitsLoss()

    loss = loss_fct(out[i][0], actual)

    loss.backward(retain_graph=True)

TokenClassifierOutput(loss=tensor(2.1799e-05, device='cuda:1', grad_fn=<NllLossBackward>), logits=tensor([[[ 5.3136, -4.6688],
         [ 6.2403, -5.2024],
         [ 5.9424, -5.2621],
         [ 5.9892, -5.3082],
         [ 5.8272, -5.3263],
         [ 6.0709, -5.7180],
         [ 5.7603, -4.9385],
         [ 6.1404, -5.1860],
         [ 5.9128, -5.0043],
         [ 6.0323, -5.4904],
         [ 6.1061, -5.1775],
         [ 5.8515, -5.1434],
         [ 5.5018, -4.7776],
         [ 5.9586, -5.3871],
         [ 5.9517, -5.4626],
         [ 5.8563, -5.5738],
         [ 6.0551, -5.2277],
         [ 6.0443, -5.4832],
         [ 6.0166, -5.3652],
         [ 6.0743, -5.3007],
         [ 5.9998, -5.6140],
         [ 5.9225, -5.1588],
         [ 5.7624, -4.9556],
         [ 5.8827, -5.3898],
         [ 5.2844, -4.6918],
         [ 6.0714, -5.5793],
         [ 4.5845, -4.1236],
         [ 3.7666, -3.3064],
         [ 4.8879, -4.3596],
         [ 5.0257, -4.5994],
         [ 5.3031, -4.5590],
   

In [49]:
out[0][0].shape

torch.Size([4, 34, 2])

In [37]:
tokens_tensors.shape

torch.Size([4, 34])

In [44]:
actual.shape

torch.Size([4, 34, 2])

In [None]:
Epoch = 5
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(Epoch):
    all_loss = 0.0
    times = 0
    for data in trainloader:

        tokens_tensors, segments_tensors, \
        masks_tensors, label_ids = [t.to(device) for t in data]

        out = model(input_ids = tokens_tensors, attention_mask = masks_tensors, segments_tensors = segments_tensors)

        optimizer.zero_grad()
        for i in range(len(all_tags)):
            adapter_name = all_tags[i]
            tag_index = trainset.label_map[adapter_name]

            current_tag = label_ids[:,:, tag_index]


            out[i][0].shape

            actual = torch.zeros(out[i][0].shape, device = device)

            actual[:,:,1] = current_tag
            actual[:,:,0] = (current_tag == 0).float()

            loss_fct = torch.nn.BCEWithLogitsLoss()

            loss = loss_fct(out[i][0], actual)
            all_loss += loss
            times += 1
            
            loss.backward(retain_graph=True)
        optimizer.step()
        print(f"   Sub: Epoch {epoch}: Loss = {all_loss}, Mean Loss = {all_loss/times}")
    
    print(f"Epoch {epoch}: Loss = {all_loss}, Mean Loss = {all_loss/times}")

In [None]:
Epoch = 5
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(Epoch):
    loss_count = Loss_count()
    for data in trainloader:
        forward(model, data, model.device, optimizer, loss_count)
        torch.cuda.empty_cache()
        
        print(f"   Sub: Epoch {epoch}: Loss = {loss_count.all_loss}, Mean Loss = {loss_count.all_loss/loss_count.times}")
    
    print(f"Epoch {epoch}: Loss = {all_loss}, Mean Loss = {all_loss/times}")

In [None]:
from telegram_notifier import send_message as telegram_bot_sendtext