In [1]:
import pandas as pd
import numpy as np
import torch
print(f"Torch Version: {torch.__version__}")

import transformers
print(f"transformers (Adapter) Version: {transformers.__version__}")

Torch Version: 1.8.1
transformers (Adapter) Version: 2.0.1


In [2]:
from transformers import RobertaTokenizer
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")

In [3]:
data_path = "./NER_multilabel_data_v2.csv"
df = pd.read_csv(data_path)

all_tags = df.newTag

all_tags = set(all_tags)

all_tags = "|".join(all_tags)
all_tags = all_tags.split("|")
all_tags = set(all_tags)
all_tags = list(all_tags)

In [4]:

def process_csv(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tags = df.groupby("Sentence #")["newTag"].apply(list).values
    return sentences, tags

sentences, tags = process_csv(data_path)

In [5]:
from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder
    
    


class NER_Dataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer, data_path, labels):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.sentences, self.tags = process_csv(data_path)
        self.len = len(self.sentences)
        

        if mode != "test":
            self.label_map = {}
            for i in range(len(labels)):
                self.label_map[labels[i]] = i
                
            possible_labels = np.array(range(len(labels))).reshape(-1, 1)
            self.oneHotEncoder = OneHotEncoder()
            self.oneHotEncoder.fit(possible_labels)
        else:
            self.label_map = None
        
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        self.O_label = self.label_map["O"]

    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            label_tensor = None
        else:
            label = ["O"] + self.tags[idx] + ["O"]

            label = np.array(label)
            label = label.reshape(-1,1)

            label = np.apply_along_axis(self.split_one_hot_multiTags, 1, label)
            label_tensor = torch.tensor(label, dtype = torch.float32)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ['[CLS]']
        word_pieces += self.sentences[idx]
        word_pieces += ['[SEP]']
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0
        segments_tensor = torch.zeros_like(tokens_tensor)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    def split_one_hot_multiTags(self, tags):
        # tags = ['B-org|Party|String']
        tags = tags[0]
        tags = tags.split("|")


        tags_num = list(map(lambda x: self.label_map[x], tags))
        #[5, 20, 23]

        tags_num = np.array(tags_num).reshape(-1,1)

        tags_one_hot = self.oneHotEncoder.transform(tags_num).toarray()

        tags_one_hot = tags_one_hot.sum(axis = 0)

        #return torch.tensor(tags_one_hot, dtype = torch.float32)

        return tags_one_hot
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞

df = pd.read_csv(data_path, encoding="latin-1")

labels = np.unique("|".join(list(df.newTag)).split("|"))
print(f"labels: {labels}")

trainset = NER_Dataset("train", tokenizer=tokenizer, data_path=data_path, labels= labels)

labels: ['B-art' 'B-eve' 'B-geo' 'B-gpe' 'B-nat' 'B-org' 'B-per' 'B-tim'
 'CountryCode' 'CryptoCurrencyCode' 'CurrencyCode' 'Event' 'Float' 'I-art'
 'I-eve' 'I-geo' 'I-gpe' 'I-nat' 'I-org' 'I-per' 'I-tim' 'Integer'
 'Location' 'Month' 'O' 'Object' 'Party' 'Race' 'SpecialTerm'
 'TemporalUnit' 'Time' 'Timezone' 'US_States']


In [6]:
trainset.id2label = {}
for key in trainset.label_map.keys():
    trainset.id2label[trainset.label_map[key]] = key

from transformers import RobertaConfig, RobertaModelWithHeads

config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=len(all_tags),
    label2id = trainset.label_map, 
    id2label = trainset.id2label
)
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [7]:
adapter_name = "Float"
name = model.load_adapter(f"./save_adapters/{adapter_name}_0730")
model.load_head(f"./save_heads/{adapter_name}_0730")

Overwriting existing head 'Float'


('./save_heads/Float_0730', 'Float')

adapter_name = "All_tag_2"
model.load_adapter(f"./save_adapters/{adapter_name}")
model.load_head(f"./save_heads/{adapter_name}")

In [9]:
label_id_mapping = trainset.label_map

id_label_mapping = dict()
for key in label_id_mapping.keys():
    id_label_mapping[label_id_mapping[key]] = key

def test_model(model, sentence, device = "cpu"):
    tokenized_sentence = torch.tensor([tokenizer.encode(sentence)])
    pos = torch.tensor([[0] * len(tokenized_sentence)])
    tags = torch.tensor([[1] * len(tokenized_sentence)])

    model = model.to(device)
    outputs = model(input_ids=tokenized_sentence.to(device), 
                    token_type_ids=pos.to(device), 
                    attention_mask=tags.to(device))

    logits = outputs[0]

    _, pred_labels = torch.max(logits, 2)

    out_labels = []
    for row in pred_labels:
        result = list(map(lambda x: id_label_mapping[int(x)], row))
        out_labels.append(result)
    #return tokenizer.tokenize(sentence), out_labels[0], logits
    return tokenizer.tokenize(sentence), out_labels[0][1:-1], logits[:, 1:-1]

In [10]:
sentence = "Dan will be deemed to have completed its delivery for 8.2 obligations before 2021-7-5 if in Niall's opinion, the Jeep Car satisfies the Acceptance Criteria, and Niall notifies Dan in writing that it is accepting the Jeep Car."
sen, pred, logits = test_model(model, sentence, device = 'cpu')

There are adapters available but none are passed to model.forward


In [11]:
np.array(sen)

array(['Dan', 'Ġwill', 'Ġbe', 'Ġdeemed', 'Ġto', 'Ġhave', 'Ġcompleted',
       'Ġits', 'Ġdelivery', 'Ġfor', 'Ġ8', '.', '2', 'Ġobligations',
       'Ġbefore', 'Ġ2021', '-', '7', '-', '5', 'Ġif', 'Ġin', 'ĠNi', 'all',
       "'s", 'Ġopinion', ',', 'Ġthe', 'ĠJeep', 'ĠCar', 'Ġsatisfies',
       'Ġthe', 'ĠAccept', 'ance', 'ĠCrit', 'eria', ',', 'Ġand', 'ĠNi',
       'all', 'Ġnot', 'ifies', 'ĠDan', 'Ġin', 'Ġwriting', 'Ġthat', 'Ġit',
       'Ġis', 'Ġaccepting', 'Ġthe', 'ĠJeep', 'ĠCar', '.'], dtype='<U12')

In [12]:
np.array(pred)

array(['B-eve', 'B-art', 'B-eve', 'B-art', 'B-art', 'B-art', 'B-art',
       'B-eve', 'B-eve', 'B-art', 'B-art', 'B-eve', 'B-eve', 'B-art',
       'B-art', 'B-art', 'B-eve', 'B-art', 'B-eve', 'B-art', 'B-art',
       'B-art', 'B-eve', 'B-art', 'B-art', 'B-art', 'B-eve', 'B-eve',
       'B-eve', 'B-eve', 'B-art', 'B-eve', 'B-eve', 'B-eve', 'B-eve',
       'B-eve', 'B-eve', 'B-art', 'B-eve', 'B-art', 'B-art', 'B-art',
       'B-eve', 'B-art', 'B-art', 'B-art', 'B-eve', 'B-art', 'B-art',
       'B-eve', 'B-eve', 'B-eve', 'B-eve'], dtype='<U5')

In [13]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
def interact_word(i):
    print(i)
    print(f"{sen[i]}: {pred[i]}")
    target = out[i]

    for i in range(len(target)):
        print(f"{i} {id_label_mapping[i].ljust(6)} \t: {target[i]:.5f}")

In [14]:
out = logits[0]
interact(lambda x: interact_word(x), x=widgets.IntSlider(min=0, max=len(sen)-1, step=1, value=0))

0
Dan: B-eve
0 B-art  	: 0.06164
1 B-eve  	: 0.13845


<function __main__.<lambda>(x)>