In [3]:
import pandas as pd
import pymongo
import numpy as np

from ..core.config import MONGODB_URL,DATABASE_NAME, NER_LABEL_COLLECTION, LABEL_COLLECTION, LABEL_RETRAIN_QUEUE_COLLECTION

from torch.utils.data import Dataset
from sklearn.preprocessing import OneHotEncoder

from transformers import RobertaTokenizer
import numpy as np




In [17]:
from tqdm import tqdm
def get_training_dataframe(train_data_search_filter = {}):
    client = pymongo.MongoClient(MONGODB_URL)
    col = client[DATABASE_NAME][NER_LABEL_COLLECTION]

    result = col.find(train_data_search_filter)
    print("Reading Data from MongoDB...")
    result = list(result)
    print("Processing Data...")
    df = pd.DataFrame()
    for i, sentence in enumerate(tqdm(result)):
        sentense_df = pd.DataFrame(columns=["Sentence #", "text", "labels"], data = sentence["text_and_labels"])
        sentense_df["Sentence #"] = str(sentence["_id"])
        df = df.append(sentense_df)
    return df
def get_training_data_by_df_according_to_label_name(df, label_name):
    label_col = client[DATABASE_NAME][LABEL_COLLECTION]

    label_info = label_col.find_one({"label_name": label_name})
    alias_labels = label_col.find({"alias_as": {"$in": [label_name]}})

    alias = []
    for alias_label in alias_labels:
        alias.append(alias_label["label_name"])
    alias = alias + label_info["inherit"]

    wanted_label = [label_name] + alias

    def label_data(label):
        if set(label).intersection(set(wanted_label)):
            return label_name
        else:
            return "O"

    df[label_name] = list(df["labels"].apply(label_data))

    sentences = df.groupby("Sentence #")["text"].apply(list).values
    tags = df.groupby("Sentence #")[label_name].apply(list).values
    return sentences, tags

In [4]:
class NER_Dataset_for_Adapter(Dataset):
    def __init__(self, tokenizer, df, label_name):
        self.label_name = label_name
        self.mode = "train"
        # 大數據你會需要用 iterator=True
        self.sentences, self.tags = get_training_data_by_df_according_to_label_name(df, label_name)
        self.len = len(self.sentences)


        labels = ["O", label_name]

        if self.mode != "test":
            labels = ["O", label_name]
            self.label_map = {}
            for i, label in enumerate(labels):
                self.label_map[label] = i

            possible_labels = np.array(range(len(labels))).reshape(-1, 1)
            self.oneHotEncoder = OneHotEncoder()
            self.oneHotEncoder.fit(possible_labels)
        else:
            self.label_map = None

        self.tokenizer = tokenizer  # RoBERTa tokenizer
        self.O_label = self.label_map["O"]

    def __getitem__(self, idx):
        if self.mode == "test":
            label_tensor = None
        else:
            label = ["O"] + self.tags[idx] + ["O"]

            label = np.array(label)
            label = label.reshape(-1,1)

            label = np.apply_along_axis(self.split_one_hot_multiTags, 1, label)
            label_tensor = torch.tensor(label, dtype = torch.float32)

        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = [tokenizer.cls_token]
        word_pieces += self.sentences[idx]
        word_pieces += [tokenizer.sep_token]

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # 將第一句包含 [SEP] 的 token 位置設為 0
        segments_tensor = torch.zeros_like(tokens_tensor)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len

    def split_one_hot_multiTags(self, tags):
        # tags = ['B-org|Party|String']
        tags = tags[0]
        tags = tags.split("|")


        tags_num = list(map(lambda x: self.label_map[x], tags))
        #[5, 20, 23]

        tags_num = np.array(tags_num).reshape(-1,1)

        tags_one_hot = self.oneHotEncoder.transform(tags_num).toarray()

        tags_one_hot = tags_one_hot.sum(axis = 0)

        #return torch.tensor(tags_one_hot, dtype = torch.float32)

        return tags_one_hot

In [106]:
# Env Virable (Might From Config)
BATCH_SIZE = 16
default_filter = {}
device_id = 0
device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
Adapter_Patch = "."

import os
import datetime
# When Each Train
# Run When Set Up
if os.path.isdir(f"{Adapter_Patch}/save_adapters") == False:
    os.mkdir(f"{Adapter_Patch}/save_adapters")
if os.path.isdir(f"{Adapter_Patch}/save_heads") == False:
    os.mkdir(f"{Adapter_Patch}/save_heads")

dateStamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S%z")

from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# On Each Iter
train_data_search_filter = {}
label_name = "Party87"
Epoch_Times = 2

In [107]:

df = get_training_dataframe(default_filter)

Reading Data from MongoDB...


KeyboardInterrupt: 

In [111]:

client = pymongo.MongoClient(MONGODB_URL)
col = client[DATABASE_NAME][NER_LABEL_COLLECTION]

In [112]:
wanted_id = list(map(lambda x: str(x["_id"]),
                     list(col.find(train_data_search_filter, {"_id": True}))))

target_df = df[df["Sentence #"].isin(wanted_id)]

In [113]:
trainset = NER_Dataset_for_Adapter(tokenizer, target_df, label_name)

In [114]:
from torch.utils.data import DataLoader, IterableDataset
from torch.nn.utils.rnn import pad_sequence
import torch

In [115]:
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]

    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = [s[2] for s in samples]
        label_ids = pad_sequence(label_ids, 
                                  batch_first=True)
    else:
        label_ids = None

    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)

    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [116]:
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [117]:
from transformers import RobertaConfig, RobertaModelWithHeads
config = RobertaConfig.from_pretrained(
    "roberta-base"
)

In [120]:
df.labels

0                [O]
1                [O]
2                [O]
3                [O]
4                [O]
          ...       
2                [O]
3    [Party, String]
4                [O]
5     [TemporalUnit]
6                [O]
Name: labels, Length: 1048582, dtype: object

In [118]:
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
    )

try:
    model.add_adapter(label_name)
    model.add_tagging_head(
        label_name,
        num_labels=1
      )
except: pass
model.train_adapter(label_name)
model = model.to(device)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                    "weight_decay": 1e-5,
                },
                {
                    "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                    "weight_decay": 0.0,
                },
            ]
optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=1e-4)

for epoch in range(Epoch_Times):
    print(f"\n{label_name}: epoch {epoch}")
    for i, data in enumerate(trainloader):

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        outputs = model(input_ids = tokens_tensors,
            attention_mask=masks_tensors,
            token_type_ids=segments_tensors)


        logits = outputs[0]

        current_label = labels.view(-1, labels.shape[-1])[:, trainset.label_map[label_name]]
        current_label = current_label.view(-1)

        active_logits = logits.view(-1, logits.shape[-1])[masks_tensors.view(-1) == 1]
        active_labels = current_label[masks_tensors.view(-1)== 1]

        actual = current_label[masks_tensors.view(-1)== 1].float().view(-1,1)

        loss_fct = torch.nn.BCEWithLogitsLoss()

        loss = loss_fct(active_logits, actual)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 100 == 0:
            print(f"\tLoss: {loss}")
    """filename = f"{label_name}_epoch_{epoch}_{dateStamp}"
    model.save_adapter(f"{Adapter_Patch}/save_adapters/{filename}", model.active_adapters[0])
    model.save_head(f"{Adapter_Patch}/save_heads/{filename}", model.active_head)"""
filename = f"{label_name}_epoch_{Epoch_Times}_{dateStamp}"
model.save_adapter(f"{Adapter_Patch}/save_adapters/{filename}", model.active_adapters[0])
model.save_head(f"{Adapter_Patch}/save_heads/{filename}", model.active_head)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere


Party87: epoch 0
	Loss: 0.6863803863525391
	Loss: 0.26608678698539734
	Loss: 0.1739221066236496
	Loss: 0.12224006652832031
	Loss: 0.13584811985492706
	Loss: 0.14804978668689728
	Loss: 0.07767172157764435
	Loss: 0.1210523322224617
	Loss: 0.10067823529243469
	Loss: 0.07805242389440536
	Loss: 0.08767428249120712
	Loss: 0.14710210263729095
	Loss: 0.09772222489118576
	Loss: 0.04824133589863777
	Loss: 0.10125719010829926
	Loss: 0.09478776901960373
	Loss: 0.16918572783470154
	Loss: 0.06986373662948608
	Loss: 0.10380092263221741
	Loss: 0.12744483351707458
	Loss: 0.07507335394620895
	Loss: 0.11341594904661179
	Loss: 0.06651750952005386
	Loss: 0.09243737161159515
	Loss: 0.06899833679199219
	Loss: 0.1170530915260315
	Loss: 0.05675486847758293
	Loss: 0.07523295283317566
	Loss: 0.08001790195703506
	Loss: 0.08097665756940842

Party87: epoch 1
	Loss: 0.1378442794084549
	Loss: 0.11058449000120163
	Loss: 0.10486708581447601
	Loss: 0.06578885018825531
	Loss: 0.060900747776031494
	Loss: 0.10366994142532