In [105]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_metric
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from transformers import TrainingArguments, Trainer
from seqeval.metrics import f1_score, accuracy_score, classification_report

### Configuration

In [106]:
# identify and specify the GPU as the device, later in training loop we will load data into device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
N_GPU = torch.cuda.device_count()
GPU_NAME = torch.cuda.get_device_name(0)

In [107]:
# 初始化建立entity types list
# return ENTITY_TYPES, TAGS2ID, ID2TAGS
def initial_entity_types(tags):
    ALL_TAGS = []
    ALL_TAGS.append('O')
    for tag in tags:
        ALL_TAGS.append('B-' + tag)
        ALL_TAGS.append('I-' + tag)
        #ALL_TAGS.append('E-' + tag) #
        #ALL_TAGS.append('S-' + tag) #
    TAGS2ID = {tag: id for id, tag in enumerate(ALL_TAGS)}
    ID2TAGS = {id: tag for tag, id in TAGS2ID.items()}
    return ALL_TAGS, TAGS2ID, ID2TAGS

In [108]:
# ENTITY TYPES
tags = ['name', 'location', 'time', 'contact', 'ID', 'profession', 'biomarker', 'family', 'clinical_event', 'special_skills', 'unique_treatment', 'account', 'organization', 'education', 'money', 'belonging_mark', 'med_exam', 'others']
ENTITY_TYPES, TAGS2ID, ID2TAGS = initial_entity_types(tags)

In [109]:
TAGS2ID

{'O': 0,
 'B-name': 1,
 'I-name': 2,
 'B-location': 3,
 'I-location': 4,
 'B-time': 5,
 'I-time': 6,
 'B-contact': 7,
 'I-contact': 8,
 'B-ID': 9,
 'I-ID': 10,
 'B-profession': 11,
 'I-profession': 12,
 'B-biomarker': 13,
 'I-biomarker': 14,
 'B-family': 15,
 'I-family': 16,
 'B-clinical_event': 17,
 'I-clinical_event': 18,
 'B-special_skills': 19,
 'I-special_skills': 20,
 'B-unique_treatment': 21,
 'I-unique_treatment': 22,
 'B-account': 23,
 'I-account': 24,
 'B-organization': 25,
 'I-organization': 26,
 'B-education': 27,
 'I-education': 28,
 'B-money': 29,
 'I-money': 30,
 'B-belonging_mark': 31,
 'I-belonging_mark': 32,
 'B-med_exam': 33,
 'I-med_exam': 34,
 'B-others': 35,
 'I-others': 36}

In [110]:
# using model
MODEL = dict(
    MODEL_VERSION = 'bert-base-chinese',
    BATCH_SIZE = 32,
    EPOCHS = 6,
    lr = 2e-5,
    eps = 1e-8,
    WEIGHT_DECAY = 0.1,
    MAX_GRAD_NORM = 1.0,
)

In [111]:
METRIC = load_metric("seqeval")

### Utility

In [112]:
# 訓練資料讀取
def loadInputFile(path):
    trainingset = list()  # store trainingset [content,content,...]
    position = list()  # store position [article_id, start_pos, end_pos, entity_text, entity_type, ...]
    mentions = dict()  # store mentions[mention] = Type

    with open(path, 'r', encoding='utf-8') as f:
        file_text = f.read().encode('utf-8').decode('utf-8-sig')
    
    datas = file_text.split('\n\n--------------------\n\n')[:-1]
    
    for data in datas:
        data = data.split('\n')
        content = data[0]
        trainingset.append(content)
        annotations = data[1:]
        for annot in annotations[1:]:
            annot = annot.split('\t') #annot = article_id, start_pos, end_pos, entity_text, entity_type
            position.extend(annot)
            mentions[annot[3]] = annot[4]
    
    return trainingset, position, mentions

In [113]:
# 將position轉為dataframe格式
def tranfer2dataframe(position):
    preDataframe = [position[i:i + 5] for i in range(0, len(position), 5)]
    df = pd.DataFrame(preDataframe, columns=['article_id', 'start_pos', 'end_pos', 'entity_text', 'entity_type'])
    return df

In [114]:
# 將IOB NER加到tuple後
def FormatEntity(origin_data, position):
    origin_len = len(origin_data[0])
    result = origin_data.copy()
    for index, row in position.iterrows():
        start_pos = int(row['start_pos'])
        end_pos = int(row['end_pos'])
        for i in range(start_pos, end_pos):
            if i == start_pos:
                entity_type = 'B-' + row['entity_type']
            else:
                entity_type = 'I-' + row['entity_type']
            result[i] += (entity_type, )
    return list(map(lambda x: x+('O', ) if len(x) == origin_len else x, result))    # 將沒有標記的標記上O

In [115]:
def tokenizer(text):
    tokens = [char for char in text]
    tokens = list(map(lambda char: (char, ), tokens))
    return tokens

In [116]:
# 讀取資料pipline
def loadData(path):
    trainingset, position, mentions = loadInputFile(path)
    df_position = tranfer2dataframe(position)
    words_list = list(map(lambda text: tokenizer(text), trainingset))
    traindata_list = []
    for index in range(len(words_list)):
        index_position = df_position[df_position.article_id == str(index)]
        traindata_list.append(FormatEntity(words_list[index], index_position))  # 要放在最後面

    return traindata_list

In [117]:
# 將資料分為訓練及測試資料
def Dataset(data_list):
    article_id_list = [ id for id in range(len(data_list))]
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = train_test_split(data_list, article_id_list, test_size=0.3, random_state=42)
    train_texts = [[word[0] for word in doc] for doc in traindata_list]
    train_tags = [[word[1] for word in doc] for doc in traindata_list]
    test_texts = [[word[0] for word in doc] for doc in testdata_list]
    test_tags = [[word[1] for word in doc] for doc in testdata_list]
    return train_texts, train_tags, test_texts, test_tags

In [118]:
# 將文章依照 ['。', '？', '！', '，'] 斷句
def split_list_by_value(text, tags):
    split_list = ['。', '？', '！', '，']
    sentence = []
    sentence_tags = []
    text_result = []
    tag_result = []
    
    for i in range(len(text)):
        sentence.append(text[i])
        sentence_tags.append(tags[i])
        if text[i] in split_list:
            text_result.append(sentence.copy())
            tag_result.append(sentence_tags.copy())
            sentence.clear()
            sentence_tags.clear()
            
    return text_result, tag_result

In [119]:
# 將文章切為句子
def transferDataFormat(texts, tags):
    result_texts = []
    result_tags = []
    for id in range(len(texts)):
        text, tag = split_list_by_value(texts[id], tags[id])
        result_texts.extend(text)
        result_tags.extend(tag)
    return result_texts, result_tags

In [120]:
def loadTrainingDataPipeline(dataset):
    train_texts, train_tags, test_texts, test_tags = Dataset(dataset)
    train_texts, train_tags = transferDataFormat(train_texts, train_tags)
    test_texts, test_tags = transferDataFormat(test_texts, test_tags)
    return train_texts, train_tags, test_texts, test_tags

In [121]:
# 將tag轉為encoding
def encode_tags(tags, encodings, max_length):
    labels = [[TAGS2ID[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for label in labels:
        sentence_label = np.array([-100])
        sentence_label = np.append(sentence_label, label)
        append_len = abs(max_length - len(sentence_label)) #max_length - len(sentence_label)
        append_label = np.ones(append_len, dtype=int) * -100
        encoded_labels.append(np.append(sentence_label, append_label).tolist())
    return encoded_labels

In [122]:
# 將資料轉Trainer輸入格式
def transferTrainingData(encodings, labels):
    t_inputs = torch.tensor(encodings.input_ids)
    t_masks = torch.tensor(encodings.attention_mask)
    t_labels = torch.tensor(labels)

    training_data = TensorDataset(t_inputs, t_masks, t_labels)
    return training_data

In [123]:
def loadPredictFile(path):
    trainingset = list()  # store trainingset [content,content,...]
    position = list()  # store position [article_id, start_pos, end_pos, entity_text, entity_type, ...]
    mentions = dict()  # store mentions[mention] = Type

    with open(path, 'r', encoding='utf-8') as f:
        file_text = f.read().encode('utf-8').decode('utf-8-sig')
    
    datas = file_text.split('\n\n--------------------\n\n')[:-1]
    trainingset = []
    for data in datas:
        data = data.split('\n')
        trainingset.append(data[1:])
    return trainingset

In [124]:
def predictData_split_to_sentence(texts):
    split_list = ['。', '？', '！', '，']
    sentence = []
    result = []
    for text in texts:
        for i in range(len(text)):
            sentence.append(text[i])
            if text[i] in split_list:
                result.append(sentence.copy())
                sentence.clear()
    return result

In [125]:
def predict_sentence(sentence):
    inputs = tokenizer.encode_plus(sentence, is_split_into_words=True, return_tensors='pt')
    inputs.to(DEVICE)
    outputs = model(**inputs)
    logits = np.array(outputs.logits.tolist())
    result = np.argmax(logits, axis=2)
    result = [ID2TAGS[i] for i in result[0]]
    return result

### Bert Training utility

In [126]:
# 取得BERT參數量
def get_learnable_params(module):
    model_params = [p for p in module.parameters() if p.requires_grad]
    clf_params = [p for p in module.classifier.parameters() if p.requires_grad]

    print(f"""
    整個分類模型的參數量：{sum(p.numel() for p in model_params)}
    線性分類器的參數量：{sum(p.numel() for p in clf_params)}
    """)

In [127]:
# 比較預測出的結果跟實際差別
def outputResult(data):
    predictions, labels, _ = trainer.predict(data)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [ENTITY_TYPES[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ENTITY_TYPES[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    print(classification_report(true_labels, true_predictions))

In [128]:
# Trainer使用compute_metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [ID2TAGS[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ID2TAGS[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = METRIC.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [129]:
# Trainer使用data_collator
def data_collator(dataset):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in dataset])
    batch['attention_mask'] = torch.stack([f[1] for f in dataset])
    batch['labels'] = torch.stack([f[2] for f in dataset])
    
    return batch

### Input Training Data

In [130]:
training_data = loadData('../data/SampleData_deid.txt')
training_data.extend(loadData('../data/train_1_update.txt'))
training_data.extend(loadData('../data/train_2.txt'))
training_data.extend(loadData('../data/sample_id_2.txt'))
training_data.extend(loadData('../data/sample_contact.txt'))

In [131]:
train_texts, train_tags, test_texts, test_tags = loadTrainingDataPipeline(training_data)

### Bert Training

In [132]:
MAX_LENGTH = max(len(max(train_texts, key=len)), len(max(test_texts, key=len))) + 2    # 設定最大長度，比較訓練集及測試集最大長度

In [133]:
MAX_LENGTH = 512 if MAX_LENGTH > 512 else MAX_LENGTH
MAX_LENGTH

103

In [134]:
len(ENTITY_TYPES)

37

In [135]:
configuration = BertConfig.from_pretrained(
    MODEL['MODEL_VERSION'],
    architectures=['BertForTokenClassification'],
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    num_labels=len(ENTITY_TYPES),
    id2label=ID2TAGS,
    label2id=TAGS2ID
)

In [138]:
tokenizer = BertTokenizer.from_pretrained(MODEL['MODEL_VERSION'])
model = BertForTokenClassification.from_pretrained(MODEL['MODEL_VERSION'], config=configuration).to(DEVICE)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

In [140]:
# return input_ids, token_type_ids, attention_mask
train_encodings = tokenizer(train_texts, is_split_into_words=True, max_length=MAX_LENGTH, padding='max_length')
val_encodings = tokenizer(test_texts, is_split_into_words=True, max_length=MAX_LENGTH, padding='max_length')

In [141]:
train_labels = encode_tags(train_tags, train_encodings, MAX_LENGTH)
val_labels = encode_tags(test_tags, val_encodings, MAX_LENGTH)

In [142]:
# convert all data into torch tensors, required data type for our model
train_inputs = torch.tensor(train_encodings.input_ids)
train_masks = torch.tensor(train_encodings.attention_mask)
train_labels = torch.tensor(train_labels)

valid_inputs = torch.tensor(val_encodings.input_ids)
valid_masks = torch.tensor(val_encodings.attention_mask)
valid_labels = torch.tensor(val_labels)

In [143]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, shuffle=False, batch_size=MODEL['BATCH_SIZE'])
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
valid_dataloader = DataLoader(valid_data, shuffle=False, batch_size=MODEL['BATCH_SIZE'])

In [144]:
get_learnable_params(model)


    整個分類模型的參數量：101705509
    線性分類器的參數量：28453
    


In [145]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': MODEL['WEIGHT_DECAY']},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [146]:
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=MODEL['lr'],
    eps=MODEL['eps']
)

In [147]:
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * MODEL['EPOCHS']

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [148]:
args = TrainingArguments(
    output_dir = './output',
    evaluation_strategy = "epoch", 
    learning_rate=MODEL['lr'],
    per_device_train_batch_size=MODEL['BATCH_SIZE'],
    per_device_eval_batch_size=16,
    num_train_epochs=MODEL['EPOCHS'],
    weight_decay=MODEL['WEIGHT_DECAY']
)

In [149]:
optimizers = optimizer, scheduler

trainer = Trainer(
    model,
    args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=optimizers
)

In [150]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.036248,0.035959,0.621134,0.684335,0.651205,0.988336
2,0.026392,0.029815,0.716482,0.705632,0.711016,0.989857
3,0.017578,0.029364,0.726961,0.80265,0.762933,0.991348
4,0.0126,0.032574,0.780957,0.787979,0.784452,0.992247
5,0.009416,0.035271,0.796357,0.80691,0.801598,0.992281
6,0.007042,0.036906,0.798611,0.816375,0.807395,0.992631


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=10734, training_loss=0.0215844102071965)

In [151]:
trainer.evaluate()

{'eval_loss': 0.03690628707408905,
 'eval_precision': 0.7986111111111112,
 'eval_recall': 0.8163748225272125,
 'eval_f1': 0.8073952726421718,
 'eval_accuracy': 0.9926306278297029,
 'epoch': 6.0}

In [152]:
outputResult(valid_data)

                precision    recall  f1-score   support

            ID       0.84      0.72      0.78        29
clinical_event       1.00      0.50      0.67         2
       contact       0.85      0.76      0.80        29
     education       1.00      1.00      1.00         2
        family       0.67      0.62      0.64        29
      location       0.94      0.93      0.94       148
      med_exam       0.76      0.80      0.78       205
         money       0.88      1.00      0.94        88
          name       0.90      0.91      0.91       111
  organization       0.00      0.00      0.00         1
    profession       0.42      0.71      0.53         7
          time       0.78      0.80      0.79      1462

     micro avg       0.80      0.82      0.81      2113
     macro avg       0.75      0.73      0.73      2113
  weighted avg       0.80      0.82      0.81      2113



### Predict

In [153]:
predict_data = loadPredictFile('../data/test.txt')
predict_data = list(map(lambda x: predictData_split_to_sentence(x), predict_data))

In [154]:
results = []
docs = []
for doc in predict_data:
    article = []
    for sentence in doc:
        article.append(predict_sentence(sentence))
    results.append(article.copy())
    article.clear()

In [155]:
import json
with open('../data/result.txt', 'w') as f:
    json.dump(results, f)

In [156]:
with open('../data/sentences.txt', 'w', encoding='utf8') as f:
    json.dump(predict_data, f, ensure_ascii=False)