<a href="https://colab.research.google.com/github/henry3556108/codalab_contest/blob/main/our_model_with_early_stopping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install islab-opendeid

Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Using cached huggingface_hub-0.17.3-py3-none-any.whl.metadata (13 kB)
Using cached huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.19.4
    Uninstalling huggingface-hub-0.19.4:
      Successfully uninstalled huggingface-hub-0.19.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.15.0 requires huggingface-hub>=0.18.0, but you have huggingface-hub 0.17.3 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface-hub-0.17.3
Collecting huggingface-hub>=0.18.0 (from datasets)
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Using cached huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
Installing collected packages: hugg

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### import package

In [2]:
import os
import numpy as np
from tqdm import tqdm, trange
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch
import torch.optim as optim
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_linear_schedule_with_warmup
import re
import random
import matplotlib.pyplot as plt
from torch.nn import functional as F
from torch.utils.data import Dataset

In [4]:
# 所有參數定義
BATCH_SIZE = 4
epochs = 16

# 專注於解決哪方面的問題 all or task1
task_opt = "all"

plm = "EleutherAI/pythia-70m"
special_tokens_dict = {"bos_token": "<|endoftext|>", "sep_token": "####", "eos_token": "<|END|>"}

annotation_data_path = "First_Phase_Release(Correction)/answer.txt"
valid_annotation_data_path = "Second_Phase_Dataset/answer.txt"

train_data_path = "First_Phase_Release(Correction)/First_Phase_Text_Dataset"
valid_data_path = "Second_Phase_Dataset/Second_Phase_Text_Dataset"

test_dataset = "First_Phase_Release(Correction)/Validation_Release"
pred_dst = ""
pred_file_name = "answer.txt"

# 模型路徑
model_dir = "models"

# 是否要載入訓練好的模型
load_model = True

In [3]:

def process_annotation_file(lines, task_opt = "all"):
    entity_dict = {}
    for line in lines:
        items = line.strip('\n').split('\t')
        if task_opt == "all":
            if len(items) == 5:
                item_dict = {
                    'phi' : items [1],
                    'st_idx' :int (items[2]),
                    'ed_idx' : int(items[3]),
                    'entity' : items [4],
                }
            elif len(items) == 6:
                item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' :int(items[3]),
                'entity': items [4],
                'normalize_time' : items[5],}
        elif task_opt == "task1":
            item_dict = {
                'phi' : items [1],
                'st_idx' :int (items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items [4],
            }
        elif task_opt == "task2" and len(items) == 6:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' :int(items[3]),
                'entity': items [4],
                'normalize_time' : items[5],}
        else:
            # pass 掉這不合規的資料，包括如果針對 task2 卻不是時間資訊的
            continue
        if items[0] not in entity_dict:
            entity_dict[items[0]] = [item_dict]
        else:
            entity_dict[items[0]].append(item_dict)
    return entity_dict


def generate_annotated_medical_report(anno_file_path, task_opt):
    '''
    有可能是關於生成 annotation data 的前置作業
    task_opt: "task1" | "all" 如果放入 "task1" 只會生出 task1 的訓練資料 不會有時間正規化
    '''
    with open(anno_file_path, "r", encoding='utf-8-sig') as f:
        anno_lines = f.readlines()
        annos_dict = process_annotation_file(anno_lines, task_opt)
        return annos_dict

def process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict):
    '''
    生成 training data
    '''
    file_name = txt_name + '.txt'
    sents = open(os.path.join(medical_report_folder, file_name), "r").readlines()
    article = "".join(sents)
    bounary, item_idx, temp_seq, seq_pairs = 0, 0, "", []
    for w_idx, word in enumerate(article):
        if w_idx == annos_dict[txt_name][item_idx]["st_idx"]:
            phi_key = annos_dict[txt_name][item_idx]['phi']
            phi_value = annos_dict[txt_name][item_idx]['entity']
            if "normalize_time" in annos_dict[txt_name][item_idx]:
                temp_seq += f"{phi_key}: {phi_value}=>{annos_dict[txt_name][item_idx]['normalize_time']}\n"
            else:
                temp_seq += f"{phi_key}:{phi_value}\n"
            if item_idx == len(annos_dict[txt_name]) - 1:
                continue
            item_idx += 1
        if word == "\n":
            new_line_idx = w_idx + 1
            if temp_seq == "":
                temp_seq = "PHI:Null"
            seq_pair = special_tokens_dict['bos_token'] + article[bounary: new_line_idx] + special_tokens_dict['sep_token'] + temp_seq + special_tokens_dict['eos_token']
            bounary = new_line_idx
            seq_pairs.append(seq_pair)
            temp_seq = ""
    return seq_pairs

class GPTDataset(Dataset):
    def __init__(self, seq_pairs, tokenizer, special_tokens_dict, pad_idx):
        self.seq_pairs = seq_pairs
        self.tokenizer = tokenizer
        self.special_tokens_dict = special_tokens_dict
        self.pad_idx = pad_idx

    def __len__(self):
        return len(self.seq_pairs)

    def __getitem__(self, index):
        return self.seq_pairs[index]

    def collate_batch(self, datasets):
        tokens_list, labels_list, attention_mask_list = [], [], []

        for dataset in datasets:
            encoded_seq = self.tokenizer(dataset)  # 假设 tokenizer 是一个合法的方法
            indexed_tks = encoded_seq["input_ids"]
            attention_mask = encoded_seq["attention_mask"]

            tokens_list.append(torch.tensor(indexed_tks))
            labels_list.append(torch.tensor(indexed_tks))
            attention_mask_list.append(torch.tensor(attention_mask))

        return self.pad_sequence(tokens_list, labels_list, attention_mask_list)  # 请注意，这里需要补充 pad_sequence 方法的定义

    def pad_sequence(self, non_pad_token, non_pad_label, non_pad_attn):
        max_size = max([len(ele) for ele in non_pad_token])  # 找出该批次数据中的最长序列的长度
        pad_batch1 = torch.stack([torch.cat([t, torch.LongTensor([self.pad_idx] * (max_size - len(t)))]) for t in non_pad_token])
        pad_batch2 = torch.stack([torch.cat([t, torch.LongTensor([self.pad_idx] * (max_size - len(t)))]) for t in non_pad_label])
        pad_batch3 = torch.stack([torch.cat([t, torch.LongTensor([self.pad_idx] * (max_size - len(t)))]) for t in non_pad_attn])
        return pad_batch1, pad_batch2, pad_batch3

def sample_text(model, tokenizer, text, n_words=100):
    model.eval()
    text = tokenizer.encode(text)
    # inputs, past_key_values = torch.tensor([text]).to(device), None
    inputs, past_key_values = torch.tensor([text]), None
    generated_text = []

    with torch.no_grad():
        for _ in range(n_words):
            output = model(inputs, past_key_values=past_key_values)
            logits = output.logits
            past_key_values = output.past_key_values
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            generated_text.append(inputs.item())

            if tokenizer.decode(inputs.item()) == "<|END|>":  # 定义 eos 作为终止标记
                break

    return tokenizer.decode(generated_text)


In [None]:

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(plm)
tokenizer.add_special_tokens(special_tokens_dict)

annos_dict = generate_annotated_medical_report(annotation_data_path, task_opt) # 後面這邊的 task1 是為了只解決 task1 的問題
valid_annos_dict = generate_annotated_medical_report(valid_annotation_data_path, task_opt)

# 目前還等待理解這一段
PAD_IDX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

seq_pairs = []
valid_seq_pairs = []

# 讀取該資料夾下的所有資料，往前迭代並且傳至 data_preprocess 去生成訓練資料
file_names = os.listdir(train_data_path)
for file_name in file_names:
    file_name = file_name.replace(".txt", "")
    seq_pairs.extend(process_medical_report(file_name, train_data_path, annos_dict, special_tokens_dict))

file_names = os.listdir(valid_data_path)
for file_name in file_names:
    file_name = file_name.replace(".txt", "")
    valid_seq_pairs.extend(process_medical_report(file_name, valid_data_path, valid_annos_dict, special_tokens_dict))


tr_dataset = GPTDataset(seq_pairs, tokenizer, special_tokens_dict, 1)
valid_dataset = GPTDataset(valid_seq_pairs, tokenizer, special_tokens_dict, 1)

# 創建 DataLoader
bucket_train_dataloader = DataLoader(tr_dataset, batch_size=BATCH_SIZE, collate_fn=tr_dataset.collate_batch)
bucket_valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=tr_dataset.collate_batch)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# 創建模型

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained(plm)
model.resize_token_embeddings(len(tokenizer))
if load_model:
    model.load_state_dict(torch.load(os.path.join(model_dir , 'model1.pth'), map_location=device)) # 如果要 load model 可以用這一行 code 載入訓練好的模型

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Embedding(50278, 512)

In [None]:
param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01}
]

# 創建 AdamW 優化器
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)


In [None]:

min_loss = 9999
model.to(device)
valid_avg_loss = 99999
count = 0
for _ in trange(epochs, desc="Epoch"):
    model.train()  # 设置模型为训练模式
    total_loss = 0.0
    # 看起來是迭代 train DataLoader 的資料
    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        # 目前梯度歸零要再思考一下
        # 梯度清零
        model.zero_grad()

        # 模型向前傳遞
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()

        loss.backward()  # 向後傳遞
        optimizer.step()
        if loss < min_loss:
            min_loss = loss
            torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))
    # 計算每個 epoch 平均loss
    avg_loss = total_loss / len(bucket_train_dataloader)

    # 這邊迭代 valid Dataloader
    for step, (seqs, labels, masks) in enumerate(bucket_valid_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        # 目前梯度歸零要再思考一下
        # 梯度清零
        model.zero_grad()

        # 模型向前傳遞
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()

    # 計算 valid data 的每個 epoch 平均 loss 如果有持續上升兩次代表 可能 overfitting 了，停止 triaining
    avg_valid_loss = total_loss / len(bucket_valid_dataloader)
    if valid_avg_loss > avg_valid_loss:
        valid_avg_loss = avg_valid_loss
        torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))
        count = 0
    elif count >= 2:
        break
    else:
        count += 1
    print(f"Epoch {_ + 1}, Avg train Loss: {avg_loss}, Average avg valid Loss: {valid_avg_loss}")

Epoch:   6%|▋         | 1/16 [15:40<3:55:09, 940.61s/it]

Epoch 1, Average Loss: 1.1396466636113525


Epoch:  12%|█▎        | 2/16 [30:46<3:34:46, 920.47s/it]

Epoch 2, Average Loss: 0.8520968749669113


Epoch:  19%|█▉        | 3/16 [45:49<3:17:39, 912.24s/it]

Epoch 3, Average Loss: 0.7497952701459374


Epoch:  25%|██▌       | 4/16 [1:00:44<3:01:03, 905.29s/it]

Epoch 4, Average Loss: 0.6883466479106896


Epoch:  31%|███▏      | 5/16 [1:15:39<2:45:17, 901.58s/it]

Epoch 5, Average Loss: 0.6448762542427209


Epoch:  38%|███▊      | 6/16 [1:30:34<2:29:54, 899.47s/it]

Epoch 6, Average Loss: 0.6111221343588609


Epoch:  44%|████▍     | 7/16 [1:45:32<2:14:50, 898.97s/it]

Epoch 7, Average Loss: 0.5820464661105574


Epoch:  50%|█████     | 8/16 [2:00:25<1:59:37, 897.17s/it]

Epoch 8, Average Loss: 0.5569302330783648


Epoch:  56%|█████▋    | 9/16 [2:15:13<1:44:20, 894.32s/it]

Epoch 9, Average Loss: 0.5330988994114788


Epoch:  62%|██████▎   | 10/16 [2:30:01<1:29:13, 892.25s/it]

Epoch 10, Average Loss: 0.5134311589279151
Epoch 10, Average Loss: 0.5134311589279151


In [None]:
# 生成資料
generated_text = sample_text(model, tokenizer, "<|endoftext|>Collected: 29/10/2063 at :\n####", n_words=100)
print(generated_text)

DATE: 23/10/2063=>2063-10-05
<|END|>


In [None]:
answer = []

def get_start_end_pos(target, text):
    match = re.search(re.escape(target), text)
    if match:
        start_position = match.start()
        end_position = match.end()
        return start_position, end_position
    return None, None

for index, file_name in enumerate(tqdm(os.listdir(test_dataset))):
    with open(os.path.join(test_dataset, file_name)) as f:
        letters_counter = 0
        lines = f.readlines()
        for line_index, line in enumerate(lines):  ## 在這邊直接加入正則應該比較好
            if line == "":
                continue
            else:
                generated_text = sample_text(model, tokenizer, "<|endoftext|>"+line+"####", n_words=100)
                if(generated_text != "PHI:Null<|END|>"):
                    generated_text.replace("<|END|>", "")
                    answer.append((file_name, letters_counter, line, generated_text.split("\n")))
                letters_counter += len(line)


 51%|█████     | 283/560 [59:51<1:25:53, 18.60s/it]

In [None]:
result = []
for file_name, counter, line, contents in answer:
    for content in contents:
        if ":" in content:
            data_type, rest = content[:content.find(":")], content[content.find(":") + 1: ].strip()
            # print(rest)
            if "=>" in rest:
                content, format_date = rest[:rest.find("=>")], rest[rest.find("=>") + 2: ].strip()
            else :
                content, format_date = rest, ""
            start_pos, end_pos = get_start_end_pos(content, line)
            if start_pos is not None:
                start_pos, end_pos = start_pos + counter, end_pos + counter
                if len(format_date) > 0:
                    line_info = file_name.replace(".txt", "") + "\t" + data_type + '\t' + str(start_pos) + '\t' + str(end_pos) + '\t' + content + '\t' + format_date
                elif content.count("=>") == 0:
                    line_info = file_name.replace(".txt", "") + "\t" + data_type + '\t' + str(start_pos) + '\t' + str(end_pos) + '\t' + content
                result.append(line_info)

with open(os.path.join(pred_dst, pred_file_name), "w") as f:
    f.write("\n".join(result))