In [1]:
# !pip install islab-opendeid

### import package

In [2]:
import os

import numpy as np
from tqdm import tqdm, trange
from torch.optim import AdamW

from torch.utils.data import DataLoader
import torch
import torch.optim as optim
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_linear_schedule_with_warmup
import re
import random
import matplotlib.pyplot as plt
from torch.nn import functional as F
from torch.utils.data import Dataset

In [3]:
def set_torch_seed(seed = 0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benckmark = False
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
set_torch_seed()

def read_file(path):
    with open(path , 'r' , encoding = 'utf-8-sig') as fr:
        return fr.readlines()

### 資料處理

In [4]:
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<p>'
ner = '\n\n####\n\n'
special_tokens_dict = {'bos_token': bos,
                       'eos_token': eos,
                       'pad_token': pad,
                       'sep_token': ner}

def process_annotation_file(lines):
    '''
    處理anwser.txt 標註檔案

    output:annotation dicitonary
    '''
    print("process annotation file...")
    entity_dict = {}
    for line in lines:
        '''
        # '\n' 分行
        # '\t' 分詞
        '''
        
        items = line.strip('\n').split('\t')
        if len(items) == 5:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items[4],
            }
        elif len(items) == 6:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items[4],
                'normalize_time' : items[5],
            }
        if items[0] not in entity_dict:
            entity_dict[items[0]] = [item_dict]
        else:
            entity_dict[items[0]].append(item_dict)
    print("annotation file done")
    return entity_dict

def process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict):
    '''
    處理單個病理報告

    output : 處理完的 sequence pairs
    '''
    file_name = txt_name + '.txt'
    sents = read_file(os.path.join(medical_report_folder, file_name))
    article = "".join(sents)

    bounary , item_idx , temp_seq , seq_pairs = 0 , 0 , "" , []
    new_line_idx = 0
    for w_idx, word in enumerate(article):
        
        # 重要!!!
        if word == '"':
            article = article[:w_idx] + ' ' + article[w_idx + 1:]

        if w_idx == annos_dict[txt_name][item_idx]['st_idx']:
            phi_key = annos_dict[txt_name][item_idx]['phi']
            phi_value = annos_dict[txt_name][item_idx]['entity']
            
            if 'normalize_time' in annos_dict[txt_name][item_idx]:
                temp_seq += f"{phi_key} QAQ {phi_value}=>{annos_dict[txt_name][item_idx]['normalize_time']}   "
            else:
                temp_seq += f"{phi_key} QAQ {phi_value}   "
                    
            if item_idx == len(annos_dict[txt_name]) - 1:
                # 重要!!!
                item_idx = 0
                
                continue
            item_idx += 1
        
        if word == '\n':
            new_line_idx = w_idx + 1
            if article[bounary:new_line_idx] == '\n':
                
                # 重要!!!
                bounary = new_line_idx
                
                continue
            if temp_seq == "":
                # bounary = new_line_idx
                # continue
                temp_seq = "PHI QAQ Null"
            sentence = article[bounary:new_line_idx].strip().replace('\t' , ' ')
            temp_seq = temp_seq.strip('\\n')
            
            # 重要!!!
            # seq_pair = f"{txt_name}\t{bounary}\t{sentence}\t{temp_seq}\n"
            seq_pair = f"{bounary}\t{sentence}\t{temp_seq}\n"
            
            # seq_pair = f"{txt_name}\t{new_line_idx}\t{sentence}\t{temp_seq}\n"
            ## seq_pair = special_tokens_dict['bos_token'] + article[bounary:new_line_idx] + special_tokens_dict['sep_token'] + temp_seq + special_tokens_dict['eos_token']
            bounary = new_line_idx
            seq_pairs.append(seq_pair)
            temp_seq = ""
    return seq_pairs

def generate_annotated_medical_report_parallel(anno_file_path, medical_report_folder , tsv_output_path , num_processes = 4):
    '''
    呼叫上面的兩個function
    處理全部的病理報告和標記檔案

    output : 全部的 sequence pairs
    '''
    anno_lines = read_file(anno_file_path)
    annos_dict = process_annotation_file(anno_lines)
    txt_names = list(annos_dict.keys())

    print("processing each medical file")

    all_seq_pairs = []
    for txt_name in txt_names:
        all_seq_pairs.extend(process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict))
    # print(all_seq_pairs[:10])
    print("All medical file done")
    print("write out to tsv format...")
    with open(tsv_output_path , 'w' , encoding = 'utf-8') as fw:
        for seq_pair in all_seq_pairs:
            fw.write(seq_pair)
    print("tsv format dataset done")
    # return all_seq_pairs

anno_info_path_First = r"data/First_Phase_Release(Correction)/answer.txt"
report_folder_First = r"data/First_Phase_Release(Correction)/First_Phase_Text_Dataset"
tsv_output_path_First = r"data/First_Phase_Release(Correction)/train.tsv"
generate_annotated_medical_report_parallel(anno_info_path_First, report_folder_First, tsv_output_path_First, num_processes = 4)

anno_info_path_Second = r"data/Second_Phase_Dataset/answer.txt"
report_folder_Second = r"data/Second_Phase_Dataset/Second_Phase_Text_Dataset"
tsv_output_path_Second = r"data/Second_Phase_Dataset/train.tsv"
generate_annotated_medical_report_parallel(anno_info_path_Second, report_folder_Second, tsv_output_path_Second, num_processes = 4)

process annotation file...
annotation file done
processing each medical file
All medical file done
write out to tsv format...
tsv format dataset done
process annotation file...
annotation file done
processing each medical file
All medical file done
write out to tsv format...
tsv format dataset done


In [5]:
training_data_total = r"data/trainTotal.tsv"

with open(tsv_output_path_First , 'r' , encoding = 'utf-8') as fw:
    content1 = fw.read()

with open(tsv_output_path_Second , 'r' , encoding = 'utf-8') as fw:
    content2 = fw.read()

merged_content = content1 + '\n' + content2

with open(training_data_total , 'w' , encoding = 'utf-8') as fw:
    fw.write(merged_content)

In [6]:
anno_lines = read_file(anno_info_path_First)
annos_dict = process_annotation_file(anno_lines)

process annotation file...
annotation file done


In [7]:
len(annos_dict)

1120

In [8]:
annos_dict['100'][0]

{'phi': 'MEDICALRECORD', 'st_idx': 1, 'ed_idx': 12, 'entity': '9324677.BOP'}

### Read Tsv Dataset

In [9]:
from datasets import load_dataset, Features, Value

dataset = load_dataset('csv', data_files = training_data_total, delimiter = '\t',
                       features = Features({
                              'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['idx', 'content', 'label'], keep_default_na=False)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'content', 'label'],
        num_rows: 78575
    })
})

In [11]:
dataset['train']

Dataset({
    features: ['idx', 'content', 'label'],
    num_rows: 78575
})

In [12]:
dataset['train'][1209]

{'idx': 4344, 'content': 'Material received:', 'label': 'PHI QAQ Null'}

In [13]:
print(len(dataset['train']))

'''
1195 o

1209
1210

1270 x

好像雙引號會出問題"\t \n"

'''

78575


'\n1195 o\n\n1209\n1210\n\n1270 x\n\n好像雙引號會出問題"\t \n"\n\n'

### Dataloader Sample

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

plm = "MBZUAI/LaMini-GPT-124M" #"EleutherAI/pythi a-70m-deduped"

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<p>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}
# special_tokens_dict = {'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm)
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

<p>: 50259


In [15]:
list(dataset['train'])[0]

{'idx': 1,
 'content': 'Episode No:  09F016547J',
 'label': 'IDNUM QAQ 09F016547J   '}

In [16]:
'''
Test Region.

input_ids：
這是 tokenized 後的文本，
其中每個 token 被映射為相應的 token ID。
在 BERT 或其他 Transformer 模型中，
每個 token ID 將對應到模型的詞彙表中的一個索引。
input_ids 是模型接受的主要輸入。

attention_mask：
這是一個二進制的遮罩，
用於指示模型在進行 self-attention 計算時應該考慮還是忽略相應的位置。
如果某個位置是 padding 的，則對應的 attention mask 將為 0（忽略），否則為 1（考慮）。
這是為了確保模型不會在 padding 的位置上產生意外的注意力。
'''

# def collate_batch_with_prompt_template(batch, tokenizer, template = "__CONTENT__\n\n####\n\n__LABEL__", IGNORED_PAD_IDX = -100):
#     # default template: {bos} {data['content']} {sep}
#     return 0
#     texts = [template.replace("__LABEL__", data['label']).replace("__CONTENT__", data['content']) for data in list(batch)]
#     encoded_seq = tokenizer(texts, padding=True)
    
#     indexed_tks = torch.tensor(encoded_seq['input_ids'])
#     attention_mask = torch.tensor(encoded_seq['attention_mask'])
#     encoded_label = torch.tensor(encoded_seq['input_ids'])
#     encoded_label[encoded_label == tokenizer.pad_token_id] = IGNORED_PAD_IDX
    
#     return indexed_tks, encoded_label, attention_mask

# train_data = list(dataset['train'])
# train_dataloader = DataLoader(train_data, batch_size=3, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
# train_dataloader_x = DataLoader(train_data, batch_size=3, shuffle=False)
# titer = iter(train_dataloader)

'\nTest Region.\n\ninput_ids：\n這是 tokenized 後的文本，\n其中每個 token 被映射為相應的 token ID。\n在 BERT 或其他 Transformer 模型中，\n每個 token ID 將對應到模型的詞彙表中的一個索引。\ninput_ids 是模型接受的主要輸入。\n\nattention_mask：\n這是一個二進制的遮罩，\n用於指示模型在進行 self-attention 計算時應該考慮還是忽略相應的位置。\n如果某個位置是 padding 的，則對應的 attention mask 將為 0（忽略），否則為 1（考慮）。\n這是為了確保模型不會在 padding 的位置上產生意外的注意力。\n'

In [17]:
# print(type(train_dataloader))
# print(type(train_dataloader_x))

# print("---" * 30)
# for data in train_dataloader:
#     print(data)
#     break

# print("---" * 30)
# for data in train_dataloader_x:
#     print(data)
#     break
    
'''
三個(batch_size)三個放一起，放在字典的value list裡面
'''

'\n三個(batch_size)三個放一起，放在字典的value list裡面\n'

In [18]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

train_data = list(dataset['train'])
train_dataloader = DataLoader(train_data, batch_size=5, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

'''
這部分將 train_dataloader 轉換為迭代器（titer），並使用 next(titer) 提取下一個資料批次。
批次包含三個部分：tks、labels 和 masks。tks 似乎是經過標記的輸入資料，而 labels 可能是相應的標籤。
print(tks.shape) 印出 tks 的形狀。
'''

torch.Size([5, 28])


'\n這部分將 train_dataloader 轉換為迭代器（titer），並使用 next(titer) 提取下一個資料批次。\n批次包含三個部分：tks、labels 和 masks。tks 似乎是經過標記的輸入資料，而 labels 可能是相應的標籤。\nprint(tks.shape) 印出 tks 的形狀。\n'

In [19]:
results = tokenizer(
    [f"9364819.RAN\\nMINTANIA, JEFFRY {sep} ID: 9364819.RAN\\nNAME: MINTANIA, JEFFRY",
     f"This is a sentence {sep} PHI: NULL"],
    padding=True
)
print(results['attention_mask'][0])
print(results['attention_mask'][1])
print("---" * 30)
print(tokenizer.decode(results['input_ids'][0]))
print("---" * 30)
print(tokenizer.decode(results['input_ids'][1]))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
------------------------------------------------------------------------------------------

9364819.RAN\nMINTANIA, JEFFRY 

####

 ID: 9364819.RAN\nNAME: MINTANIA, JEFFRY
------------------------------------------------------------------------------------------
<p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p><p>This is a sentence 

####

 PHI: NULL


### DataLoader For training

In [20]:
from islab.aicup import OpenDeidBatchSampler

# BATCH_SIZE = 6
BATCH_SIZE = 5

bucket_train_dataloader = DataLoader(train_data,
                                     batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
                                     pin_memory=True)
# , num_workers=4


In [21]:
# the model config to which we add the special tokens
from transformers import AutoConfig
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, config=config)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [22]:
import os
import gc
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_VISIBLE_DEVICES']='0'
os.environ['TORCH_USE_CUDA_DSA'] = '1'
import torch

In [23]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCHS = 30 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(),lr=3e-5) # YOU CAN ADJUST LEARNING RATE
# optimizer = AdamW(model.parameters(),lr=3e-5)

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)

In [24]:
def sample_text(model, tokenizer, text, n_words=100):
    '''
    input : model, tokenizer, text(句子 string), n_words(生成字數限制)
    output : 模型預測結果 (string)
    '''
    model.eval()
    text = tokenizer.encode(text)
    inputs, past_key_values = torch.tensor([text]).to(device), None

    with torch.no_grad():
        for _ in range(n_words):
            out = model(inputs, past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            text.append(inputs.item())
            if tokenizer.decode(inputs.item()) == eos:
                break

    return tokenizer.decode(text)

In [25]:
device

device(type='cuda')

In [26]:
from tqdm import tqdm, trange

# 模型儲存資料夾名稱
model_name = "data/First_Phase_Release(Correction)/models/GPT2LMHeadModel"
# 模型儲存路徑
model_dir = f"{model_name}"

accumulation_steps = 2
if not os.path.isdir(model_dir):
     os.mkdir(model_dir)
min_loss = float('inf')  # 使用 float('inf') 表示正無窮
model = model.to(device)

# 如果有多個 GPU，將模型擴展到多個 GPU 上
if torch.cuda.device_count() > 1:
    #print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
    
predict_text = special_tokens_dict['bos_token'] + "MANILDRA  NSW  2865"

# 模型訓練開始
for epoch in trange(EPOCHS, desc="Epoch"):
    #print('epoch=', epoch)
    model.train()
    total_loss = 0

    for step, (seqs, labels, masks) in enumerate(tqdm(bucket_train_dataloader)):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()
        total_loss += loss.item()
        loss = loss / accumulation_steps
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()
        #del seqs
        #del labels
        #del masks
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    print(sample_text(model, tokenizer, text=predict_text))
    torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': min_loss,
                }, os.path.join(model_dir , 'GPT2LMHeadModel_Finial.pt'))
    if avg_train_loss < min_loss:
        min_loss = avg_train_loss
        torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': min_loss,
                }, os.path.join(model_dir , 'GPT2LMHeadModel_best.pt'))
    torch.cuda.empty_cache()

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


## When training crashes, call the last model to continue unfinished training.

In [None]:
from tqdm import tqdm, trange

# 模型儲存資料夾名稱
model_name = "data/First_Phase_Release(Correction)/models/GPT2LMHeadModel"
# 模型儲存路徑
model_dir = f"{model_name}"

checkpoint = torch.load(os.path.join(model_dir , 'GPT2LMHeadModel_best.pt'))

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epo = checkpoint['epoch']
min_loss = checkpoint['loss']
model = model.to(device)

accumulation_steps = 2
if not os.path.isdir(model_dir):
     os.mkdir(model_dir)

model = model.to(device)

# 如果有多個 GPU，將模型擴展到多個 GPU 上
if torch.cuda.device_count() > 1:
    #print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
    
predict_text = special_tokens_dict['bos_token'] + "MANILDRA  NSW  2865"

# 模型訓練開始
for epoch in trange(EPOCHS-epo, desc="Epoch"):
    model.train()
    total_loss = 0

    for step, (seqs, labels, masks) in enumerate(tqdm(bucket_train_dataloader)):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()
        total_loss += loss.item()
        loss = loss / accumulation_steps
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()
        #del seqs
        #del labels
        #del masks
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    print(sample_text(model, tokenizer, text=predict_text))
    torch.save({
                'epoch': epoch+epo+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                }, os.path.join(model_dir , 'GPT2LMHeadModel_Finial.pt'))
    if avg_train_loss < min_loss:
        min_loss = avg_train_loss
        torch.save({
                'epoch': epoch+epo+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                }, os.path.join(model_dir , 'GPT2LMHeadModel_best.pt'))
    torch.cuda.empty_cache()

Epoch: 0it [00:00, ?it/s]


In [None]:
model_name = "data/First_Phase_Release(Correction)/models/GPT2LMHeadModel"
# 模型儲存路徑
model_dir = f"{model_name}"

load_model = torch.load(os.path.join(model_dir , 'GPT2LMHeadModel_best.pt'))
model.load_state_dict(load_model['model_state_dict'])
model = model.to(device)

def sample_text(model, tokenizer, text, n_words=20):
    model.eval()
    text = tokenizer.encode(text)
    inputs, past_key_values = torch.tensor([text]).to(device), None

    with torch.no_grad():
        for _ in range(n_words):
            out = model(inputs, past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values
            log_probs = F.softmax(logits[:, -1], dim=-1)
            inputs = torch.multinomial(log_probs, 1)
            text.append(inputs.item())
            if tokenizer.decode(inputs.item()) == eos:
                break

    return tokenizer.decode(text)

text = "D.O.B:  29/9/2000"
print(sample_text(model, tokenizer, text=text , n_words=20))

D.O.B:  29/9/2000

####

<|endoftext|> Bid<|endoftext|> De<|endoftext|> T<|endoftext|> B<|endoftext|> D<|endoftext|> D<|endoftext|> H<|endoftext|> O<|endoftext|> Result<|endoftext|>


In [None]:
def process_valid_data(test_txts , out_file):
    with open(out_file , 'w' , encoding = 'utf-8') as fw:
        for txt in test_txts:
            m_report = read_file(txt)
            boundary = 0
            # temp = ''.join(m_report)

            # for Windows
            fid = txt.split('\\')[-1].replace('.txt' , '')
            
            # for linux
            # fid = txt.split('/')[-1].replace('.txt' , '')
            
            for idx,sent in enumerate(m_report):
                if sent.replace(' ' , '').replace('\n' , '').replace('\t' , '') != '':
                    sent = sent.replace('\t' , ' ')
                    fw.write(f"{fid}\t{boundary}\t{sent}\n")
                # else:
                #     print(f"{fid}\t{boundary}\t{sent}\n")
                #     assert 1==2
                boundary += len(sent)

test_phase_path = r'data/First_Phase_Release(Correction)/Validation_Release'
valid_out_file_path = './valid.tsv'
test_txts = list(map(lambda x:os.path.join(test_phase_path , x) , os.listdir(test_phase_path)))
test_txts = sorted(test_txts)
valid_data = process_valid_data(test_txts , valid_out_file_path)

In [None]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files=valid_out_file_path, delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
# valid_list

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_phi_category = ['PATIENT', 'DOCTOR', 'USERNAME',
             'PROFESSION',
             'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
             'AGE',
             'DATE', 'TIME', 'DURATION', 'SET',
             'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDR',
             'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE', 'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM']

'''
pred 是一個字串，代表預測的文本或資訊。
sep 是用來分隔文本的分隔符號。
pad 是需要從文本中移除的填充符號。
eos 是需要從文本中移除的結束符號。
這行程式碼的執行步驟如下：

pred.index(sep) 找到 sep 在 pred 中的索引位置。
pred[pred.index(sep) + len(sep):] 從找到的索引位置之後的部分，即分隔符號後的文本。
.replace(pad, "") 移除文本中的填充符號。
.replace(eos, "") 移除文本中的結束符號。
.strip() 移除文本兩側的空白字符。
最終的結果存儲在 phi_infos 變數中。
'''

def get_anno_format(sentence , infos , boundary):
    # 整行句子
    # 預測的 PHI content
    # 整行句子的起始位置
    
    anno_list = []
    # lines = infos.split("\n") # 多個PHI type出現在同一行，{PHI type:PHI content\nPHI type:PHI content\n...}
    lines = infos.split("   ")
    
    normalize_keys = ['DATE' , "TIME" , "DURATION" , "SET"]
    phi_dict = {}
    
    for line in lines:
        # print("------------------------------------line：\n", line)

        # 重要!!!
        parts = line.split(" QAQ ")
        # print("------------------------------------parts(split line)：\n", parts)
        
        if parts[0] not in train_phi_category or parts[1] == '':
            # print("有空的喔")
            continue

        # 一行句子裡，只有一個 PHI content
        # 重要!!!
        if len(parts) == 2:
            if parts[0] not in phi_dict:
                phi_dict[parts[0]] = [parts[1].strip()]
            else:
                phi_dict[parts[0]].append(parts[1].strip())
        # print("------------------------------------phi_dict：\n", phi_dict)

    for phi_key, phi_values in phi_dict.items():
        # 重要!!!(LIST)
        for phi_value in phi_values:
            # print("------------------------------------phi_key and phi_value：\n{}\n{}".format(phi_key, phi_value))
            # print("------------------------------------sentence：\n", sentence)
            
            normalize_time = None
            
            # 標準化時間項
            if phi_key in normalize_keys:
                if '=>' in phi_value:
                    temp_phi_values = phi_value.split('=>')
                    phi_value = temp_phi_values[0]
                    normalize_time = temp_phi_values[-1]
                else:
                    normalize_time = phi_value
    
            
            try:
                matches = [(match.start(), match.end()) for match in re.finditer(phi_value, sentence)]
            except:
                continue
            # print("------------------------------------matches：\n", matches)
        
            for start, end in matches:
                if start == end:
                    continue
                item_dict = {
                            'phi' : phi_key,
                            'st_idx' : start + int(boundary),
                            'ed_idx' : end + int(boundary),
                            'entity' : phi_value,
                }
                
                if normalize_time is not None:
                    item_dict['normalize_time'] = normalize_time

                # print("------------------------------------item_dict：\n", item_dict)

                if item_dict not in anno_list:
                    anno_list.append(item_dict) 
                    
    return anno_list

def aicup_predict(model, tokenizer, input, template = "<|endoftext|> __CONTENT__\n\n####\n\n"):
    seeds = [template.replace("__CONTENT__", data['content']) for data in input]
    # print("------------------------------------Input seeds：\n", seeds)
    
    sep = tokenizer.sep_token
    eos = tokenizer.eos_token
    pad = tokenizer.pad_token
    pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    
    """Generate text from a trained model."""
    model.eval()
    device = model.device
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
    outputs = []
    
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = pad_idx,
                                        eos_token_id=tokenizer.convert_tokens_to_ids(eos))
        
        preds = tokenizer.batch_decode(output_tokens)
        # print("------------------------------------preds：\n", preds)
        
        for idx , pred in enumerate(preds):
            if "NULL" in pred:
                continue

            # print("------------------------------------pred：\n", pred)
            
            phi_infos = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(bos, "").replace(eos, "").strip()
            # print("------------------------------------phi_infos：\n", phi_infos)
            
            annotations = get_anno_format(input[idx]['content'] , phi_infos , input[idx]['idx'])
            # print("------------------------------------annotations：\n", annotations)

            for annotation in annotations:
                if 'normalize_time' in annotation:
                    outputs.append(f'{input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_idx"]}\t{annotation["ed_idx"]}\t{annotation["entity"]}\t{annotation["normalize_time"]}')
                else:
                    outputs.append(f'{input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_idx"]}\t{annotation["ed_idx"]}\t{annotation["entity"]}')
    return outputs

In [None]:
# # from tqdm import tqdm

# import io

# with torch.no_grad():
#     seeds = [{'fid': '1066',
#              'idx': 2266,
#              'content': 'Tissue extremely difficult to dissociate and ?necrotic: regrettably, no mitoses were available for cytogenetic analysis.  Please advise the Laboratory (8382 9154) should interphase FISH be on paraffin-embedded tissue be of assistance in this case.',
#              'label': None}]

#     outputs = aicup_predict(model, tokenizer, input=seeds)
#     for output in outputs:
#         print(output)

In [None]:
from tqdm import tqdm

import io
BATCH_SIZE = 1

with open("./answer.txt", 'w', encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict(model, tokenizer, input = seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')

  0%|          | 30/25739 [00:17<4:13:30,  1.69it/s]


KeyboardInterrupt: 