In [1]:
# !pip install transformers huggingface datasets tqdm islab-opendeid torch torchvision

In [2]:
import os
os.chdir('/home/jupyter/aicup-meddata-pp')

In [11]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from torch.optim import AdamW

from torch.utils.data import DataLoader
import torch
import torch.optim as optim
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, get_linear_schedule_with_warmup
import re
import random
import matplotlib.pyplot as plt
from torch.nn import functional as F
from torch.utils.data import Dataset

In [4]:
def set_torch_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benckmark = False
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
set_torch_seed()

def read_file(path):
    with open(path , 'r' , encoding = 'utf-8-sig') as fr:
        return fr.readlines()

  return torch._C._cuda_getDeviceCount() > 0


In [5]:
!mkdir -p content
!yes | unzip ./content/First_Phase_ReleaseCorrection.zip -d ./content
!yes | unzip ./content/Second_Phase_Dataset.zip -d ./content
!yes | unzip ./content/Validation_Dataset_Answer.zip -d ./content

!mv ./content/First_Phase_Release\(Correction\) ./content/First_Phase_Release

Archive:  ./content/First_Phase_ReleaseCorrection.zip
replace ./content/First_Phase_Release(Correction)/answer.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: ./content/First_Phase_Release(Correction)/answer.txt  
replace ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/10.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/10.txt  
replace ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/100.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/100.txt  
replace ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/101.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/101.txt  
replace ./content/First_Phase_Release(Correction)/First_Phase_Text_Dataset/102.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: ./content

# 資料處理

In [7]:
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
ner = '\n\n####\n\n'
special_tokens_dict = {'bos_token': bos,
                       'eos_token': eos,
                       'pad_token': pad,
                       'sep_token': ner}

def process_annotation_file(lines):
    '''
    處理anwser.txt 標註檔案

    output:annotation dicitonary
    '''
    print("process annotation file...")
    entity_dict = {}
    for line in lines:
        items = line.strip('\n').split('\t')
        if len(items) == 5:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items[4],
            }
        elif len(items) == 6:
            item_dict = {
                'phi' : items[1],
                'st_idx' : int(items[2]),
                'ed_idx' : int(items[3]),
                'entity' : items[4],
                'normalize_time' : items[5],
            }
        if items[0] not in entity_dict:
            entity_dict[items[0]] = [item_dict]
        else:
            entity_dict[items[0]].append(item_dict)
    print("annotation file done")
    return entity_dict

def process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict):
    '''
    處理單個病理報告

    output : 處理完的 sequence pairs
    '''
    file_name = txt_name + '.txt'
    sents = read_file(os.path.join(medical_report_folder, file_name))
    article = "".join(sents)

    bounary , item_idx , temp_seq , seq_pairs = 0 , 0 , "" , []
    new_line_idx = 0
    for w_idx, word in enumerate(article):
        if word == '\n':
            new_line_idx = w_idx + 1
            if article[bounary:new_line_idx] == '\n':
                continue
            if temp_seq == "":
                temp_seq = "PHI:Null"
            sentence = article[bounary:new_line_idx].strip().replace('\t' , ' ')
            temp_seq = temp_seq.strip('\\n')
            seq_pair = f"{txt_name}\t{new_line_idx}\t{sentence}\t{temp_seq}\n"
            # seq_pair = special_tokens_dict['bos_token'] + article[bounary:new_line_idx] + special_tokens_dict['sep_token'] + temp_seq + special_tokens_dict['eos_token']
            bounary = new_line_idx
            seq_pairs.append(seq_pair)
            temp_seq = ""
        if w_idx == annos_dict[txt_name][item_idx]['st_idx']:
            phi_key = annos_dict[txt_name][item_idx]['phi']
            phi_value = annos_dict[txt_name][item_idx]['entity']
            if 'normalize_time' in annos_dict[txt_name][item_idx]:
                temp_seq += f"{phi_key}:{phi_value}=>{annos_dict[txt_name][item_idx]['normalize_time']}\\n"
            else:
                temp_seq += f"{phi_key}:{phi_value}\\n"
            if item_idx == len(annos_dict[txt_name]) - 1:
                continue
            item_idx += 1
    return seq_pairs

def generate_annotated_medical_report_parallel(anno_file_path, medical_report_folder , tsv_output_path , num_processes=4):
    '''
    呼叫上面的兩個function
    處理全部的病理報告和標記檔案

    output : 全部的 sequence pairs
    '''
    anno_lines = read_file(anno_file_path)
    annos_dict = process_annotation_file(anno_lines)
    txt_names = list(annos_dict.keys())

    print("processing each medical file")

    all_seq_pairs = []
    for txt_name in txt_names:
        all_seq_pairs.extend(process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict))
    print(all_seq_pairs[:10])
    print("All medical file done")
    print("write out to tsv format...")
    with open(tsv_output_path , 'w' , encoding = 'utf-8') as fw:
        for seq_pair in all_seq_pairs:
            fw.write(seq_pair)
    print("tsv format dataset done")
    # return all_seq_pairs

In [10]:
# POYING: Change the path here
# Phase 1 dataset
anno_info_path = r"./content/First_Phase_Release/answer.txt"
report_folder = r"./content/First_Phase_Release/First_Phase_Text_Dataset"

tsv_output_path = './train1.tsv'
generate_annotated_medical_report_parallel(anno_info_path, report_folder, tsv_output_path, num_processes=4)

# Phase 2 dataset
anno_info_path = r"./content/Second_Phase_Dataset/answer.txt"
report_folder = r"./content/Second_Phase_Dataset/Second_Phase_Text_Dataset"

tsv_output_path = './train2.tsv'
generate_annotated_medical_report_parallel(anno_info_path, report_folder, tsv_output_path, num_processes=4)


process annotation file...
annotation file done
processing each medical file
['10\t25\tEpisode No:  09F016547J\tIDNUM:09F016547J\n', '10\t36\t091016.NMT\tMEDICALRECORD:091016.NMT\n', '10\t52\tSIZAR, HOWARD\tPATIENT:SIZAR, HOWARD\n', '10\t70\tLab No:  09F01654\tIDNUM:09F01654\n', '10\t78\tRunford\tSTREET:Runford\n', '10\t97\tRENMARK  TAS  5084\tCITY:RENMARK\\nSTATE:TAS\\nZIP:5084\n', '10\t114\tSpecimen: Tissue\tPHI:Null\n', '10\t132\tD.O.B:  24/8/1993\tDATE:24/8/1993=>1993-08-24\n', '10\t140\tSex:  M\tPHI:Null\n', '10\t171\tCollected: 28/08/2013 at 08:26\tTIME:28/08/2013 at 08:26=>2013-08-28T08:26\n']
All medical file done
write out to tsv format...
tsv format dataset done
process annotation file...
annotation file done
processing each medical file
['1093\t25\tEpisode No:  48B915480A\tIDNUM:48B915480A\n', '1093\t37\t4809154.WAA\tMEDICALRECORD:4809154.WAA\n', '1093\t58\tOtterbine, Laverne\tPATIENT:Otterbine, Laverne\n', '1093\t85\tLab No:  48B91548,48B91548\tIDNUM:48B91548\\nIDNUM:48B915

In [36]:
df1 = pd.read_csv('./train1.tsv', header=None, delimiter='\t')
df2 = pd.read_csv('./train2.tsv', header=None, delimiter='\t')
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)#.rename(columns={0:'file', 1:'start_position', 2:'text', 3:'phi'})
df3.to_csv('./train3.tsv', header=False, index=False, sep='\t')

In [9]:
df = pd.read_csv('./train3.tsv', header=None, delimiter='\t')
df

Unnamed: 0,0,1,2,3
0,10,25,Episode No: 09F016547J,IDNUM:09F016547J
1,10,36,091016.NMT,MEDICALRECORD:091016.NMT
2,10,52,"SIZAR, HOWARD","PATIENT:SIZAR, HOWARD"
3,10,70,Lab No: 09F01654,IDNUM:09F01654
4,10,78,Runford,STREET:Runford
...,...,...,...,...
81284,753,4376,Maximum depth of invasion: 8 mm into a wall 15...,PHI:Null
81285,753,4416,No evidence of lymphovascular invasion,PHI:Null
81286,753,4484,"Cervix, fallopian tubes and ovaries show no ev...",PHI:Null
81287,753,4562,"B to E. Left and right external iliac, left an...",PHI:Null


### Read Tsv Dataset

In [10]:
from datasets import load_dataset, Features, Value

dataset = load_dataset("csv", data_files="./train3.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['fid', 'idx', 'content', 'label'],
        num_rows: 81289
    })
})

### Dataloader Sample

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

plm = "EleutherAI/pythia-70m" #"EleutherAI/pythia-70m-deduped"

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

<|pad|>: 50278


In [15]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

train_data = list(dataset['train'])
train_dataloader = DataLoader(train_data, batch_size=3, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([3, 23])


(tensor([[50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278,     0, 10118,  1621,    27, 50276,  2693,    39,
            520, 29195, 50279,  1838, 20872,    27,  2693,    39,   520, 29195,
            209, 50277],
         [50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
              0, 14311,  4379, 50279,  1267,  1848,  2025,    27, 12965,  4379,
            209, 50277],
         [    0,   416,  1400, 42525, 50276,    53,  1719, 50276,  1235,  2759,
          50279,    36,  7400,    27,    51,  1400, 42525,    61,    79, 19247,
             27,    53,  1719,    61,    79,    59,  3123,    27,  1235,  2759,
            209, 50277]]),
 tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,     0, 10118,  1621,    27, 50276,  2693,    39,
            520, 29195, 50279,  1838, 20872

In [16]:
results = tokenizer(
    [f"{bos} 9364819.RAN\\nMINTANIA, JEFFRY {sep} ID: 9364819.RAN\\nNAME: MINTANIA, JEFFRY {eos}",
     f"{bos} This is a sentence {sep} PHI: NULL {eos}"],
    padding=True
)
print(results['attention_mask'][0])
print(results['attention_mask'][1])
print(tokenizer.decode(results['input_ids'][0]))
print(tokenizer.decode(results['input_ids'][1]))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<|endoftext|> 9364819.RAN\nMINTANIA, JEFFRY 

####

 ID: 9364819.RAN\nNAME: MINTANIA, JEFFRY <|END|>
<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|endoftext|> This is a sentence 

####

 PHI: NULL <|END|>


### DataLoader For training

In [50]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name()}")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU.")

No GPU available, using CPU.


In [17]:
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 8
bucket_train_dataloader = DataLoader(train_data,
                                     batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
                                     pin_memory=True)

In [18]:
train_data

[{'fid': '10',
  'idx': 25,
  'content': 'Episode No:  09F016547J',
  'label': 'IDNUM:09F016547J'},
 {'fid': '10',
  'idx': 36,
  'content': '091016.NMT',
  'label': 'MEDICALRECORD:091016.NMT'},
 {'fid': '10',
  'idx': 52,
  'content': 'SIZAR, HOWARD',
  'label': 'PATIENT:SIZAR, HOWARD'},
 {'fid': '10',
  'idx': 70,
  'content': 'Lab No:  09F01654',
  'label': 'IDNUM:09F01654'},
 {'fid': '10', 'idx': 78, 'content': 'Runford', 'label': 'STREET:Runford'},
 {'fid': '10',
  'idx': 97,
  'content': 'RENMARK  TAS  5084',
  'label': 'CITY:RENMARK\\nSTATE:TAS\\nZIP:5084'},
 {'fid': '10', 'idx': 114, 'content': 'Specimen: Tissue', 'label': 'PHI:Null'},
 {'fid': '10',
  'idx': 132,
  'content': 'D.O.B:  24/8/1993',
  'label': 'DATE:24/8/1993=>1993-08-24'},
 {'fid': '10', 'idx': 140, 'content': 'Sex:  M', 'label': 'PHI:Null'},
 {'fid': '10',
  'idx': 171,
  'content': 'Collected: 28/08/2013 at 08:26',
  'label': 'TIME:28/08/2013 at 08:26=>2013-08-28T08:26'},
 {'fid': '10',
  'idx': 230,
  'conten

In [45]:
from transformers import AutoConfig
# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

config.json: 100%|██████████| 567/567 [00:00<00:00, 1.65MB/s]
pytorch_model.bin: 100%|██████████| 166M/166M [00:09<00:00, 17.1MB/s] 


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [46]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCHS = 10 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(),lr=3e-5) # YOU CAN ADJUST LEARNING RATE

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [49]:
from tqdm import tqdm,trange
# 模型儲存資料夾名稱
model_name = "AutoModelForCausalLM"
# 模型儲存路徑
model_dir = f"./models/{model_name}"
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)
min_loss = 9999

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_Finial.pt'))
    if avg_train_loss < min_loss:
        min_loss = avg_train_loss
        torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [02:24<?, ?it/s]


KeyboardInterrupt: 

In [3]:
import pandas as pd
df1 = pd.read_csv('train1.tsv', delimiter='\t', header=None)
df1

Unnamed: 0,0,1,2,3
0,10,25,Episode No: 09F016547J,IDNUM:09F016547J
1,10,36,091016.NMT,MEDICALRECORD:091016.NMT
2,10,52,"SIZAR, HOWARD","PATIENT:SIZAR, HOWARD"
3,10,70,Lab No: 09F01654,IDNUM:09F01654
4,10,78,Runford,STREET:Runford
...,...,...,...,...
53208,file9965,18412,SpecimenReceivedDate,PHI:Null
53209,file9965,18432,2512-10-20 00:00:00,TIME:2512-10-20 00:00:00=>2512-10-20T00:00:00
53210,file9965,18434,,PHI:Null
53211,file9965,18443,LastName,PHI:Null


In [7]:
df1.iloc[:,3].apply(lambda x: x.split(':')[0]).unique()

array(['IDNUM', 'MEDICALRECORD', 'PATIENT', 'STREET', 'CITY', 'PHI',
       'DATE', 'TIME', 'DEPARTMENT', 'DOCTOR', 'ORGANIZATION', 'AGE',
       'HOSPITAL', 'DURATION', 'SET', 'COUNTRY', 'URL', 'LOCATION-OTHER',
       'STATE', 'ZIP', 'PHONE'], dtype=object)