# Dataset

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [9]:
from datasets import load_dataset, Features, Value
# dataset = load_dataset("csv", data_files="content/opendid_set1.tsv", delimiter='\t',
dataset = load_dataset("csv", data_files="content/opendid_train_competion.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['fid', 'idx', 'content', 'label'],
        num_rows: 6712
    })
})

In [11]:
dataset['train'][0]

{'fid': '1097',
 'idx': 1,
 'content': '433475.RDC',
 'label': 'MEDICALRECORD: 433475.RDC'}

In [12]:
dataset['train'][1]

{'fid': '1135', 'idx': 124, 'content': '3/6/1989', 'label': 'DATE: 1989-06-03'}

In [13]:
dataset['train'][7]

{'fid': '1135',
 'idx': 152,
 'content': '07/08/2062 at 12:20',
 'label': 'TIME: 2062-08-07T12:20'}

In [15]:
dataset['train'][6711]

{'fid': 'file66968',
 'idx': 8,
 'content': '22Q6010968',
 'label': 'MEDICALRECORD: 22Q6010968'}

For demonstration purpose, we only use the randomly sampled 20000 instances.

In [17]:
import torch
sub_datasets = torch.utils.data.random_split(dataset['train'],[6000,712])
print(len(sub_datasets[0]))
for i in range(4): print(sub_datasets[0][i])

6000
{'fid': '2101', 'idx': 24, 'content': '3494718.LHD', 'label': 'MEDICALRECORD: 3494718.LHD'}
{'fid': '2073', 'idx': 90, 'content': 'ACT', 'label': 'STATE: ACT'}
{'fid': 'file58549', 'idx': 27, 'content': '0009014', 'label': 'MEDICALRECORD: 0009014'}
{'fid': '2444', 'idx': 87, 'content': 'WANGARATTA', 'label': 'CITY: WANGARATTA'}


# Data loader

In [18]:
!pip install transformers



In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

plm = "EleutherAI/pythia-70m-deduped" #"EleutherAI/pythia-70m-deduped"
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

<|pad|>: 50278


In [20]:
!pip install islab-opendeid

Collecting islab-opendeid
  Downloading islab_opendeid-0.0.1.1-py3-none-any.whl (3.0 kB)
Installing collected packages: islab-opendeid
Successfully installed islab-opendeid-0.0.1.1


In [21]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

train_data = list(sub_datasets[0])
train_dataloader = DataLoader(train_data, batch_size=3, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([3, 21])


(tensor([[50278, 50278, 50278, 50278,     0,   411, 14375,  1277,  1194,  7656,
          50279,    36,  7400,    27,   411, 14375,  1277,  1194,  7656,   209,
          50277],
         [50278, 50278, 50278, 50278,     0,   854,  2759,  1036, 32745, 50279,
          19047, 25129, 34181,  8242,    27,   854,  2759,  1036, 32745,   209,
          50277],
         [    0,  3436,    45,  2090,  1787,  1630,    51, 50279, 19047, 25129,
          34181,  8242,    27,  3436,    45,  2090,  1787,  1630,    51,   209,
          50277]]),
 tensor([[ -100,  -100,  -100,  -100,     0,   411, 14375,  1277,  1194,  7656,
          50279,    36,  7400,    27,   411, 14375,  1277,  1194,  7656,   209,
          50277],
         [ -100,  -100,  -100,  -100,     0,   854,  2759,  1036, 32745, 50279,
          19047, 25129, 34181,  8242,    27,   854,  2759,  1036, 32745,   209,
          50277],
         [    0,  3436,    45,  2090,  1787,  1630,    51, 50279, 19047, 25129,
          34181,  8242,    2

In [22]:
results = tokenizer(["Lab No: 14H02780", "“STOCKDALE” 653 MONAGHAN RD"], padding=True)
print(results['input_ids'])
print()
print(results['input_ids'][0])
print(tokenizer.decode(results['input_ids'][0]))
print(results['input_ids'][1])
print(tokenizer.decode(results['input_ids'][1]))

[[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438], [1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]]

[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438]
<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>Lab No: 14H02780
[1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]
“STOCKDALE” 653 MONAGHAN RD


In [23]:
results = tokenizer(
    [f"{bos} 9364819.RAN\\nMINTANIA, JEFFRY {sep} ID: 9364819.RAN\\nNAME: MINTANIA, JEFFRY {eos}",
     f"{bos} This is a sentence {sep} PHI: NULL {eos}"],
    padding=True
)
print(results['attention_mask'][0])
print(results['attention_mask'][1])
print(tokenizer.decode(results['input_ids'][0]))
print(tokenizer.decode(results['input_ids'][1]))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<|endoftext|> 9364819.RAN\nMINTANIA, JEFFRY 

####

 ID: 9364819.RAN\nNAME: MINTANIA, JEFFRY <|END|>
<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|endoftext|> This is a sentence 

####

 PHI: NULL <|END|>


In [24]:
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 8
bucket_train_dataloader = DataLoader(train_data, batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
                                     pin_memory=True)

for idx, batch in enumerate(bucket_train_dataloader):
    print(batch)
    print(batch[0].shape)
    print(batch[1].shape)
    break

[tensor([[    0, 23564,    42,    14, 43724,  1194, 11477, 49101,    14,    44,
          1277,    48,  1139,  4877,  4889, 34928, 43148, 11364,  6117,     8,
         25719,  1372, 13859,   388,  2697,    49, 35573, 50279,    41,  2697,
            49, 35573,    27, 23564,    42,    14, 43724,  1194, 11477, 49101,
            14,    44,  1277,    48,  1139,  4877,  4889, 34928, 43148, 11364,
          6117,     8, 25719,  1372, 13859,   388,  2697,    49, 35573,   209,
         50277],
        [50278, 50278, 50278, 50278,     0, 28846, 43724,  1194, 11477, 28239,
          4741, 16759,    14,    53,  4237,  1719,   388,  2697,    49, 35573,
           353, 10147,  3123,  3322, 19024, 10197, 38112, 21847,  8875, 50279,
            41,  2697,    49, 35573,    27, 28846, 43724,  1194, 11477, 28239,
          4741, 16759,    14,    53,  4237,  1719,   388,  2697,    49, 35573,
           353, 10147,  3123,  3322, 19024, 10197, 38112, 21847,  8875,   209,
         50277],
        [50278, 5

# Model

In [25]:
from transformers import AutoConfig
# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 5 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(),lr=1e-4) # YOU CAN ADJUST LEARNING RATE
model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [28]:
from tqdm import tqdm,trange

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

Epoch:  20%|██        | 1/5 [00:25<01:43, 25.76s/it]

Average train loss: 1.6917151226202647


Epoch:  40%|████      | 2/5 [00:50<01:15, 25.13s/it]

Average train loss: 1.3129937194983163


Epoch:  60%|██████    | 3/5 [01:14<00:49, 24.52s/it]

Average train loss: 1.121375757733981


Epoch:  80%|████████  | 4/5 [01:38<00:24, 24.35s/it]

Average train loss: 0.9695531621774037


Epoch: 100%|██████████| 5/5 [02:02<00:00, 24.50s/it]

Average train loss: 0.8474584747950236





In [29]:
torch.save(model.state_dict(), "content/AICUP2023/70md.pt")

In [32]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files="content/opendid_test.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
valid_list

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

[{'fid': '1097', 'idx': 1, 'content': '433475.RDC', 'label': None},
 {'fid': '1097', 'idx': 12, 'content': 'Timmins, ELDEN', 'label': None},
 {'fid': '1097', 'idx': 27, 'content': '43J47561,43J47561', 'label': None},
 {'fid': '1097',
  'idx': 46,
  'content': 'Last edited : 7/9/2063  Page: 2',
  'label': None},
 {'fid': '1097', 'idx': 78, 'content': 'CLINICAL:', 'label': None},
 {'fid': '1097',
  'idx': 88,
  'content': 'Metastatic cancer ?colorectal primary.',
  'label': None},
 {'fid': '1097', 'idx': 128, 'content': 'MACROSCOPIC:', 'label': None},
 {'fid': '1097',
  'idx': 141,
  'content': 'Specimen labelled "Omentum secondary", consists of a piece of omentum 120 x 100 x 30mm.',
  'label': None},
 {'fid': '1097',
  'idx': 230,
  'content': 'On sectioning there are multiple fibrotic white ill-defined nodules identified.',
  'label': None},
 {'fid': '1097',
  'idx': 312,
  'content': 'Blocks: 1 to 5 - representative sections from the nodules.',
  'label': None},
 {'fid': '1097',
  'id

In [33]:
from tqdm.notebook import tqdm
from islab.aicup import aicup_predict
import io
BATCH_SIZE = 64

with io.open("content/AICUP2023/answer.txt",'w',encoding='utf8') as f:
#with io.open("answer.txt",'w',encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict(model, tokenizer, input=seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')

  0%|          | 0/1235 [00:00<?, ?it/s]