In [1]:
from pathlib import Path
import os
import pandas as pd
import tokenizers
import transformers

Paths:

In [2]:
dataset_dir = 'B:/source/Datasets'
splits_path = os.path.join(
    dataset_dir,
    'mimic_cxr_jpg',
    'physionet.org',
    'files',
    'mimic-cxr-jpg',
    '2.0.0',
    'mimic-cxr-2.0.0-split.csv',
)
reports_path = os.path.join(dataset_dir, 'mimic_cxr_sections', 'mimic_cxr_sectioned.csv')

Training corpus:

In [3]:
splits = pd.read_csv(splits_path)
reports = pd.read_csv(reports_path)
reports.findings = reports.findings.replace(r'\n', ' ', regex=True)
reports.impression = reports.findings.replace(r'\n', ' ', regex=True)
reports.findings = reports.findings.replace(r'\t', ' ', regex=True)
reports.impression = reports.findings.replace(r'\t', ' ', regex=True)
reports.findings = reports.findings.replace(r'\s{2,}', ' ', regex=True)
reports.impression = reports.findings.replace(r'\s{2,}', ' ', regex=True)
reports.rename(columns={'study': 'study_id'}, inplace=True)
reports.study_id = reports.study_id.str[1:].astype('int32')
df = pd.merge(splits, reports, on='study_id')
df.head()

Unnamed: 0,dicom_id,study_id,subject_id,split,impression,findings,last_paragraph,comparison
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,"There is no focal consolidation, pleural effus...","There is no focal consolidation, pleural effus...",,None.
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train,"There is no focal consolidation, pleural effus...","There is no focal consolidation, pleural effus...",,None.
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train,"The cardiac, mediastinal and hilar contours ar...","The cardiac, mediastinal and hilar contours ar...",,___
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train,"The cardiac, mediastinal and hilar contours ar...","The cardiac, mediastinal and hilar contours ar...",,___
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train,Single frontal view of the chest provided. The...,Single frontal view of the chest provided. The...,,Chest radiograph ___


Use the findings and impression sections from the training set:

In [4]:
reports = df.loc[df.split == 'train'].drop_duplicates(subset=['study_id']).findings.dropna().tolist()
reports += df.loc[df.split == 'train'].drop_duplicates(subset=['study_id']).impression.dropna().tolist()
len(reports)

304346

Constructing a byte-pair BPE tokenizer based on https://huggingface.co/course/chapter6/8?fw=pt#building-a-bpe-tokenizer-from-scratch:

In [5]:
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

Byte level pre-tokenizer:

In [6]:
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer.pre_tokenize_str(reports[408])

[('A', (0, 1)),
 ('Ġsingle', (1, 8)),
 ('Ġportable', (8, 17)),
 ('Ġsemi', (17, 22)),
 ('-', (22, 23)),
 ('erect', (23, 28)),
 ('Ġchest', (28, 34)),
 ('Ġradiograph', (34, 45)),
 ('Ġwas', (45, 49)),
 ('Ġobtained', (49, 58)),
 ('.', (58, 59)),
 ('ĠPulmonary', (59, 69)),
 ('Ġaeration', (69, 78)),
 ('Ġhas', (78, 82)),
 ('Ġdecreased', (82, 92)),
 ('.', (92, 93)),
 ('ĠModerate', (93, 102)),
 ('Ġto', (102, 105)),
 ('Ġlarge', (105, 111)),
 ('Ġlayering', (111, 120)),
 ('Ġright', (120, 126)),
 ('Ġpleural', (126, 134)),
 ('Ġeffusion', (134, 143)),
 ('Ġhas', (143, 147)),
 ('Ġincreased', (147, 157)),
 ('.', (157, 158)),
 ('ĠLoculated', (158, 168)),
 ('Ġintra', (168, 174)),
 ('-', (174, 175)),
 ('abdominal', (175, 184)),
 ('Ġair', (184, 188)),
 ('Ġprojects', (188, 197)),
 ('Ġover', (197, 202)),
 ('Ġthe', (202, 206)),
 ('Ġright', (206, 212)),
 ('Ġlung', (212, 217)),
 ('Ġbase', (217, 222)),
 ('.', (222, 223)),
 ('ĠCentral', (223, 231)),
 ('Ġpulmonary', (231, 241)),
 ('Ġvascular', (241, 250)),
 ('Ġconge

Train tokenizer on corpus:

In [7]:
trainer = tokenizers.trainers.BpeTrainer(special_tokens=['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[MASK]'])
tokenizer.train_from_iterator(reports, trainer)
tokenizer.get_vocab_size()

23084

Byte-level decoder:

In [8]:
tokenizer.decoder = tokenizers.decoders.ByteLevel()

Wrap using HFs PreTrainedTokenizerFast:

In [9]:
tokenizer = transformers.PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token='[UNK]',
    pad_token='[PAD]',
    bos_token='[BOS]',
    cls_token='[BOS]',
    sep_token='[SEP]',
    eos_token='[EOS]',
    mask_token='[MASK]',
)

Save tokenizer:

In [10]:
version = 'bpe_findings_impression'
save_dir = f'B:/work/Checkpoints/mimic-cxr-tokenizers/{version}'

Path(save_dir).mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(save_dir)

('B:/work/Checkpoints/mimic-cxr-tokenizers/bpe_findings_impression\\tokenizer_config.json',
 'B:/work/Checkpoints/mimic-cxr-tokenizers/bpe_findings_impression\\special_tokens_map.json',
 'B:/work/Checkpoints/mimic-cxr-tokenizers/bpe_findings_impression\\tokenizer.json')

Load tokenizer:

In [11]:
loaded_tokenizer = transformers.PreTrainedTokenizerFast.from_pretrained(save_dir)

In [12]:
loaded_tokenizer.all_special_tokens

['[BOS]', '[EOS]', '[UNK]', '[SEP]', '[PAD]', '[MASK]']