In [27]:
import numpy as np
import csv
import pickle as pkl

from pathlib import Path
from tqdm import tqdm
from multiprocessing import Process, Manager

In [28]:
dataset_path = Path('/mnt/209C31C29C3192F0/Datasets/Mimic-CXR/physionet.org/files/mimic-cxr/2.0.0/')

# be careful here, loading both amounts to ~10GB of memory
image_list = list(csv.DictReader(open(dataset_path / 'cxr-record-list.csv', 'r'), delimiter=','))         # list of dictionaries
report_list = list(csv.DictReader(open(dataset_path / 'cxr-study-list.csv', 'r'), delimiter=','))           # list of dictionaries

In [29]:
dataset_length = len(image_list)

n_proc = 6
offset = 0

chunksize = dataset_length // n_proc
proc_slices = []

for i_proc in range(n_proc):
        chunkstart = int(offset + (i_proc * chunksize))
        # make sure to include the division remainder for the last process
        chunkend = int(offset + (i_proc + 1) * chunksize) if i_proc < n_proc - 1 else int(offset + dataset_length)
        proc_slices.append(np.s_[chunkstart:chunkend])

print(f'Number of slices: {len(proc_slices)}\n{proc_slices}')

def process(data, slice, rank):                    # split it up into slices
    # preprocess reports and images
    # iterate through images and find corresponding report
    for image in tqdm(image_list[slice]):
        image_path = image['path']

        # only have p10 from data
        if image_path.find('p10') == -1:
                continue

        # filter out LAT chest X-rays?

        id = image['study_id']
        # find corresponding report
        report_path = None
        for report in report_list:
            if id in report['study_id']:
                report_path = report['path']
        
        if report is None:
                print(f'Not found report for image with path {image_path}, study id: {id} and dicom')
                continue

        entry = {'subject_id': report['subject_id'],
                'study_id': id,
                'dicom_id': image['dicom_id'],
                'report_path': report_path,
                'image_path': image_path,
                }

        data.append(entry)


data = Manager().list()
processes = []
for i in range(n_proc):
        p = Process(target=process, args=(data, proc_slices[i], i))  # Passing the list
        p.start()
        processes.append(p)
for p in processes:
        p.join()

data_list = list(data)
print(len(data_list))
with open(dataset_path / 'images2reports.pkl', 'wb') as f:
        pkl.dump(data_list, f, pkl.DEFAULT_PROTOCOL)

Number of slices: 6
[slice(0, 62851, None), slice(62851, 125702, None), slice(125702, 188553, None), slice(188553, 251404, None), slice(251404, 314255, None), slice(314255, 377110, None)]


100%|██████████| 62851/62851 [00:00<00:00, 1719441.68it/s]

  0%|          | 0/62855 [00:00<?, ?it/s], 1871782.28it/s]
100%|██████████| 62855/62855 [00:00<00:00, 2174041.58it/s]
100%|██████████| 62851/62851 [00:00<00:00, 1594157.14it/s]
100%|██████████| 62851/62851 [07:56<00:00, 131.87it/s]


36681
