In [None]:
# This file is a modified version of the original:
# https://github.com/dangnh0611/kaggle_rsna_breast_cancer/blob/reproduce/src/tools/prepair_roi_det_dataset.py

In [None]:
# pip3 install -q yacs
# pip3 install -q dicomsdl

# https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html
# pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda110

# pip3 install -q tensorrt # installing this package takes approx. 25 minutes.

In [7]:
"""
Prepare YOLOX detection dataset.
- Convert competition's raw dicom to png
- Convert YOLOv5 format --> COCO format
"""

import argparse
import os
import shutil
import sys

import cv2
from yolov5_2_coco import YOLOV5ToCOCO

from settings import SETTINGS
import misc as misc_utils
from dicom import convert_with_dicomsdl_parallel


In [13]:
def parse_args(argv=None):
    """Parse the options of the ROI detection dataset preparation script.

    Options that this parser does not define are ignored instead of causing
    an error, so the function can also be called from an environment whose
    ``sys.argv`` holds unrelated arguments (e.g. a notebook kernel).

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads the arguments from ``sys.argv[1:]``,
            which is the behaviour of the original zero-argument call.

    Returns:
        argparse.Namespace with one attribute, ``num_workers`` (int,
        default 11): the number of workers for dicomsdl decoding.
    """
    parser = argparse.ArgumentParser(
        description='Prepair YOLOX ROI detection dataset in COCO format')
    parser.add_argument('--num-workers',
                        type=int,
                        default=11,
                        help='Number of workers for dicomsdl decoding.')

    # parse_known_args() returns (namespace, leftover_args); the leftovers
    # are discarded on purpose so unrelated arguments never abort the run.
    args, _ = parser.parse_known_args(argv)
    return args

In [9]:
# Show the resolved project settings (asset, checkpoint, data and temp directories)
# so the paths used by the following cells can be checked before anything is written.
print(SETTINGS)

ASSETS_DIR: ./assets/
MODEL_CHECKPOINT_DIR: ./checkpoints/
MODEL_FINAL_SELECTION_DIR: ./assets/reproduce/
PROCESSED_DATA_DIR: ./datasets/processed/
RAW_DATA_DIR: 
SUBMISSION_DIR: ./submissions/
TEMP_DIR: ./tmp/
__JSON_PATH__: /media/na/e0adac50-20ce-4eb4-9c9d-98faf82ddd46/rsna_breast/SETTINGS.json


In [38]:
def _collect_conversion_jobs(yolov5_dir, dcm_dir, splits=('train', 'val')):
    """Build the parallel lists of source DICOM paths and target PNG paths.

    For every split, ``{yolov5_dir}/{split}.txt`` is read. Each non-empty line
    is an image path such as ``images/10042@495770405.png``; its base name
    encodes ``{patient_id}@{image_id}`` before the first dot.

    Args:
        yolov5_dir: Root of the YOLOv5-format dataset holding the split lists.
        dcm_dir: Directory with one sub-directory of ``.dcm`` files per patient.
        splits: Names of the split list files (without ``.txt``) to read.

    Returns:
        Tuple ``(dcm_paths, save_paths)`` of equally long lists, where
        ``dcm_paths[i]`` is ``{dcm_dir}/{patient_id}/{image_id}.dcm`` and
        ``save_paths[i]`` is ``{yolov5_dir}/images/{file name}``.

    Raises:
        ValueError: If a listed file name is not of the form
            ``{patient_id}@{image_id}.<ext>``.
    """
    dcm_paths = []
    save_paths = []

    for split in splits:
        txt_list_path = os.path.join(yolov5_dir, f'{split}.txt')
        with open(txt_list_path, 'r') as f:
            # Strip surrounding whitespace so padded or whitespace-only lines
            # do not turn into bogus file names.
            entries = [line.strip() for line in f]

        for entry in entries:
            if not entry:
                continue
            name = os.path.basename(entry)
            # Everything before the first dot is '{patient_id}@{image_id}'.
            id_parts = name.split('.')[0].split('@')
            if len(id_parts) != 2:
                raise ValueError(
                    f'Unexpected image name {name!r} in {txt_list_path}: '
                    'expected "{patient_id}@{image_id}.<ext>"')
            patient_id, image_id = id_parts

            dcm_paths.append(
                os.path.join(dcm_dir, patient_id, f'{image_id}.dcm'))
            save_paths.append(os.path.join(yolov5_dir, 'images', name))

    return dcm_paths, save_paths


def main(args):
    """Prepare the YOLOX ROI detection dataset in COCO format.

    Steps:
      1. Copy the manually annotated breast ROI boxes (YOLOv5 format) from
         ``{ASSETS_DIR}/data/roi_det_yolov5_format`` to
         ``{PROCESSED_DATA_DIR}/roi_det_yolox/yolov5_format``.
      2. Convert every DICOM listed in ``train.txt`` / ``val.txt`` to a PNG in
         the ``images`` folder of that copy.
      3. Convert the YOLOv5-format dataset to COCO format in
         ``{PROCESSED_DATA_DIR}/roi_det_yolox/coco_format``.

    The competition files are expected under
    ``{RAW_DATA_DIR}/rsna-breast-cancer-detection/train_images``.

    Args:
        args: Namespace providing ``num_workers``, the number of parallel
            jobs used for the DICOM decoding.
    """
    asset_yolov5_dir = os.path.join(SETTINGS.ASSETS_DIR, 'data',
                                    'roi_det_yolov5_format')
    kaggle_dcm_dir = os.path.join(SETTINGS.RAW_DATA_DIR,
                                  'rsna-breast-cancer-detection',
                                  'train_images')
    roi_yolov5_dir = os.path.join(SETTINGS.PROCESSED_DATA_DIR,
                                  'roi_det_yolox', 'yolov5_format')
    roi_coco_dir = os.path.join(SETTINGS.PROCESSED_DATA_DIR,
                                'roi_det_yolox', 'coco_format')

    # Reset the parent 'roi_det_yolox' directory, then copy the annotations.
    # NOTE(review): rm_and_mkdir presumably deletes an existing directory and
    # recreates it empty, which would also drop a previous 'coco_format'
    # output -- confirm against misc.rm_and_mkdir.
    misc_utils.rm_and_mkdir(os.path.dirname(roi_yolov5_dir))
    print(f'Copy from {asset_yolov5_dir} --> {roi_yolov5_dir}')
    shutil.copytree(asset_yolov5_dir, roi_yolov5_dir)

    # (Re)create the image folders inside the copied dataset; 'images' is
    # filled by the DICOM conversion below.
    misc_utils.rm_and_mkdir(os.path.join(roi_yolov5_dir, 'images'))
    misc_utils.rm_and_mkdir(os.path.join(roi_yolov5_dir, 'background_images'))

    dcm_paths, save_paths = _collect_conversion_jobs(roi_yolov5_dir,
                                                     kaggle_dcm_dir,
                                                     splits=('train', 'val'))
    assert len(dcm_paths) == len(save_paths)
    print('Total:', len(dcm_paths))

    print('Converting dicom to png..')
    # convert dicom to png (full resolution)
    convert_with_dicomsdl_parallel(dcm_paths,
                                   save_paths,
                                   normalization='min_max',
                                   save_backend='cv2',
                                   save_dtype='uint8',
                                   parallel_n_jobs=args.num_workers,
                                   joblib_backend='loky',
                                   legacy=True)

    print('Converting YOLOv5 format to COCO format..')
    # Tool for converting a YOLOv5-format dataset to COCO format:
    # https://github.com/RapidAI/YOLO2COCO/blob/main/yolov5_2_coco.py
    yolov5_to_coco_converter = YOLOV5ToCOCO(src_dir=roi_yolov5_dir,
                                            dst_dir=roi_coco_dir)
    yolov5_to_coco_converter(mode_list=['train', 'val'])


In [22]:
# Sanity check: show the module name seen by this kernel. The entry-point guard
# (`if __name__ == '__main__':`) only runs when this prints '__main__'.
print(__name__)

__main__


In [42]:
# Entry point: parse the options, then run the whole dataset preparation.
# `args` stays a module-level name; a later cell prints it.
if __name__ == '__main__':
    args = parse_args()
    main(args)

Total: 571
Converting dicom to png..
Starting 11 jobs with backend `loky`


100%|██████████████████████████████████| 571/571 [00:54<00:00, 10.40it/s]


Converting YOLOv5 format to COCO format..


train: 100%|███████████████████████████| 521/521 [01:49<00:00,  4.74it/s]
val: 100%|███████████████████████████████| 50/50 [00:10<00:00,  4.81it/s]

Successfully convert, detail in datasets/processed/roi_det_yolox/coco_format





In [15]:
# Inspect the parsed options. Relies on `args` having been assigned by the
# entry-point cell, so that cell must be run first.
print(args)

Namespace(num_workers=11)
