## DCMtoCocoUtility02  

#### Notes
- Added all available classes from the "Study" level data.  (060321)  
Issue: Many image-level records do not have bounding boxes even though they are positive; this needs further exploration.  
Issue: Duplicate records exist (this seems to be a known issue on Kaggle).

Based on following notebooks:  
- https://www.kaggle.com/bnapora/siim-covid-19-convert-to-jpg-256px  
- https://www.kaggle.com/bnapora/siim-covid-19-resize-process-coco-dataset


In [1]:
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path

In [2]:
# Notebook-wide settings. `size` is the edge length used to name the output
# folder and, further down, to scale bounding boxes.
params = {'size': 256}

# Laptop Workstation
path_dicom = Path('/host_Data/DataSets/Kaggle-SIIM Covid 19/siim-covid19-detection')

# AZ Server
# path_dicom = Path('/workspace/WSI/SIIM-Covid19-Detection')

# All generated artefacts (PNGs, meta CSVs, COCO json) live in a sub-folder of
# the dataset root whose name encodes the target size.
path_output_patches = path_dicom / f"DS-COCO_{params['size']}"

In [3]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixels as an 8-bit array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Location of the DICOM file, passed straight to pydicom.
        voi_lut: When True, apply the VOI LUT (if available by DICOM device)
            to transform raw DICOM data to a "human-friendly" view.
        fix_monochrome: When True, invert images whose
            PhotometricInterpretation is "MONOCHROME1" so they do not look
            inverted.

    Returns:
        numpy array of dtype uint8, min-max scaled to the 0..255 range.
    """
    # `pydicom.read_file` is a deprecated alias that newer pydicom releases
    # no longer provide; `dcmread` is the supported spelling.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the darkest pixel is 0, then scale the brightest pixel to 1.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    # else: the image is constant; it is already all zeros after the shift, and
    # dividing by zero would only produce NaNs.
    data = (data * 255).astype(np.uint8)

    return data

In [4]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image of at most `size` x `size` pixels.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel data accepted by `Image.fromarray`.
        size: Target edge length in pixels, used for both dimensions.
        keep_ratio: If True, shrink with `thumbnail` (aspect ratio preserved);
            otherwise force the image to exactly `size` x `size`.
        resample: PIL resampling filter.

    Returns:
        The resized `PIL.Image.Image`.
    """
    image = Image.fromarray(array)
    target = (size, size)

    if not keep_ratio:
        return image.resize(target, resample)

    # `thumbnail` works in place and returns None, so hand back the same object.
    image.thumbnail(target, resample)
    return image

In [6]:
# Load one sample training DICOM. Neither `path` nor `dicom` is read by any
# later cell shown in this notebook -- presumably kept for ad-hoc inspection.
path = path_dicom / 'train/ae3e63d94c13/288554eb6182/e00f9fe0cce5.dcm'
# `dcmread` replaces the deprecated `read_file` alias.
dicom = pydicom.dcmread(path)

In [9]:
# Resize images and create annotation meta
#
# For each split, every .dcm file under <path_dicom>/<split> is converted to an
# 8-bit PNG of params['size'] x params['size'] pixels, and a <split>_meta.csv
# describing the source images is written next to the PNG folders.
# splits = []

for split in ['test', 'train']:
    split_dir = path_dicom / split
    save_dir = path_output_patches / split

    os.makedirs(save_dir, exist_ok=True)
    # Show the directory that is actually walked below.
    print(split_dir)

    # Start from empty lists for every split; otherwise the second split's
    # meta CSV would also contain all rows of the first split.
    image_ids = []
    folder_ids = []
    study_ids = []
    widths = []
    heights = []

    for dirname, _, filenames in tqdm(os.walk(str(split_dir))):
        for file in filenames:
            # Anything that is not a DICOM file cannot be parsed by read_xray.
            if not file.endswith('.dcm'):
                continue

            path_file = Path(dirname) / file

            # set keep_ratio=True to have original aspect ratio
            xray = read_xray(str(path_file))
            im = resize(xray, size=params['size'])

            # Expected layout: <split>/<study_id>/<folder_id>/<image>.dcm
            study_id = path_file.parent.parent.name
            folder_id = path_file.parent.name
            image_name = path_file.name.replace('.dcm', '_image')

            im.save(os.path.join(save_dir, image_name+'.png'))

            image_ids.append(image_name)
            folder_ids.append(folder_id)
            study_ids.append(study_id)
            # NOTE: numpy arrays are indexed (rows, columns), so 'width' ends up
            # holding the row count and 'height' the column count. The
            # bounding-box scaling cell further down divides x coordinates by
            # 'height' and y coordinates by 'width', i.e. it relies on this
            # swap. The two must only ever be changed together.
            widths.append(xray.shape[0])
            heights.append(xray.shape[1])

    df = pd.DataFrame.from_dict({'id': image_ids, 'folder_id': folder_ids,
                                 'study_id': study_ids, 'width': widths,
                                 'height': heights})
    df.to_csv(path_output_patches / f'{split}_meta.csv', index=False)

/host_Data/DataSets/Kaggle-SIIM Covid 19/siim-covid19-detection/siim-covid19-detection/test


0it [00:00, ?it/s]

/host_Data/DataSets/Kaggle-SIIM Covid 19/siim-covid19-detection/siim-covid19-detection/train


0it [00:00, ?it/s]



In [None]:
# %%time
# !tar -zcf train.tar.gz -C path_output_patches / "train/" .
# !tar -zcf test.tar.gz -C path_output_patches / "test/" .

In [None]:
# df = pd.DataFrame.from_dict({'image_id': image_id, 'dim0': dim0, 'dim1': dim1, 'split': splits})
# df.to_csv('meta.csv', index=False)

## Load & Merge Study, Image, Meta CSVs

In [10]:
# Image-level labels shipped with the competition data (one row per image).
df_train_image = pd.read_csv(path_dicom.joinpath("train_image_level.csv"))
# df_train_image.head(3)

In [11]:
# Study-level labels. The raw 'id' values carry a '_study' suffix; strip it into
# a 'StudyInstanceUID' column so the frame can be joined with the image-level
# table, then discard the original 'id' column.
df_train_study = pd.read_csv(path_dicom.joinpath("train_study_level.csv"))
df_train_study['StudyInstanceUID'] = [
    study_id.replace('_study', '') for study_id in df_train_study['id']
]
df_train_study = df_train_study.drop(columns='id')
# df_train_study.head(3)

In [12]:
# Image-level rows joined with their study-level labels (inner join on the
# study id). NOTE(review): `df_image_study` is not read by any later cell shown
# in this notebook.
df_image_study = pd.merge(df_train_image, df_train_study, on='StudyInstanceUID')

In [14]:
# Per-image meta (id, folder_id, study_id, width, height) written by the
# DICOM -> PNG conversion cell.
df_train_meta = pd.read_csv(path_output_patches.joinpath("train_meta.csv"))
# df_train_meta.head(3)

In [15]:
# NOTE(review): this re-reads the same train_meta.csv as the previous cell and
# rebinds the same name; the result is identical.
df_train_meta = pd.read_csv(path_output_patches.joinpath("train_meta.csv"))
# df_train_meta.head(3)

Unnamed: 0,id,folder_id,study_id,width,height
0,3dcdfc352a06_image,3eb5a506ccf3,00188a671292,3480,4248
1,c39146cbda47_image,e7d024ea82d7,004bd59708be,3488,4256
2,951211f8e1bb_image,d39fc1121992,00508faccd39,2320,2832


In [16]:
# Merge Train_Meta with Train_Image: attach folder/study ids and the original
# image dimensions to every image-level label row (inner join on 'id').
df_train = pd.merge(df_train_image, df_train_meta, on='id')
# df_train.head(3)

Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,5776db0cec75,3488,4256
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed,d8a644cc4f93,ff0879eb20ed,2320,2832
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,9d514ce429a7,2544,3056


In [17]:
# Merge Train_Study into Train_Image & Train_Meta: add the study-level class
# columns to every image row (inner join on 'StudyInstanceUID').
df_image_study_meta = pd.merge(df_train, df_train_study, on='StudyInstanceUID')
# df_image_study_meta.head(3)

## Meta-data Clean  
#### Remove Studies with Multiple Images, Convert NaN to 0

In [18]:
# Count how many images each study has and attach that count to every row as
# 'StudyInstanceUID_count'.
group_col = 'StudyInstanceUID'
count_col = f'{group_col}_count'
df = df_image_study_meta.groupby(group_col)['id'].count().to_frame(name=count_col)
df_image_study_meta = pd.merge(df_image_study_meta, df.reset_index(), on=group_col)

# Rows belonging to studies with more than one image; printed for reference.
has_multiple_images = df_image_study_meta[count_col] > 1
one_study_multi_image_df = df_image_study_meta[has_multiple_images]
print(len(one_study_multi_image_df))

# Keep only studies that consist of exactly one image.
df_image_study_meta = df_image_study_meta[df_image_study_meta[count_col] == 1]
one_study_multi_image_df.head(3)

512


Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,StudyInstanceUID_count
22,00c1515729a8_image,,none 1 0 0 1 1,1a58b43cf286,461096d084ba,1a58b43cf286,2539,3050,0,1,0,0,2
23,cada5310214b_image,"[{'x': 981.75034, 'y': 256.08181, 'width': 225...",opacity 1 981.75034 256.08181 1207.54099 765.2...,1a58b43cf286,0704ff4c28b5,1a58b43cf286,1140,1387,0,1,0,0,2
25,00e3a7e91a34_image,,none 1 0 0 1 1,74ba8f2badcb,c8755f476425,74ba8f2badcb,1760,2140,1,0,0,0,4


In [19]:
# Fill NaN values in the frame with 0. The bounding-box cell below tests
# `row.boxes != 0` to recognise images that have no boxes.
df_image_study_meta = df_image_study_meta.fillna(value=0)

## Create Record for Each Bounding Box

In [20]:
# Study-level one-hot columns that encode the positive classes.
class_names = [
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]

# Category id -> short label.
num2label = dict(enumerate(['Negative', 'Typical', 'Indeterminate', 'Atypical'], start=1))

# str() of a row's one-hot values (ordered as in `class_names`) -> category id.
# All zeros means none of the three positive classes is set.
classdata2num = {
    str([0, 0, 0]): 1,
    str([1, 0, 0]): 2,
    str([0, 1, 0]): 3,
    str([0, 0, 1]): 4,
}

# Alias used when the COCO "categories" section is generated.
classmap = num2label

In [23]:
import cv2  # not used by this cell any more; import kept so the notebook's dependencies are unchanged
import ast

# Expand the per-image table into one record per bounding box, with the box
# converted from (x, y, width, height) in original pixels to
# (xmin, ymin, xmax, ymax) in the resized image. Images without boxes get a
# single placeholder box.
new_size = (params['size'], params['size'])

processed_columns = [
    'id', 'StudyInstanceUID', 'folder_id', 'study_id', 'width', 'height',
    'xmin', 'ymin', 'xmax', 'ymax', 'class_id', 'class_label',
    'Negative for Pneumonia', 'Typical Appearance',
    'Indeterminate Appearance', 'Atypical Appearance',
]
# Collect plain dicts and build the frame once at the end; concatenating a
# one-row frame per box copies the whole table on every iteration.
records = []

for idx, row in tqdm(df_image_study_meta.iterrows(), total=df_image_study_meta.shape[0]):
    if row.boxes != 0:
        # Relies on each box dict listing its values in the order
        # x, y, width, height -- TODO confirm against the raw CSV.
        bboxes = [list(bbox.values()) for bbox in ast.literal_eval(row.boxes)]
    else:
        # Missing boxes were replaced by 0 in the NaN-fill cell.
        bboxes = [[0,0,1,1]]

    # NOTE: the meta CSV stores xray.shape[0] (rows) as 'width' and
    # xray.shape[1] (columns) as 'height', so x coordinates are scaled with
    # row.height and y coordinates with row.width. Do not "fix" one side only.
    x_scale = new_size[0] / row.height
    y_scale = new_size[1] / row.width

    # The class is a property of the study, so resolve it once per image.
    # int() makes the lookup key independent of the numeric dtype.
    class_key = str([int(row[name]) for name in class_names])
    classid = classdata2num[class_key]
    classlabel = num2label[classid]

    for x, y, box_w, box_h in bboxes:
        records.append({
            'id': row.id,
            'StudyInstanceUID': row.StudyInstanceUID,
            'folder_id': row.folder_id,
            'study_id': row.study_id,
            'width': row.width,
            'height': row.height,
            'xmin': round(x * x_scale),
            'ymin': round(y * y_scale),
            'xmax': round((box_w + x) * x_scale),
            'ymax': round((box_h + y) * y_scale),
            'class_id': classid,
            'class_label': classlabel,
            'Negative for Pneumonia': row['Negative for Pneumonia'],
            'Typical Appearance': row['Typical Appearance'],
            'Indeterminate Appearance': row['Indeterminate Appearance'],
            'Atypical Appearance': row['Atypical Appearance'],
        })

# Passing the column list keeps the frame well-formed even with zero records.
df_train_processed = pd.DataFrame(records, columns=processed_columns)
df_train_processed.head(3)

  0%|          | 0/5822 [00:00<?, ?it/s]

Unnamed: 0,id,StudyInstanceUID,folder_id,study_id,width,height,xmin,ymin,xmax,ymax,class_id,class_label,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,3488,4256,47,43,109,183,2,Typical,0,1,0,0
1,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,3488,4256,135,43,201,173,2,Typical,0,1,0,0
2,000c3a3f293f_image,ff0879eb20ed,d8a644cc4f93,ff0879eb20ed,2320,2832,0,0,0,0,1,Negative,1,0,0,0


In [24]:
# Persist the one-row-per-box table, then show its (rows, columns) shape.
processed_meta_path = path_output_patches / "df_train_processed_meta.csv"
df_train_processed.to_csv(processed_meta_path, index=False)
df_train_processed.shape

(9225, 16)

## Additional Data Clean - Remove No BBox Records

In [25]:
# Reload the per-box annotation table from disk.
df_annotations = pd.read_csv(path_output_patches.joinpath("df_train_processed_meta.csv"))
print('Annotation Count=', df_annotations.shape[0])
# df_annotations.sample(100)

Annotation Count= 9225


In [26]:
#Load Unique Image ID's and Remove Images with No BBoxes
print('Cnt of df_annotations=', len(df_annotations))
df_BBoxes_True = df_annotations.loc[df_annotations['xmin'] != 0]
print('Cnt of df_BBoxes_True=', len(df_BBoxes_True))

Cnt of df_annotations= 9225
Cnt of df_BBoxes_True= 7504


## Generate COCO Dataset

In [27]:
# Distinct ids of the images that still have at least one box after filtering.
train_img_ids = pd.unique(df_BBoxes_True['id'])
print(f"Train image count: {len(train_img_ids)}")

Train image count: 4115


In [37]:
import datetime
import json

# Timestamp recorded in the COCO "info" section.
now = datetime.datetime.now()

# Empty COCO skeleton; the images / annotations / categories lists are filled
# in by the cells that follow.
data = {
    'info': {
        'description': 'SIIM Covid-19 GroupKfold',
        'url': None,
        'version': None,
        'year': now.year,
        'contributor': None,
        'date_created': now.strftime('%Y-%m-%d %H:%M:%S.%f'),
    },
    'licenses': [
        {'url': None, 'id': 0, 'name': None},
    ],
    # entries: license, url, file_name, height, width, date_captured, id
    'images': [],
    'type': 'instances',
    # entries: segmentation, area, iscrowd, image_id, bbox, category_id, id
    'annotations': [],
    # entries: supercategory, id, name
    'categories': [],
}

In [30]:
#Generate Categories section of COCO
for key, value in classmap.items():
    data['categories'].append(dict(
        supercategory=None,
        id=key,
        name=value,
    ))
    
# data

In [34]:
# Every exported PNG is square, so height and width are both the target size.
H = W = params['size']

## Setting the output annotation json file paths
train_out_file = f"coco_train_annotations_{params['size']}px.json"

In [35]:
#Generate images and annotations sections of COCO
data_train = data.copy()
data_train['images'] = []
data_train['annotations'] = []

for i, img_id in tqdm(enumerate(train_img_ids), total=len(train_img_ids)):

    data_train['images'].append(dict(license=0,
                                     url=None,
                                     file_name=img_id+'.png',
                                     height=H,
                                     width=W,
                                     date_captured=None,
                                     id=i
                                    ))

    img_annotations = df_annotations[df_annotations.id==img_id]

    for idx, row in img_annotations.iterrows():
        x_min, y_min, x_max, y_max = [row['xmin'], row['ymin'], row['xmax'], row['ymax']]
        area = round((x_max-x_min)*(y_max-y_min),1)
        bbox =[
                int(x_min),
                int(y_min),
                int(x_max-x_min),
                int(y_max-y_min)
                ]

        data_train['annotations'].append(dict(id=len(data_train['annotations']),
                                              image_id=i,
                                              category_id=int(row['class_id']),
                                              area=int(area),
                                              bbox=bbox,
                                              iscrowd=0)) 

  0%|          | 0/4115 [00:00<?, ?it/s]

In [38]:
# Write the finished COCO structure next to the exported images.
coco_json_path = path_output_patches / train_out_file
with coco_json_path.open('w') as f:
    json.dump(data_train, f, indent=4)