## Install and Import Libraries

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whatever `pip` is first on the shell PATH.
%pip install pydicom

In [None]:
# Standard library
import argparse
import glob
import json
import logging as l
import math
import os
import re
import shutil
from collections import Counter
from collections import defaultdict
from glob import glob  # must stay after `import glob`: rebinds `glob` to the function
from os import listdir
from os.path import isfile, join
from pathlib import Path

# Third-party
import cv2
import joblib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import PIL
import pydicom
import pylab
import seaborn as sns
import tensorflow as tf
from PIL import Image
from tqdm import tqdm

## Load and Preprocess Data

In [None]:
#Load and Preprocess labels
label = pd.read_csv("rsna-intracranial-hemorrhage-detection/stage_2_train.csv")
label = label[label['ID'].str.contains('any')].reset_index(drop=True)
label['ID'] = label['ID'].str[:-4]
label['ID'] = label['ID'].astype(str) + '.dcm'

In [None]:
train_images_dir = 'rsna-intracranial-hemorrhage-detection/stage_2_train/'
# listdir() returns entries in arbitrary, platform-dependent order; sort them so
# anything indexing into this list (e.g. the sample preview) is reproducible.
train_images = sorted(f for f in listdir(train_images_dir) if isfile(join(train_images_dir, f)))

### Plot Sample Data

In [None]:
# Preview the first rows*columns training slices as a quick visual sanity check.
columns = 5; rows = 3
fig, axes = plt.subplots(rows, columns, figsize=(20, 12))
# zip() starts at train_images[0] (the previous 1-based index skipped it) and
# simply stops early when fewer than rows*columns files are available instead
# of raising IndexError.
for ax, fname in zip(axes.flat, train_images):
    ds = pydicom.dcmread(join(train_images_dir, fname))
    ax.imshow(ds.pixel_array, cmap=plt.cm.bone)

### Plot ICH vs non-ICH label distribution

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed later, so use whichever name this installation provides.
bright_style = 'seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available else 'seaborn-bright'
plt.style.use(bright_style)

label_counts = label['Label'].value_counts()
ax = label_counts.plot(kind='barh', figsize=(11,5), fontsize=14, color=['C0', 'C2'])
ax.set_alpha(0.8)
ax.set_xlabel("Hemorrhage and No-Hemorrhage Label Counts", fontsize=18)
# Keep the original 720000 limit, but widen it if the largest bar would not fit.
ax.set_xlim(0, max(720000, label_counts.max() * 1.1))

# Name each bar from its actual label value instead of assuming the order in
# which value_counts() happens to return the classes.
display_names = {0: 'No-Hemorrhage', 1: 'Hemorrhage'}
ax.set_yticklabels([display_names.get(value, str(value)) for value in label_counts.index])

# Annotate every bar with its share of the total (sum of all bar widths).
total = sum(bar.get_width() for bar in ax.patches)
for bar in ax.patches:
    # get_width positions the text horizontally; get_y positions it vertically
    ax.text(bar.get_width()+.3, bar.get_y()+.38,
            str(round((bar.get_width()/total)*100, 2))+'%', fontsize=16,
            color='black')
# invert for largest on top
ax.invert_yaxis()
ax.grid()
ax.figure.savefig('hemorrhage_label_plot.png', dpi = 300, bbox_inches='tight')

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test membership (and therefore the files copied and
# converted below) is identical on every Restart & Run All.
SPLIT_SEED = 42

# Hold out 10% of the images, stratified on the binary label. The frame to split
# is `label` (built above); no `df` is defined anywhere in this notebook.
df_train, df_test = train_test_split(
    label, test_size=0.10, stratify=label["Label"], random_state=SPLIT_SEED
)

# File names per split and per class (1 = hemorrhage, 0 = no hemorrhage).
df_train_1 = df_train.loc[df_train['Label'] == 1, 'ID'].to_list()
df_train_0 = df_train.loc[df_train['Label'] == 0, 'ID'].to_list()

df_test_1 = df_test.loc[df_test['Label'] == 1, 'ID'].to_list()
df_test_0 = df_test.loc[df_test['Label'] == 0, 'ID'].to_list()

## Data Processing - Rescale, Resize and Convert to JPG

In [None]:
dir_src = 'rsna-intracranial-hemorrhage-detection/stage_2_train'
dir_dst_train_0 = 'rsna-intracranial-hemorrhage-detection/main_data/train_data_dcm/no_hemorrhage'
dir_dst_train_1 = 'rsna-intracranial-hemorrhage-detection/main_data/train_data_dcm/hemorrhage'

def tranfer_train_data(filelist, source, destination):
    """Copy every file named in ``filelist`` from ``source`` into ``destination``.

    Args:
        filelist: Iterable of file names relative to ``source``.
        source: Directory that currently holds the files.
        destination: Directory to copy into; created (with parents) if missing.

    Raises:
        OSError: If a source file is missing or the copy fails.
    """
    # Without an existing directory shutil.copy would treat `destination` as a
    # file name and overwrite that single file on every iteration.
    os.makedirs(destination, exist_ok=True)
    # Copy the whole list: the previous `filelist[1:]` dropped the first file.
    for file_name in filelist:
        shutil.copy(os.path.join(source, file_name), destination)

In [None]:
# Copy each training class into its own folder: no-hemorrhage first, then hemorrhage.
for class_files, class_dir in (
    (df_train_0, dir_dst_train_0),
    (df_train_1, dir_dst_train_1),
):
    tranfer_train_data(class_files, dir_src, class_dir)

In [None]:
# %load prepare_data.py
import joblib
import PIL
from glob import glob
import pydicom
import numpy as np
import pandas as pd
import os
import cv2
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from PIL import Image
import math
import seaborn as sns
from collections import defaultdict
from pathlib import Path
import cv2
from tqdm import tqdm
import re
import logging as l
from glob import glob
import argparse

def get_first_of_dicom_field_as_int(x):
    """Convert a DICOM field value to ``int``.

    If ``x`` is a pydicom ``MultiValue`` only its first element is converted;
    any other value is passed to ``int()`` directly.
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if isinstance(x, pydicom.multival.MultiValue):
        return int(x[0])
    return int(x)

def get_id(img_dicom):
    """Return the ``SOPInstanceUID`` attribute of ``img_dicom`` as a string."""
    sop_uid = img_dicom.SOPInstanceUID
    return str(sop_uid)

def get_metadata_from_dicom(img_dicom):
    """Collect the windowing/rescale fields of ``img_dicom`` as integers.

    Returns a dict with the keys ``window_center``, ``window_width``,
    ``intercept`` and ``slope``; each value is passed through
    ``get_first_of_dicom_field_as_int``.
    """
    # Read every attribute first, then convert, so a missing attribute is
    # reported before any conversion is attempted.
    raw_fields = [
        ("window_center", img_dicom.WindowCenter),
        ("window_width", img_dicom.WindowWidth),
        ("intercept", img_dicom.RescaleIntercept),
        ("slope", img_dicom.RescaleSlope),
    ]
    converted = {}
    for key, raw_value in raw_fields:
        converted[key] = get_first_of_dicom_field_as_int(raw_value)
    return converted

def window_image(img, window_center, window_width, intercept, slope):
    """Apply the linear rescale and clamp the result to an intensity window.

    Args:
        img: Array-like of raw pixel values; it is not modified.
        window_center: Centre of the window, in rescaled units.
        window_width: Full width of the window, in rescaled units.
        intercept: Offset added after scaling.
        slope: Factor the raw values are multiplied by.

    Returns:
        A new array holding ``img * slope + intercept`` limited to
        ``[window_center - window_width // 2, window_center + window_width // 2]``.
        Integer input yields an int64 array, floating input a float64 array.
    """
    img = np.asarray(img)
    # Widen before doing arithmetic: in the stored 16-bit dtype the rescale can
    # wrap around (int16) or, under NumPy 2 promotion rules, raise OverflowError
    # when a negative intercept meets unsigned pixel data.
    img = img.astype(np.result_type(img.dtype, np.int64))
    img = img * slope + intercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    return np.clip(img, img_min, img_max)

def resize(img, new_w, new_h):
    """Convert ``img`` to an 8-bit greyscale PIL image and resize it.

    Args:
        img: Array of pixel values, expected to already lie in 0..255.
        new_w: Target width in pixels.
        new_h: Target height in pixels.

    Returns:
        The resized PIL image (bicubic resampling).
    """
    # Mode "L" is unsigned 8-bit; casting to the signed int8 cannot represent
    # 128..255, so use uint8.
    img = PIL.Image.fromarray(img.astype(np.uint8), mode="L")
    return img.resize((new_w, new_h), resample=PIL.Image.BICUBIC)

def save_img(img_pil, subfolder, name):
    """Save ``img_pil`` as ``<subfolder><name>.jpg``.

    ``subfolder`` is concatenated verbatim, so it must already end with a
    path separator if it is meant to be a directory.
    """
    target_path = "".join((subfolder, name, '.jpg'))
    img_pil.save(target_path)

def normalize_minmax(img):
    """Linearly rescale ``img`` so its minimum maps to 0 and its maximum to 1.

    Args:
        img: NumPy array of pixel values.

    Returns:
        Array of the same shape. A constant image (max == min) has no range to
        stretch, so it is returned as all zeros instead of dividing by zero.
    """
    mi, ma = img.min(), img.max()
    span = ma - mi
    if span == 0:
        # Avoid 0/0 -> NaN for flat images (e.g. a slice entirely outside the window).
        return np.zeros_like(img, dtype=np.float64)
    return (img - mi) / span

def prepare_image(img_path):
    """Read one DICOM file and turn it into an 8-bit greyscale PIL image.

    The pixel data is windowed with the file's own window/rescale metadata,
    min-max normalised and scaled to 0..255.

    Args:
        img_path: Path of the DICOM file.

    Returns:
        Tuple ``(img_id, img)`` of the file's SOP Instance UID string and the
        PIL image in mode "L".
    """
    # dcmread is the supported reader; read_file was a deprecated alias that
    # newer pydicom releases no longer provide.
    img_dicom = pydicom.dcmread(img_path)
    img_id = get_id(img_dicom)
    metadata = get_metadata_from_dicom(img_dicom)
    img = window_image(img_dicom.pixel_array, **metadata)
    img = normalize_minmax(img) * 255
    # Mode "L" is unsigned 8-bit, so cast to uint8 (int8 cannot hold 128..255).
    img = PIL.Image.fromarray(img.astype(np.uint8), mode="L")
    return img_id, img

def prepare_and_save(img_path, subfolder):
    """Convert one DICOM file and save it as a JPG, logging failures.

    Processing is best-effort: an error on this file is logged (with its
    traceback) and swallowed so a batch run can continue with the next file.

    Args:
        img_path: Path of the DICOM file to convert.
        subfolder: Output location prefix handed to ``save_img``.
    """
    try:
        img_id, img_pil = prepare_image(img_path)
        save_img(img_pil, subfolder, img_id)
    except Exception:
        # Catch Exception only: KeyboardInterrupt and SystemExit are not
        # subclasses of it, so interrupting the cell or exiting still works.
        l.exception('Error processing the image: {%s}', img_path)

def prepare_images(imgs_path, subfolder):
    """Convert every DICOM path in ``imgs_path`` sequentially with a progress bar.

    Args:
        imgs_path: Iterable of DICOM file paths.
        subfolder: Output location prefix handed to ``prepare_and_save``.
    """
    # `tqdm` is imported as `from tqdm import tqdm`, i.e. the name is already the
    # progress-bar class; `tqdm.tqdm(...)` would raise AttributeError.
    for i in tqdm(imgs_path):
        prepare_and_save(i, subfolder)

def prepare_images_njobs(img_paths, subfolder, n_jobs=-1):
    """Convert every DICOM path in ``img_paths`` through a joblib worker pool.

    Args:
        img_paths: Iterable of DICOM file paths.
        subfolder: Output location prefix handed to ``prepare_and_save``.
        n_jobs: Passed to ``joblib.Parallel`` unchanged.
    """
    pool = joblib.Parallel(n_jobs=n_jobs)
    # One delayed prepare_and_save call per path; tqdm reports dispatch progress.
    jobs = (
        joblib.delayed(prepare_and_save)(dicom_path, subfolder)
        for dicom_path in tqdm(img_paths)
    )
    pool(jobs)

if __name__ == '__main__':
    # Command-line entry point: convert every file in --dcm_path to a JPG in --jpg_path.
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Both paths are mandatory; without required=True a missing option became
    # None and crashed later inside os.path.exists().
    parser.add_argument("-dcm_path", "--dcm_path", type=str, required=True,
                        help="directory containing the source DICOM files")
    parser.add_argument("-jpg_path", "--jpg_path", type=str, required=True,
                        help="directory the JPG files are written to (created if missing)")
    args = parser.parse_args()
    dcm_path = args.dcm_path
    jpg_path = args.jpg_path

    # exist_ok avoids the check-then-create race when two conversions start together.
    os.makedirs(jpg_path, exist_ok=True)

    prepare_images_njobs(glob(dcm_path+'/*'), jpg_path+'/')

In [None]:
%%capture
# Run prepare_data.py once per class folder to convert the copied DICOM files to JPG.
# NOTE: %%capture hides everything this cell prints, including errors logged by the script.
!python3 prepare_data.py -dcm_path 'rsna-intracranial-hemorrhage-detection/main_data/train_data_dcm/no_hemorrhage' -jpg_path 'rsna-intracranial-hemorrhage-detection/main_data/train_data/no_hemorrhage'
!python3 prepare_data.py -dcm_path 'rsna-intracranial-hemorrhage-detection/main_data/train_data_dcm/hemorrhage' -jpg_path 'rsna-intracranial-hemorrhage-detection/main_data/train_data/hemorrhage'

In [None]:
%%capture

# Sync the converted JPG training folder to S3.
# `<S3 BUCKET>` is a placeholder: replace it with the destination before running this cell.
!aws s3 sync rsna-intracranial-hemorrhage-detection/main_data/train_data <S3 BUCKET> 