In [None]:
# Convert chest CT scans from a set of .png images to .nii files
# 
# This should only be done the first time. 

In [None]:
# The CLEAN-CC-CCII dataset is available at the following URL: https://github.com/HKBU-HPML/HKBU_HPML_COVID-19 

In [None]:
import os
def png_to_nii(in_path, out_path):
    paths=[x[0] for x in os.walk(in_path)]

    i=0
    scan_name=''
    for path in paths:
        file_names=list()
        for File in os.listdir(path):
            if File.endswith(".png"): 
                file_names.append(path+os.path.sep+File)
        if len(file_names)!=0: 
            i+=1
            l=path[path.rfind(os.path.sep)+1:]
            path=path[:path.rfind(os.path.sep)]
            f=path[path.rfind(os.path.sep)+1:]
            scan_name=f+'_'+l
            #file_names = glob.glob('*.png')
            try:
                reader = sitk.ImageSeriesReader()
                reader.SetFileNames(file_names)
                vol = reader.Execute()
                sitk.WriteImage(vol, os.path.join(out_path, scan_name+".nii.gz")
            except: print (scan_name)

# replace with your directories with the paths of the CLEAN-CC-CCII dataset and the path where save the nii files
                                
in_paths=["/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/Normal/",
          "/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/NCP/",
          "/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/CP/"
         ]
out_paths=["/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/nii/Normal/",
           "/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/nii/NCP/",
           "/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/nii/CP/"
          ]

for i in range(len(in_paths)):
    png_to_nii(in_path[i], out_path[i])                               
                                

In [None]:
# Pre-process the chest CT scans (.nii files) with the aim to resize them and remove irrelevant contents
# The following will generate the training/validation/test set for a k-fold cross validation strategy according
# to the required CT scan depth.
# Datasets will be saved as numpy.

In [1]:
import os
import zipfile
import numpy as np
from numpy.random import default_rng
import nibabel as nib
from scipy import ndimage
import random
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from scipy.ndimage import zoom
import tensorflow as tf
import math


In [5]:
# set the parameters to resize the chest CT scans

In [2]:
new_width=256   # width of the slices after pre-processing
new_height=256  # height for the slices after pre-processing
new_depth=25    # number of slices of the CT after pre-processing

In [7]:
# define pre-processing steps for the CT scans

In [3]:
def read_nifti_file(filepath):
    """Read and load volume"""
    # Read file
    scan = nib.load(filepath)
    # Get raw data
    scan = scan.get_fdata()
    return scan

# upsample the scan at borders
def upsample_borders(img, current_depth, new_depth):
    """Upsample scans by repeating slices at both extremity"""
    up = new_depth - current_depth
    upl = round(up/2)
    upr = new_depth - (current_depth + upl)
    idx_l = [0] * upl
    idx_r = [img.shape[2]-1] * upr
    slices=np.array(range(current_depth))
    slices=np.hstack((slices, np.array(idx_r, dtype=int), np.array(idx_l, dtype=int)))
    slices.sort(axis=0)
    slices=np.asarray(slices)
    img=img[:,:,slices]
    
    return img

# downsample the scan at borders
def downsample_borders(img, current_depth, new_depth):
    """Downsample scans by repeating slices at both extremity"""
    down = current_depth - new_depth
    downl = round(down/2)
    downr = current_depth - (new_depth + downl)
    slices = np.array(range(current_depth))
    slices = slices[downl:current_depth-downr]
    img=img[:,:,slices]
    
    return img


# crop and scale the slices of a CT scan
def resize_volume(img, width, height, depth):
    """Resize across z-axis"""
    # Set the desired depth
    new_depth = depth
    new_width = width
    new_height = height
    
    # Get current depth
    current_depth = img.shape[2]
    current_width = img.shape[0]
    current_height = img.shape[1]
    
    # Upsampling/Downsampling slices at both extremity
    if new_depth<current_depth: img = downsample_borders(img, current_depth, new_depth)
    elif new_depth>current_depth: img = upsample_borders(img, current_depth, new_depth)
    else: pass # depth will not be changed => desired_depth==current_depth
    
    # Rotate
    img = ndimage.rotate(img, -90, reshape=False)
    #img = ndimage.rotate(img, 180, reshape=False)
    #img = np.flipud(img)
    
    crop_scan = img[img.shape[0]//6: - img.shape[0]//6, img.shape[1]//6: - img.shape[1]//6, :]
    depth_factor = 1.0 # only used for zooming. Images were previously upsampled/downsampled 
    width_factor = 1/(crop_scan.shape[0] / new_width)
    height_factor = 1/(crop_scan.shape[1] / new_height)
    img= ndimage.zoom(crop_scan, (width_factor, height_factor, depth_factor))

    return img



def process_scan(path):
    width=new_width
    height=new_height
    depth=new_depth
    """Read and resize volume"""
    # Read scan
    volume = read_nifti_file(path)
    if len(volume.shape)==5: volume=volume[:,:,:,0,0]
    # Normalize
    volume = volume.astype("float32")
    # Resize width, height and depth
    volume = resize_volume(volume, width, height, depth)
    return volume

In [4]:
# Pre-process the dataset

In [6]:
def pre_processing(normal_scans_path, ncp_scans_path, cp_scans_path):

    # Read and process the scans.
    pool = mp.Pool(mp.cpu_count())
    
    print('Pre-processing CT scans of Normal controls')
    normal_scans = np.array([pool.map(process_scan, normal_scans_path)])
    print('Samples classified as Normal: ', normal_scans.shape[1])
    
    print('Pre-processing CT scans of patients with NCP')
    ncp_scans = np.array([pool.map(process_scan, ncp_scans_path)])
    print('Samples classified as NCP: ', ncp_scans.shape[1])
    
    print('Pre-processing CT scans of patients with CP')
    cp_scans = np.array([pool.map(process_scan, cp_scans_path)])
    print('Samples classified as CP : ', cp_scans.shape[1])
    
    pool.close()
      
    # labels for training set
    # NORMAL
    normal_labels_indexes = np.array([0 for _ in range(len(normal_scans_path))])
    # NCP
    ncp_labels_indexes = np.array([1 for _ in range(len(ncp_scans_path))])
    # CP
    cp_labels_indexes = np.array([2 for _ in range(len(cp_scans_path))])
    
    nb_classes=3
    normal_labels=tf.one_hot(normal_labels_indexes, nb_classes) 
    ncp_labels=tf.one_hot(ncp_labels_indexes, nb_classes) 
    cp_labels=tf.one_hot(cp_labels_indexes, nb_classes) 
    
    normal_labels=tf.make_tensor_proto(normal_labels)
    ncp_labels=tf.make_tensor_proto(ncp_labels)
    cp_labels=tf.make_tensor_proto(cp_labels)
    
    normal_labels = tf.make_ndarray(normal_labels)
    ncp_labels = tf.make_ndarray(ncp_labels)
    cp_labels = tf.make_ndarray(cp_labels)
    
    print(normal_scans.shape)
    print(ncp_scans.shape)
    print(cp_scans.shape)
    
    x = np.hstack((normal_scans,
                   ncp_scans,
                   cp_scans
                  )) 
    y = np.concatenate((normal_labels, 
                   ncp_labels,
                   cp_labels
                  ))
    
    #print(x.shape)
    #print(y.shape)
    x = x.squeeze()
    
    return x, y


# get chest CT scan paths (saved as .nii)

# replace the scans_path variable with your directories

# get paths of the CT scans of normal controls
scans_path="/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/nii/Normal" # folder with scans of normal controls
normal_scans_paths = [
    os.path.join(os.getcwd(), scans_path, x)
    for x in os.listdir(scans_path)
]

# get paths of the CT scans of patients affected by NCP
scans_path="/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/nii/NCP" # folder with scans of patients affected by NCP
ncp_scans_paths = [
    os.path.join(os.getcwd(), scans_path, x)
    for x in os.listdir(scans_path)
]

# get paths of the CT scans of patients affected by NCP
scans_path="/home/pwrai/notebook/Clean-CC-CCII/dataset_cleaned/nii/CP" # folder with scans of patients affected by CP
cp_scans_paths = [
    os.path.join(os.getcwd(), scans_path, x)
    for x in os.listdir(scans_path)
]


print("Normal scans: " + str(len(normal_scans_paths)))
print("NCP scans: " + str(len(ncp_scans_paths)))
print("CP scans: " + str(len(cp_scans_paths)))

X, Y = pre_processing(normal_scans_paths, ncp_scans_paths, cp_scans_paths)

Normal scans: 966
NCP scans: 1519
CP scans: 1538
Pre-processing normal scans
Samples classified as Normal:  966
Pre-processing NCP scans




Samples classified as NCP:  1519
Pre-processing CP scans
Samples classified as CP :  1538
(1, 966, 256, 256, 25)
(1, 1519, 256, 256, 25)
(1, 1538, 256, 256, 25)


In [None]:
# build and save training, validation and test set for all folds (to implement a k-fold cross validation strategy)
# by default a 5-fold cross validation is implemented which implies that for each fold the dataset is split into 
# 20% for test set and 80% for training set. We also imposed that 10% of the training set was used for validation
# purposes.

In [None]:
from pathlib import Path
from skmultilearn.model_selection import IterativeStratification
from os import path

n_splits=5 # nb of folds
kfold = IterativeStratification(n_splits=n_splits, order=1)

i = 0

# replace "folds_path" with your directory
# for each fold a directory labeled fold_i (i stands the i-th fold) will be automatically created
folds_path="/home/pwrai/notebook/clean_cc_ccii_folds/25/"

for train, test in kfold.split(X, Y):
    i+=1
    x_train, x_val, y_train, y_val = train_test_split(X[train], Y[train], test_size=0.1,  stratify=Y[train])
    x_test = X[test]
    y_test = Y[test]
    
    fold_path=path.join(folds_path, "fold_"+str(i) + path.sep)
        
    Path(fold_path).mkdir(parents=True, exist_ok=True)
    np.save(fold_path+'x_train.npy', x_train)
    np.save(fold_path+'x_test.npy', x_test)
    np.save(fold_path+'x_val.npy', x_val)
    np.save(fold_path+'y_train.npy', y_train)
    np.save(fold_path+'y_test.npy', y_test)
    np.save(fold_path+'y_val.npy', y_val)