# Data Preprocessing

This notebook preprocesses the CT-scan volumes and their segmentation masks and saves the results as pickled NumPy arrays.

>> Author: Bogdan Aioanei

In [1]:
import os
import numpy as np
import nibabel as nib
import matplotlib.pyplot as plt
from scipy import ndimage
import cv2
from skimage.transform import resize

### Get the filepaths for the 2 sets of CT scans

In [None]:
batch1_path = r"./Training Batch 1/"
batch2_path = r"./Training Batch 2/"


def _list_scan_paths(batch_dir):
    """
    Build the full path of every entry found in a batch directory.

    :param batch_dir: string path of the directory holding one batch of CT-scan files
    :returns: list of paths (current working directory + batch_dir + entry name), sorted by entry name
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible from one run / machine to the next.
    return [
        os.path.join(os.getcwd(), batch_dir, name)
        for name in sorted(os.listdir(batch_dir))
    ]


# Batch 1 is used further down as the test set, batch 2 as the training set.
scan_batch1_paths = _list_scan_paths(batch1_path)
scan_batch2_paths = _list_scan_paths(batch2_path)

### Create some functions to preprocess the data

In [None]:
# process the volume-type files
def process_file(filepath, target_shape=(128, 128, 64)):
    """
    This function is used for preprocessing of the volume-type CT-scans

    The scan is loaded, cast to float32, divided by the L2 norm of the whole
    volume, rotated by 90 degrees in the plane of its first two axes and finally
    resampled (linear interpolation) to ``target_shape``.

    :param filepath: string variable representing the full path towards the CT-scan volume file
    :param target_shape: size of the returned array along each axis of the scan;
        the default (128, 128, 64) matches the arrays allocated later in the notebook
    :returns: numpy array of shape ``target_shape`` containing pixel values of the pre-processed CT-scan volume
    """
    # get the data
    scan = nib.load(filepath).get_fdata().astype("float32")

    # normalize data; an all-zero scan is left untouched instead of being
    # turned into NaNs by a division by zero
    norm = np.linalg.norm(scan)
    normalized_scan = scan if norm == 0 else scan / norm

    # rotate 90 degrees
    img = ndimage.rotate(normalized_scan, 90, reshape=False)

    # resize: derive every zoom factor from the actual scan size so the output
    # is always target_shape (a fixed 0.25 factor is only right for 512x512 input)
    zoom_factors = [new / old for new, old in zip(target_shape, img.shape)]
    img = ndimage.zoom(img, zoom_factors, order=1)

    return img

def process_masks(filepath, target_shape=(128, 128, 64)):
    """
    This function is used for preprocessing of the segmentation-type CT-scans, i.e. ground truth 'volumes'

    The mask is rotated and resampled with nearest-neighbour interpolation
    (``order=0``) only. Masks hold class labels, so any smoothing interpolation
    followed by rounding would invent labels at class borders (e.g. a ring of
    1 wherever 0 touches 2).

    :param filepath: string variable representing the full path towards the CT-scan segmentation file
    :param target_shape: size of the returned array along each axis of the mask;
        the default (128, 128, 64) matches the arrays allocated later in the notebook
    :returns: numpy array of shape ``target_shape`` containing pixel values of the pre-processed CT-scan segmentation
    """
    # get the data
    scan = nib.load(filepath).get_fdata().astype("float32")

    # rotate 90 degrees (nearest neighbour: labels are copied, never blended)
    img = ndimage.rotate(scan, 90, reshape=False, order=0)

    # resize with the same scipy resampler process_file uses for the volumes,
    # again with nearest neighbour so only labels present in the file can appear
    zoom_factors = [new / old for new, old in zip(target_shape, img.shape)]
    img = ndimage.zoom(img, zoom_factors, order=0)

    # approximate the values of the array back to the ground truths, i.e. : 0, 1 and 2
    # (only matters if the file itself stores slightly non-integer values)
    img_app = np.rint(img)

    return img_app

### Preprocess the data using the previously created functions

In [None]:
# create some variables to store the data
def _load_batch(paths):
    """
    Preprocess every scan of one batch.

    Files whose name contains 'segmentation-' go through process_masks, files
    whose name contains 'volume-' go through process_file. Only the file name is
    inspected, so a directory that happens to contain one of these markers
    cannot cause a file to be misclassified.

    :param paths: iterable of full paths of the files of one batch
    :returns: tuple (volumes, labels) of two lists of numpy arrays
    :raises ValueError: if the batch does not hold as many masks as volumes
    """
    volumes = []
    labels = []

    # Volumes and masks are paired by position later on. Sorting by file name
    # gives both kinds the same relative order, provided they share the same
    # numbering (volume-N <-> segmentation-N).
    for filepath in sorted(paths, key=os.path.basename):
        filename = os.path.basename(filepath)

        if 'segmentation-' in filename:
            labels.append(process_masks(filepath))

        if 'volume-' in filename:
            volumes.append(process_file(filepath))

    if len(volumes) != len(labels):
        raise ValueError(
            "found %d volumes but %d segmentations; they cannot be paired"
            % (len(volumes), len(labels))
        )

    return volumes, labels


# batch 1 is used as the test set, batch 2 as the training set
test_volumes, test_labels = _load_batch(scan_batch1_paths)
train_volumes, train_labels = _load_batch(scan_batch2_paths)

### Allocate the NumPy arrays and fill them with the preprocessed slices

In [None]:
# Define the dimensions of the numpy arrays which will contain train / test data
# Count the volumes that were actually loaded instead of assuming that exactly
# half of the directory entries are volumes (a stray file would break that).
len_train = len(train_volumes)
len_test = len(test_volumes)

# every preprocessed scan is 128 x 128 pixels in-plane and 64 slices deep
img_depth = 64
img_size = 128

# one row per slice: 3 channels for the images, 1 channel for the masks
X_train = np.zeros((len_train * img_depth, img_size, img_size, 3), dtype=np.float32)
y_train = np.zeros((len_train * img_depth, img_size, img_size, 1), dtype=np.float32)

X_test = np.zeros((len_test * img_depth, img_size, img_size, 3), dtype=np.float32)
y_test = np.zeros((len_test * img_depth, img_size, img_size, 1), dtype=np.float32)

In [None]:
# process the data
def _fill_from_volumes(volumes, out):
    """
    Copy every slice of every volume into consecutive rows of ``out``.

    Each volume is cut along its last axis; the slices of the first volume fill
    the first rows of ``out``, the slices of the second volume the next rows,
    and so on. Every 2-D slice is broadcast over the channel axis of ``out``:
    a 3-channel array receives the same grey value in each channel (what
    cv2.COLOR_GRAY2RGB produces), a 1-channel array receives the slice as is.

    :param volumes: list of 3-D numpy arrays whose last axis is the slice axis
    :param out: pre-allocated 4-D numpy array (slice, row, column, channel), filled in place
    :returns: number of rows of ``out`` that were written
    """
    pos = 0
    for volume in volumes:
        depth = volume.shape[-1]
        # move the slice axis to the front and append a channel axis of size 1
        slices = np.moveaxis(volume, -1, 0)[..., np.newaxis]
        out[pos:pos + depth] = slices
        pos += depth
    return pos


# split each image from the volume into 3 channels (RGB)
_fill_from_volumes(test_volumes, X_test)
_fill_from_volumes(train_volumes, X_train)

# the segmentation data keeps a single channel
_fill_from_volumes(train_labels, y_train)
_fill_from_volumes(test_labels, y_test)

### Define a function to save the processed data locally

In [None]:
# save variable to file
import pickle

def save_variable_to_local(variable, local_path):
    """
    This function is used for saving the data locally as a pickle file

    :param variable: variable that needs to be saved (any picklable object)
    :param local_path: string path indicating where to save the variable;
        an existing file at that path is overwritten
    """
    # the context manager closes the file even if pickling fails half-way
    with open(local_path, 'wb') as file:
        pickle.dump(variable, file)

In [None]:
# save the variables to local files for later usage
# NOTE(review): these are absolute paths on one specific machine; they have to
# be adapted before the notebook can run anywhere else.
test_label_path = r"D:\Facultate\job\media\nas\01_Datasets\CT\LITS/y_test2.pckl"
test_volume_path = r"D:\Facultate\job\media\nas\01_Datasets\CT\LITS/X_test.pckl"

train_label_path = r"D:\Facultate\job\media\nas\01_Datasets\CT\LITS/y_train2.pckl"
train_volume_path = r"D:\Facultate\job\media\nas\01_Datasets\CT\LITS/X_train.pckl"

# write each array to its destination: test set first, then training set
for array, destination in (
    (y_test, test_label_path),
    (X_test, test_volume_path),
    (y_train, train_label_path),
    (X_train, train_volume_path),
):
    save_variable_to_local(array, destination)