# Development Notebook

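Scratch notebook for developing and debugging the data-preprocessing code: loading images and masks, splitting multi-label masks into individual lesions, and extracting RECIST coordinates.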

In [1]:
# Move the working directory two levels up; the echoed path below matches the
# project root that damply reports in the following cell.
%cd ../..

/home/bhkuser/bhklab/katy/aaura-bench-preprocess


In [2]:
import logging

# Configure the root logger: INFO and above, formatted as "time - level - message".
logging.basicConfig(
	level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s'
)

# Notebook-level logger; process_one() further down logs through it.
logger = logging.getLogger(__name__)

In [3]:
from damply import dirs

# Show the project directory layout exposed by damply (see the output below);
# dirs.RAWDATA is the root for every input path used in this notebook.
print(dirs)

DamplyDirs<Structure: NESTED>
Project Root: /home/bhkuser/bhklab/katy/aaura-bench-preprocess
CONFIG       : ├── <not found>
LOGS         : ├── logs
METADATA     : ├── <not found>
NOTEBOOKS    : ├── workflow/notebooks
PROCDATA     : ├── data/procdata
RAWDATA      : ├── data/rawdata
RESULTS      : ├── data/results
SCRIPTS      : └── workflow/scripts


In [5]:
# Name of the dataset folder; used below to build paths under dirs.RAWDATA.
dataset = "CVPR_LesionLocator"
# Hard-coded sample kept for reference only; sample_id is read from the naming table in the next cell.
# sample_id = "LesionLocator_0001"

In [7]:
import pandas as pd
# Naming table for the dataset; the `metadata` display further down shows 'File Name' and 'Source' columns.
metadata = pd.read_csv(dirs.RAWDATA / dataset / "naming_1.csv")
# Source-dataset-to-anatomy lookup; presumably has 'Dataset' and 'Anatomy' columns
# (inferred from the commented-out loop in the next cell — TODO confirm).
anatomy_match = pd.read_csv(dirs.RAWDATA / dataset / "dataset_anatomy_match.csv")

# Use the first row of the naming table as the working sample for the rest of the notebook.
sample_id = metadata.iloc[0]['File Name']

In [None]:
# for idx, source_dataset in anatomy_match.iterrows():
#     # print(source_dataset['Anatomy'])
#     metadata.loc[metadata.Source.str.contains(source_dataset.Dataset),'lesion_location'] = source_dataset['Anatomy']

In [8]:
import SimpleITK as sitk
from imgtools.coretypes import MedImage
def image_proc(image_path):
    """Load an image from disk and wrap it as a 32-bit signed integer ``MedImage``.

    Parameters
    ----------
    image_path
        Location of the image file. It is converted with ``str()`` before
        being passed to ``sitk.ReadImage``.

    Returns
    -------
    MedImage
        The loaded image after casting its pixels to ``sitk.sitkInt32``.
    """
    # Read from disk and cast to Int32 in a single expression
    int32_image = sitk.Cast(sitk.ReadImage(str(image_path)), sitk.sitkInt32)

    # Wrap in the med-imagetools image type
    return MedImage(int32_image)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import SimpleITK as sitk
from imgtools.coretypes import Mask, VectorMask

def mask_proc(mask_path):
    """Load a segmentation from disk and wrap it as an 8-bit unsigned ``Mask``.

    Parameters
    ----------
    mask_path
        Location of the label file. It is converted with ``str()`` before
        being passed to ``sitk.ReadImage``.

    Returns
    -------
    Mask
        The loaded labels cast to ``sitk.sitkUInt8`` and wrapped with
        ``metadata={"mask.ndim": 3}``.
    """
    # Read from disk and cast to UInt8 in a single expression
    uint8_labels = sitk.Cast(sitk.ReadImage(str(mask_path)), sitk.sitkUInt8)

    # TODO: Find the maximum pixel value, this will be the number of volumes, make a little roi mapping then go back to using VectorMask

    # Wrap in the med-imagetools Mask type
    return Mask(uint8_labels, metadata={"mask.ndim": 3})

In [10]:
from imgtools.coretypes import MedImage, Mask
from pathlib import Path
import SimpleITK as sitk

# Paths (relative to dirs.RAWDATA) of the baseline and synthetic follow-up
# scan/label files for the current sample. Image files carry a "_0000"
# suffix; label files do not.
base_scan_path = Path(dataset) / "Baseline" / "images" / f"{sample_id}_0000.nii.gz"
base_mask_path = Path(dataset) / "Baseline" / "labels" / f"{sample_id}.nii.gz"
synth_scan_path = Path(dataset) / "Synthetic_Follow_Up" / "images" / f"{sample_id}_0000.nii.gz"
synth_mask_path = Path(dataset) / "Synthetic_Follow_Up" / "labels" / f"{sample_id}.nii.gz"

# Load all four files through the helper functions defined in the cells above.
base_scan = image_proc(dirs.RAWDATA / base_scan_path)
base_mask = mask_proc(dirs.RAWDATA / base_mask_path)
synth_scan = image_proc(dirs.RAWDATA / synth_scan_path)
synth_mask = mask_proc(dirs.RAWDATA / synth_mask_path)

In [11]:
# Fingerprint of the baseline scan: geometry, intensity statistics and dtype (displayed below).
scan_metadata = base_scan.fingerprint
scan_metadata

{'class': 'MedImage',
 'hash': 'ff85a941fceb26225c48a1728dc7708b8dbdfe8b',
 'size': Size3D(w=512, h=512, d=706),
 'ndim': 3,
 'nvoxels': 185073664,
 'spacing': Spacing3D(x=0.68359375, y=0.68359375, z=1.0),
 'origin': Coordinate3D(x=-160.0, y=-4.0, z=-833.5999755859375),
 'direction': Direction([1.00,0.00,0.00], [0.00,1.00,0.00], [0.00,0.00,1.00]),
 'min': -1024.0,
 'max': 2976.0,
 'sum': -92027533761.0,
 'mean': -497.24813229504116,
 'std': 514.3937331804074,
 'variance': 264600.91273527616,
 'dtype_str': '32-bit signed integer',
 'dtype_numpy': numpy.int32}

In [12]:
# Fingerprint of the baseline mask; the output additionally lists 'mask.*' shape descriptors.
mask_metadata = base_mask.fingerprint
mask_metadata

{'class': 'Mask',
 'hash': 'bb0acc2646076aec5916c0721afc716525e18b51',
 'size': Size3D(w=512, h=512, d=706),
 'ndim': 3,
 'nvoxels': 185073664,
 'spacing': Spacing3D(x=0.68359375, y=0.68359375, z=1.0),
 'origin': Coordinate3D(x=-160.0, y=-4.0, z=-833.5999755859375),
 'direction': Direction([1.00,0.00,0.00], [0.00,1.00,0.00], [0.00,0.00,1.00]),
 'min': 0.0,
 'max': 19.0,
 'sum': 934178.0,
 'mean': 0.005047600937970299,
 'std': 0.25113873271611303,
 'variance': 0.06307066307025526,
 'dtype_str': '8-bit unsigned integer',
 'dtype_numpy': numpy.uint8,
 'mask.bbox.size': Size3D(w=31, h=41, d=25),
 'mask.bbox.min_coord': Coordinate3D(x=166, y=176, z=100),
 'mask.bbox.max_coord': Coordinate3D(x=197, y=217, z=125),
 'mask.feret_diameter': 36.51063779726782,
 'mask.roundness': 0.606473662639058,
 'mask.flatness': 1.2599880828983712,
 'mask.elongation': 2.084690053771264,
 'mask.equivalent_spherical_radius': 9.615066544776484,
 'mask.equivalent_spherical_perimeter': 1161.7547386775173,
 'mask.eq

In [23]:
import numpy as np

# Split the multi-label baseline mask into one binary Mask per label value.
label_array = sitk.GetArrayFromImage(base_mask)

# Iterate over the label values that are actually present instead of assuming
# they form the contiguous range 1..N: with gaps in the labelling (e.g. 1, 3, 7)
# a range-based loop would produce empty masks and miss the higher labels.
for i in np.unique(label_array):
    if i == 0:
        # Label 0 is skipped, as in the original 1-based loop (treated as background).
        continue

    # Binary mask for this label only
    curr_mask = (label_array == i).astype('uint8')

    curr_mask_img = sitk.GetImageFromArray(curr_mask)
    # Copy origin / spacing / direction from the source mask onto the new image
    curr_mask_img.CopyInformation(base_mask)

    # NOTE: only the Mask built in the final iteration is still bound after the loop.
    curr_mask_mi = Mask(curr_mask_img, metadata={"mask.ndim": 3})

    print(f'Volume {i} has {curr_mask.sum()} voxels.')

Volume 1 has 7968 voxels.
Volume 2 has 15039 voxels.
Volume 3 has 5687 voxels.
Volume 4 has 3610 voxels.
Volume 5 has 4229 voxels.
Volume 6 has 4186 voxels.
Volume 7 has 8404 voxels.
Volume 8 has 4947 voxels.
Volume 9 has 1108 voxels.
Volume 10 has 7160 voxels.
Volume 11 has 6373 voxels.
Volume 12 has 8414 voxels.
Volume 13 has 291 voxels.
Volume 14 has 1024 voxels.
Volume 15 has 5935 voxels.
Volume 16 has 11095 voxels.
Volume 17 has 1891 voxels.
Volume 18 has 5820 voxels.
Volume 19 has 2408 voxels.


In [25]:
# Display the naming table loaded earlier.
metadata

Unnamed: 0,File Name,Source
0,LesionLocator_2342,NIH-LYMPH-ABD-002


In [None]:
def process_one(sample: pd.Series,
                dataset: str,
                timepoint: str):
    """Build the index records for every mask of one sample at one timepoint.

    Parameters
    ----------
    sample
        Row of the naming table; only its ``'File Name'`` entry is read.
    dataset
        Dataset folder name, used as the first component of every path.
    timepoint
        Sub-folder of the dataset that holds ``images/`` and ``labels/``.

    Returns
    -------
    dict
        One record per key of the mask metadata, keyed
        ``"<file name>_<mask key>"``. Each record holds the processed image
        and mask paths, annotation details, image geometry and mask volume;
        ``lesion_location`` and ``source`` are left as ``None``.

    Notes
    -----
    NOTE(review): the result of ``image_proc`` is indexed by key and
    ``.items()`` is called on the result of ``mask_proc``, so both helpers
    are expected to return mappings here. As defined earlier in this notebook
    they return ``MedImage`` / ``Mask`` objects -- confirm which side is
    meant to change.
    NOTE(review): ``"RERECIST"`` may be a typo for ``"RECIST"`` -- confirm.
    """
    sample_name = sample['File Name']
    logger.info(f'Processing sample: {sample_name}')

    # Raw inputs live under <dataset>/<timepoint>/{images,labels}
    raw_dir = Path(dataset) / timepoint
    image_path = raw_dir / 'images' / f'{sample_name}_0000.nii.gz'
    mask_path = raw_dir / 'labels' / f'{sample_name}.nii.gz'

    # Processed outputs are laid out as <dataset>/images/<timepoint>/<file name>
    proc_path_stem = Path(dataset) / "images" / timepoint / sample_name

    # Load the image and its masks from the raw-data directory
    image_metadata = image_proc(dirs.RAWDATA / image_path)
    masks_metadata = mask_proc(dirs.RAWDATA / mask_path)
    logger.info(f'Image and mask loaded for sample: {sample_name}')

    # One record per mask; image-level fields are repeated in every record
    return {
        f"{sample_name}_{mask_key}": {
            "id": sample_name,
            "image_path": proc_path_stem / 'CT.nii.gz',
            "mask_path": proc_path_stem / f'mask_{mask_key}.nii.gz',
            "annotation_type": "RERECIST",
            "annotation_coords": mask_info["annotation_coords"],
            "largest_slice_index": mask_info["largest_slice_index"],
            "size": image_metadata["size"],
            "spacing": image_metadata["spacing"],
            "origin": image_metadata["origin"],
            "direction": image_metadata["direction"],
            "mask_volume": mask_info["sum"],
            "lesion_location": None,
            "source": None,
        }
        for mask_key, mask_info in masks_metadata.items()
    }

In [24]:
# Fingerprint of the single-label mask left bound by the last iteration of the splitting loop.
curr_mask_mi.fingerprint

{'class': 'Mask',
 'hash': '2974a6e55f4429209b2fd3a0187975ebf69cd5fa',
 'size': Size3D(w=512, h=512, d=706),
 'ndim': 3,
 'nvoxels': 185073664,
 'spacing': Spacing3D(x=0.68359375, y=0.68359375, z=1.0),
 'origin': Coordinate3D(x=-160.0, y=-4.0, z=-833.5999755859375),
 'direction': Direction([1.00,0.00,0.00], [0.00,1.00,0.00], [0.00,0.00,1.00]),
 'min': 0.0,
 'max': 1.0,
 'sum': 2408.0,
 'mean': 1.30110354328966e-05,
 'std': 0.003607057833769037,
 'variance': 1.3010866216154578e-05,
 'dtype_str': '8-bit unsigned integer',
 'dtype_numpy': numpy.uint8,
 'mask.bbox.size': Size3D(w=19, h=24, d=11),
 'mask.bbox.min_coord': Coordinate3D(x=252, y=246, z=421),
 'mask.bbox.max_coord': Coordinate3D(x=271, y=270, z=432),
 'mask.feret_diameter': 18.36765479852699,
 'mask.roundness': 0.7173159204238074,
 'mask.flatness': 1.2367831470781059,
 'mask.elongation': 1.3280599700515419,
 'mask.equivalent_spherical_radius': 6.452400953129126,
 'mask.equivalent_spherical_perimeter': 523.1817152660182,
 'mask.

In [13]:
# Flag masks that report more than one volume; these need to be separated before indexing.
if base_mask.volume_count > 1:
    print("Multiple masks, must separate")

Multiple masks, must separate


In [11]:
# Index record for the baseline scan/mask pair of the current sample.
# 'recist_coords' starts as None and is filled in by a later cell.
baseline_sample_index = {"id": sample_id,
                         "image_path": base_scan_path,
                         "mask_path": base_mask_path,
                         "recist_coords": None,
                         "spacing": base_scan.spacing,
                         "origin": base_scan.origin,
                         "direction": base_scan.direction,
                         "mask_volume": base_mask.fingerprint["sum"],
                         # NOTE(review): hard-coded anatomy — confirm it applies to every sample.
                         "lesion_location": "abdomen",
                         # Look the source up in the naming table by file name, so the
                         # cell only depends on variables this notebook defines.
                         "source": metadata.loc[metadata['File Name'] == sample_id, 'Source'].values[0]
                         }

In [17]:
# The first element of the to_numpy() result is the voxel array; its shape is shown below.
np_base_mask = base_mask.to_numpy()[0]

np_base_mask.shape

(611, 512, 512)

In [15]:
import numpy as np
from skimage.measure import regionprops 
def mask2D_to_bbox(mask:np.array, 
                   padding:int | None = None,
                   spacing:np.array = None
                   ) -> np.array:
        
        props = regionprops(mask)[0]
        y_cent, x_cent = props.centroid
        orientation = props.orientation
        semi_maj_axis_len = props.axis_major_length / 2

        x_start = x_cent - np.sin(orientation) * semi_maj_axis_len
        y_start = y_cent - np.cos(orientation) * semi_maj_axis_len

        x_end = x_cent + np.sin(orientation) * semi_maj_axis_len
        y_end = y_cent + np.cos(orientation) * semi_maj_axis_len

        boxes = np.array([x_start, y_start, x_end, y_end])

        # if padding:
        #     boxes = pad_bbox(box = boxes,
        #                      mask = mask,
        #                      padding = padding,
        #                      spacing = spacing)
        
        return boxes.astype(int)

In [16]:
import numpy as np
def get_recist_coords(mask: "MedImage"):
    """Compute RECIST end-point coordinates on the axial slice with the largest mask area.

    Parameters
    ----------
    mask
        Object whose ``to_numpy()`` result has the voxel array as its first
        element, with the axial slice index on axis 0. The notebook passes
        the ``Mask`` returned by ``mask_proc``.

    Returns
    -------
    np.ndarray
        The ``mask2D_to_bbox`` result for the selected slice.
    """
    # First element of the to_numpy() result is the voxel array
    np_mask = mask.to_numpy()[0]

    # Count the non-zero voxels in every axial slice. Counting (rather than
    # summing) keeps the measure an area even when the mask stores label ids
    # greater than 1, where a sum would favour slices with high label values.
    axial_area = np.count_nonzero(np_mask, axis=(1, 2))

    # Index of the axial slice with the largest mask area
    axial_index = np.argmax(axial_area)

    max_slice = np_mask[axial_index]
    recist_coords = mask2D_to_bbox(max_slice)

    return recist_coords
    

In [18]:
# Fill in the RECIST coordinates computed from the baseline mask.
baseline_sample_index['recist_coords'] = get_recist_coords(base_mask)

In [19]:
# Inspect the completed baseline record.
baseline_sample_index

{'id': 'LesionLocator_0001',
 'image_path': PosixPath('CVPR_LesionLocator/Baseline/images/LesionLocator_0001_0000.nii.gz'),
 'mask_path': PosixPath('CVPR_LesionLocator/Baseline/labels/LesionLocator_0001.nii.gz'),
 'recist_coords': array([347, 179, 326, 200]),
 'spacing': Spacing3D(x=0.919921875, y=0.919921875, z=0.5),
 'origin': Coordinate3D(x=470.080078125, y=470.080078125, z=-305.0),
 'direction': Direction([-1.00,0.00,0.00], [0.00,-1.00,0.00], [0.00,0.00,1.00]),
 'mask_volume': 20462.0,
 'lesion_location': 'abdomen',
 'source': 'KiTS23_case_00000'}