## Construct from MIT index

In [1]:
import SimpleITK as sitk
import pandas as pd

from pathlib import Path
from damply import dirs
from joblib import Parallel, delayed
from typing import Optional
from itertools import chain

from readii.image_processing import flattenImage, alignImages
from readii.io.loaders import loadImageDatasetConfig
from imgtools.io.writers.nifti_writer import NIFTIWriter, NiftiWriterIOError
from readii.negative_controls_refactor import NegativeControlManager
from readii.process.config import get_full_data_name
from readii.utils import logger



def get_readii_settings(dataset_config: dict) -> tuple[list, list, list]:
    """Pull the READII image-type settings out of a dataset configuration.

    Parameters
    ----------
    dataset_config : dict
        Configuration dictionary (as read in with `loadImageDatasetConfig`) that has a
        'READII' section containing an 'IMAGE_TYPES' entry.

    Returns
    -------
    tuple
        `(regions, permutations, crop)`, each taken from the key of the same name
        under `dataset_config['READII']['IMAGE_TYPES']`.

    Raises
    ------
    KeyError
        If 'READII', 'IMAGE_TYPES', or any of the three setting keys is missing.
        Only the missing 'IMAGE_TYPES' case is logged with a custom message.
    """
    readii_section = dataset_config['READII']

    # Fail early, with a logged message, when the image type block is absent
    if 'IMAGE_TYPES' not in readii_section:
        message = "READII configuration must contain 'IMAGE_TYPES'."
        logger.error(message)
        raise KeyError(message)

    image_types = readii_section['IMAGE_TYPES']

    # Looked up in the same order as the returned tuple
    return tuple(image_types[setting] for setting in ('regions', 'permutations', 'crop'))


def get_masked_image_metadata(dataset_index:pd.DataFrame,
                              dataset_config:Optional[dict] = None,
                              image_modality:Optional[str] = None,
                              mask_modality:Optional[str] = None):
    """Subset a Med-ImageTools index to the masks and the images those masks reference.

    Parameters
    ----------
    dataset_index : pd.DataFrame
        Index loaded from a Med-ImageTools index.csv. The columns read here are
        Modality, ReferencedSeriesUID, and SeriesInstanceUID.
    dataset_config : Optional[dict]
        Configuration dictionary used as the fallback source of the modalities, read from
        `dataset_config["MIT"]["MODALITIES"]["image"]` and `["mask"]`.
    image_modality : Optional[str]
        Image modality to keep. Takes precedence over the dataset_config setting.
    mask_modality : Optional[str]
        Mask modality to keep. Takes precedence over the dataset_config setting.

    Returns
    -------
    pd.DataFrame
        The referenced image rows followed by all mask rows of `dataset_index`.

    Raises
    ------
    ValueError
        If a modality is not passed and no dataset_config is available to supply it.
    """
    # Resolve the image modality: explicit argument first, config second
    if image_modality is None and dataset_config is None:
        message = "No image modality setting passed. Must pass a image_modality or dataset_config with an image modality setting."
        logger.error(message)
        raise ValueError(message)
    if image_modality is None:
        image_modality = dataset_config["MIT"]["MODALITIES"]["image"]

    # Resolve the mask modality the same way
    if mask_modality is None and dataset_config is None:
        message = "No mask modality setting passed. Must pass a mask_modality or dataset_config with a mask modality setting."
        logger.error(message)
        raise ValueError(message)
    if mask_modality is None:
        mask_modality = dataset_config["MIT"]["MODALITIES"]["mask"]

    modality_column = dataset_index['Modality']

    # Every row whose modality is the mask modality
    mask_rows = dataset_index.loc[modality_column == mask_modality]

    # Image rows, narrowed to those whose SeriesInstanceUID is referenced by some mask
    image_rows = dataset_index.loc[modality_column == image_modality]
    is_referenced = image_rows['SeriesInstanceUID'].isin(mask_rows['ReferencedSeriesUID'])

    return pd.concat([image_rows.loc[is_referenced], mask_rows], sort=True)



def save_out_negative_controls(nifti_writer: NIFTIWriter,
                               patient_id: str,
                               image: sitk.Image,
                               mask_roi_name: str,
                               region: str,
                               permutation: str,
                               orig_image_dirs: str):
    """Save one negative control image with the NIFTIWriter and describe where it went.

    Parameters
    ----------
    nifti_writer : NIFTIWriter
        Writer whose `save` method is called with the image and the remaining
        arguments as keyword context.
    patient_id : str
        Passed to the writer as `PatientID`.
    image : sitk.Image
        Negative control image to write.
    mask_roi_name : str
        Passed to the writer as `mask_roi_name`.
    region : str
        Region the negative control was applied to; also returned in the record.
    permutation : str
        Negative control type that was applied; also returned in the record.
    orig_image_dirs : str
        Passed to the writer as `orig_image_dirs`.

    Returns
    -------
    dict
        Record with keys 'Image_Path', 'Region', and 'Permutation'. 'Image_Path' is the
        written file's path relative to `dirs.PROCDATA`, or None when the writer raised
        `NiftiWriterIOError` and no path was produced.
    """
    try:
        out_path = nifti_writer.save(
            image,
            PatientID=patient_id,
            region=region,
            permutation=permutation,
            mask_roi_name=mask_roi_name,
            orig_image_dirs=orig_image_dirs,
        )
    except NiftiWriterIOError:
        message = f"{permutation} {region} negative control file already exists for {patient_id}. If you wish to overwrite, set overwrite to true in the NIFTIWriter."
        logger.debug(message)
        # The writer returned no path, so `out_path` is unbound here. Falling through to
        # the normal return would raise UnboundLocalError, so report the skipped file
        # with an empty path instead.
        return {'Image_Path': None, 'Region': region, 'Permutation': permutation}

    return {'Image_Path': out_path.relative_to(dirs.PROCDATA), 'Region': region, 'Permutation': permutation}

    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""Create negative control images and save them out as niftis"""
dataset = 'NSCLC-Radiomics_test'

# Guard for the case where the hard-coded name above is swapped for an unset value
if dataset is None:
    message = "Dataset name must be provided."
    logger.error(message)
    raise ValueError(message)

# Load the configuration for this dataset from the datasets config directory
config_dir_path = dirs.CONFIG / 'datasets'
dataset_config = loadImageDatasetConfig(dataset, config_dir_path)

dataset_name = dataset_config['DATASET_NAME']
full_data_name = get_full_data_name(config_dir_path / dataset)
logger.info(f"Creating negative controls for dataset: {dataset_name}")

# READII settings from the config; the crop setting is not used in this notebook
regions, permutations, _crop = get_readii_settings(dataset_config)

# Negative control manager built from the configured permutations and regions,
# with a fixed random_seed
manager = NegativeControlManager.from_strings(
    negative_control_types=permutations,
    region_types=regions,
    random_seed=10,
)

# Med-ImageTools output directory and its index file
mit_images_dir_path = dirs.PROCDATA / full_data_name / 'images' / f'mit_{dataset_name}'
dataset_index = pd.read_csv(mit_images_dir_path / f'mit_{dataset_name}_index.csv')

# Image modality first, then mask modality, both from the MIT section of the config
image_modality, mask_modality = (
    dataset_config["MIT"]["MODALITIES"][role] for role in ("image", "mask")
)

# READII outputs go in a sibling directory of the MIT images
readii_image_dir = mit_images_dir_path.parent / f'readii_{dataset_name}'

In [None]:
# Keep only the masks and the images they were drawn on
masked_image_index = get_masked_image_metadata(
    dataset_index=dataset_index,
    image_modality=image_modality,
    mask_modality=mask_modality,
)

# Writer for the negative controls. Output file names are built from the keyword
# context passed at save time (orig_image_dirs, mask_roi_name, permutation, region).
nifti_writer = NIFTIWriter(
    root_directory=readii_image_dir,
    filename_format="{orig_image_dirs}/{mask_roi_name}/" + f"{image_modality}" + "_{permutation}_{region}.nii.gz",
    create_dirs=True,
    existing_file_mode='SKIP',
    sanitize_filenames=True,
    index_filename=readii_image_dir / f"readii_{dataset_name}_index.csv",
)

# One pass per StudyInstanceUID
for study, study_data in masked_image_index.groupby('StudyInstanceUID'):
    logger.info(f"Processing StudyInstanceUID: {study}")

    # Image metadata for this study. squeeze() collapses a single-row frame to a Series;
    # assumes one image series per study — TODO confirm
    image_metadata = study_data[study_data['Modality'] == image_modality].squeeze()
    image_path = Path(image_metadata['filepath'])

    # Load the image, then remove the extra dimension and restore origin, spacing, direction
    raw_image = sitk.ReadImage(mit_images_dir_path / image_path)
    image = alignImages(raw_image, flattenImage(raw_image))

    # All mask rows of this study (a DataFrame, one row per mask)
    all_mask_metadata = study_data[study_data['Modality'] == mask_modality]

    for _, mask_metadata in all_mask_metadata.iterrows():
        # Load the mask and give it the same flatten/align treatment as the image
        mask_path = mit_images_dir_path / Path(mask_metadata['filepath'])
        raw_mask = sitk.ReadImage(mask_path)
        mask = alignImages(raw_mask, flattenImage(raw_mask))

        mask_roi_name = mask_metadata['ImageID']

        # Arguments that are identical for every negative control of this image/mask pair
        shared_save_kwargs = {
            'patient_id': image_metadata['PatientID'],
            'mask_roi_name': mask_roi_name,
            'orig_image_dirs': image_path.parent,
        }

        # Generate each negative control for this pair and save it out with the writer
        readii_image_metadata = [
            save_out_negative_controls(
                nifti_writer,
                image=neg_image,
                region=region,
                permutation=permutation,
                **shared_save_kwargs,
            )
            for neg_image, permutation, region in manager.apply(image, mask)
        ]

# Set up index for PyRadiomics from MIT and READII index files

In [3]:
# Load the "index-simple" CSV that sits alongside the MIT images
MIT_index_file = Path(mit_images_dir_path, f'mit_{dataset_name}_index-simple.csv')
MIT_index = pd.read_csv(MIT_index_file)

In [None]:
# Split the MIT index by modality
image_rows = MIT_index.loc[MIT_index['Modality'] == image_modality]
mask_rows = MIT_index.loc[MIT_index['Modality'] == mask_modality]

# Pair every mask with the image it references: the mask's ReferencedSeriesUID must
# equal the image's SeriesInstanceUID, within the same sample and patient.
# Columns present on both sides are disambiguated with _image / _mask suffixes.
MIT_merge_index = image_rows.merge(
    mask_rows,
    left_on=['SeriesInstanceUID', 'SampleNumber', 'PatientID'],
    right_on=['ReferencedSeriesUID', 'SampleNumber', 'PatientID'],
    suffixes=('_image', '_mask'),
)



In [6]:
# Build the PyRadiomics rows for the original images: every image/mask pair from the
# MIT merge, labelled as the "original" permutation over the "full" region.
#
# Paths are turned into text with str() rather than f-strings nested inside f-strings
# with the same quote character, which only parse on Python 3.12+. The dataset prefix
# is built once instead of once per row, and each column is derived column-wise
# instead of through a row-wise apply.
mit_root = Path(f"mit_{dataset_name}")

image_mask_match_mit = pd.DataFrame(
    data={
        # <PatientID>_<SampleNumber zero-padded to 4 characters>
        "SampleID": (
            MIT_merge_index['PatientID'].astype(str)
            + "_"
            + MIT_merge_index['SampleNumber'].astype(str).str.zfill(4)
        ),
        "MaskID": MIT_merge_index['ImageID_mask'],
        "Permutation": "original",
        "Region": "full",
        # Image and mask paths, each prefixed with the MIT dataset directory name
        "Image": MIT_merge_index['filepath_image'].map(lambda rel_path: str(mit_root / rel_path)),
        "Mask": MIT_merge_index['filepath_mask'].map(lambda rel_path: str(mit_root / rel_path)),
    }
)
image_mask_match_mit.head()

Unnamed: 0,SampleID,MaskID,Permutation,Region,Image,Mask
0,LUNG1-001_0000,GTV,original,full,mit_NSCLC-Radiomics_test/LUNG1-001_0000/CT_633...,mit_NSCLC-Radiomics_test/LUNG1-001_0000/RTSTRU...
1,LUNG1-001_0000,Lung-Left,original,full,mit_NSCLC-Radiomics_test/LUNG1-001_0000/CT_633...,mit_NSCLC-Radiomics_test/LUNG1-001_0000/RTSTRU...
2,LUNG1-002_0001,GTV,original,full,mit_NSCLC-Radiomics_test/LUNG1-002_0001/CT_232...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...
3,LUNG1-002_0001,Lung-Left,original,full,mit_NSCLC-Radiomics_test/LUNG1-002_0001/CT_232...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...


In [7]:
# Load the READII index CSV from the READII image directory
READII_index_file = Path(readii_image_dir, f'readii_{dataset_name}_index.csv')
READII_index = pd.read_csv(READII_index_file)

In [8]:
# Build the PyRadiomics rows for the negative control images listed in the READII index.
#
# Paths are turned into text without nesting f-strings that reuse the same quote
# character (that syntax only parses on Python 3.12+), and the dataset prefixes are
# built once instead of once per row.
#
# NOTE(review): the columns read here (dir_original_image, dirname_mask, ImageID_mask,
# Permutation, Region) are not the keyword names passed to the NIFTIWriter when the
# negative controls are saved (orig_image_dirs, mask_roi_name, permutation, region).
# Confirm the index schema before re-running this cell.
mit_root = Path(f"mit_{dataset_name}")
readii_root = Path(f"readii_{dataset_name}")

# Parent of each row's original image directory; used as the sample identifier and as
# the first component of the mask path
sample_dirs = READII_index['dir_original_image'].map(lambda image_dir: Path(image_dir).parent)

image_mask_match_readii = pd.DataFrame(
    data={
        "SampleID": sample_dirs.map(str),
        "MaskID": READII_index['ImageID_mask'],
        "Permutation": READII_index["Permutation"],
        "Region": READII_index["Region"],
        # Negative control image, prefixed with the READII dataset directory name
        "Image": READII_index['filepath'].map(lambda rel_path: str(readii_root / rel_path)),
        # Mask file in the MIT output: <sample dir>/<mask dirname>/<mask ImageID>.nii.gz
        "Mask": pd.Series(
            [
                f"{mit_root / sample_dir / mask_dir / mask_id}.nii.gz"
                for sample_dir, mask_dir, mask_id in zip(
                    sample_dirs, READII_index['dirname_mask'], READII_index['ImageID_mask']
                )
            ],
            index=READII_index.index,
            dtype=object,
        ),
    }
)
image_mask_match_readii.head()

Unnamed: 0,SampleID,MaskID,Permutation,Region,Image,Mask
0,LUNG1-002_0001,GTV,sampled,full,readii_NSCLC-Radiomics_test/LUNG1-002_0001/CT_...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...
1,LUNG1-002_0001,GTV,sampled,roi,readii_NSCLC-Radiomics_test/LUNG1-002_0001/CT_...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...
2,LUNG1-002_0001,GTV,sampled,non_roi,readii_NSCLC-Radiomics_test/LUNG1-002_0001/CT_...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...
3,LUNG1-002_0001,GTV,shuffled,full,readii_NSCLC-Radiomics_test/LUNG1-002_0001/CT_...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...
4,LUNG1-002_0001,GTV,shuffled,roi,readii_NSCLC-Radiomics_test/LUNG1-002_0001/CT_...,mit_NSCLC-Radiomics_test/LUNG1-002_0001/RTSTRU...


In [20]:
# Stack the original-image rows and the negative-control rows into one index
pyradiomics_index = pd.concat(
    [image_mask_match_mit, image_mask_match_readii],
    axis=0,
    ignore_index=True,
)

# Order by permutation, then region, then sample and mask, renumbering the rows
pyradiomics_index = pyradiomics_index.sort_values(
    by=['Permutation', 'Region', 'SampleID', 'MaskID'],
    ignore_index=True,
)

# Make sure the features directory exists, then write the index into it
features_dir = dirs.PROCDATA / full_data_name / "features"
features_dir.mkdir(parents=True, exist_ok=True)
pyradiomics_index.to_csv(features_dir / f'pyradiomics_{dataset_name}_index.csv', index=False)