In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Filepath setup: every path used by this notebook is derived from the constants below
DATA_SOURCE = "TCIA"
DATASET_NAME = "CC-Radiomics-Phantom"
DATA_DIR = "../data/procdata"
# Name of the imgtools output directory (also the prefix of its index file name)
MIT_INDEX_DIR = "mit_" + DATASET_NAME

# Dataset-specific directory that holds both the imgtools output and the pyradiomics inputs
dataset_dir = Path(DATA_DIR, f"{DATA_SOURCE}_{DATASET_NAME}")

# Index file read as input by this notebook
data_index_file = dataset_dir / MIT_INDEX_DIR / f"{MIT_INDEX_DIR}_index.csv"
# Index file written for pyradiomics. It sits in a sibling directory of MIT_INDEX_DIR so that
# image/mask paths of the form "../<MIT_INDEX_DIR>/..." resolve relative to this file's folder.
pyrad_index_output_path = dataset_dir / "pyradiomics_features" / f"pyrad_{DATASET_NAME}_index.csv"

In [None]:
# Load the index CSV (output of the imgtools autopipeline) into a dataframe
dataset_index = pd.read_csv(filepath_or_buffer=data_index_file)

In [None]:
# List of image IDs to keep - which masks to use with the image (CT)
keep_image_ids = ['020_00', '030_00', '040_00', '050_00', 'acryllic_00', 'cork_00', 'dcork_00', 'resin_00', 'rubber_00', 'wood_00', 'CT']

# Select out just the rows from the dataset index with the selected IDs
select_dataset_index = dataset_index[dataset_index['ImageID'].isin(keep_image_ids)]

# Prefix prepended to every file path taken from the index
# NOTE(review): presumably resolved relative to the folder the pyradiomics index CSV is saved in - confirm
relative_path_prefix = Path("..", MIT_INDEX_DIR)

# Setting up the pyradiomics input file
# Each entry is one [sample ID, image path, mask path] record
pyrad_dataset_list = []
# Sorted unique sample IDs, so the output row order is deterministic
unique_sample_ids = np.sort(select_dataset_index['SampleID'].unique())

for sample_id in unique_sample_ids:
    # Get all the index rows associated with this sample
    sample_rows = select_dataset_index[select_dataset_index['SampleID'] == sample_id]

    # Paths of the CT image(s) for this sample, as plain values
    ct_paths = sample_rows.loc[sample_rows['Modality'] == 'CT', 'filepath'].to_list()
    if not ct_paths:
        # Fail with a clear message instead of an opaque IndexError
        raise ValueError(f"No CT image row found in the dataset index for sample {sample_id!r}")
    # This currently assumes there's only one CT per sample: the first one is used
    image_path = relative_path_prefix / ct_paths[0]

    # Pair the image with every mask of this sample; the mask path is read from the
    # 'filepath' column by name so it does not depend on the column order of the index
    mask_paths = sample_rows.loc[sample_rows['Modality'] == 'RTSTRUCT', 'filepath']
    for mask_path in mask_paths:
        pyrad_dataset_list.append([sample_id, image_path, relative_path_prefix / mask_path])

# Convert the list of image + mask pair lists to a dataframe for saving
pyrad_dataset_df = pd.DataFrame(data=pyrad_dataset_list, columns=['ID', 'Image', 'Mask'])

In [None]:
# Save out the dataframe as a csv for pyradiomics to use for feature extraction
# The path is built from the configuration constants instead of being hard-coded
pyrad_index_save_path = Path(DATA_DIR, f"{DATA_SOURCE}_{DATASET_NAME}", "pyradiomics_features", f"pyrad_{DATASET_NAME}_index.csv")
# to_csv does not create missing directories, so make sure the output folder exists first
pyrad_index_save_path.parent.mkdir(parents=True, exist_ok=True)
pyrad_dataset_df.to_csv(pyrad_index_save_path, index=False)