In [5]:
import pandas as pd
from pathlib import Path
import itertools
from ast import literal_eval
import re
import yaml

In [155]:
# Datasets for which every ROI name is treated as a tumor ROI,
# i.e. the tumor-name regex filter is bypassed for them.
use_all_rois = [
    "NSCLC Radiogenomics",
    "CC-Tumor-Heterogeneity",
    "QIN LUNG CT",
    "RIDER Pilot",
]

In [163]:
rawdata_path = Path("../rawdata/roi_scrape_results/Combined")

# Matching string for tumor ROIs
TUMOR_REGEX = re.compile('(tumou?r|gtv|mask|mass|primary|lesion|nodule)', re.IGNORECASE)

# Matching string for the seed ROIs found in CPTAC datasets.
# Defined at cell level (not inside the loop) so it always exists once this cell has run.
SEED_REGEX = re.compile('- *S| see', re.IGNORECASE)

MODALITIES = ['RTSTRUCT', 'SEG']

# Column holding the ROI names used for the analysis, per modality
ROI_NAME_COLUMNS = {'RTSTRUCT': 'ExtractableROINames', 'SEG': 'OriginalROINames'}

# Columns that are stored in the CSV files as stringified Python lists
LIST_COLUMNS = ["OriginalROINames", "ExtractableROINames"]


def split_summary_stem(stem, modalities=MODALITIES):
    """Split a summary file stem into ``(dataset_name, modality)``.

    The first entry of ``modalities`` that occurs in ``stem`` is taken as the
    modality, and the ``_{modality}_summary`` suffix is stripped to obtain the
    dataset name.

    Returns ``None`` when no modality occurs in ``stem`` so the caller can skip
    the file instead of silently reusing values from a previous file.
    """
    for modality in modalities:
        if modality in stem:
            return stem.removesuffix(f'_{modality}_summary'), modality
    return None


def select_tumor_roi_names(dataset_name, roi_names, use_all=()):
    """Return the subset of ``roi_names`` considered tumor ROIs for a dataset.

    * Dataset names containing "CPTAC": everything except names matching SEED_REGEX.
    * Dataset names listed in ``use_all``: every ROI name.
    * Otherwise: only names matching TUMOR_REGEX.
    """
    roi_names = set(roi_names)
    if "CPTAC" in dataset_name:
        # CPTAC datasets have a seed ROI that needs to be removed.
        # All other ROIs are tumors, named by location.
        return {name for name in roi_names if not SEED_REGEX.search(name)}
    if dataset_name in use_all:
        return roi_names
    return {name for name in roi_names if TUMOR_REGEX.search(name)}


def write_roi_names_yaml(path, dataset_name, roi_names):
    """Write ``{dataset_name: sorted(roi_names)}`` to ``path`` as block-style YAML."""
    with open(path, 'w') as output_file:
        yaml.dump({dataset_name: sorted(roi_names)}, output_file, default_flow_style=False)


tracker = []

for dataset_file in sorted(rawdata_path.glob('*.csv')):
    parsed_stem = split_summary_stem(dataset_file.stem)
    if parsed_stem is None:
        # Without this guard the previous iteration's dataset name and modality
        # would be reused and the file processed under the wrong dataset.
        print(f"Skipping {dataset_file.name}: no supported modality in file name")
        continue
    dataset_name, data_modality = parsed_stem
    roi_name_column = ROI_NAME_COLUMNS[data_modality]

    # Load in metadata for the dataset
    try:
        roi_data = pd.read_csv(dataset_file)
    except pd.errors.EmptyDataError:
        print(f"Empty data file found for {dataset_name}")
        continue

    # Get total number of files before filtering for tumor ROIs
    original_file_count = len(roi_data)

    # Set up output directory
    output_dir = Path(f"../procdata/ROI_info/{dataset_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Turn the list of strings into lists
    for col in LIST_COLUMNS:
        if col in roi_data.columns:
            roi_data[col] = roi_data[col].apply(literal_eval)

    # Get set of unique ROI names for the dataset based on modality
    unique_roi_names = set(itertools.chain.from_iterable(roi_data[roi_name_column]))

    tumor_roi_names = select_tumor_roi_names(dataset_name, unique_roi_names, use_all_rois)

    multi_lesion_counter = 0
    pats_w_tumors = 0
    if tumor_roi_names:
        # Number of tumor ROI names present in each row, computed once per row
        tumor_hits = roi_data[roi_name_column].apply(
            lambda names: len(tumor_roi_names.intersection(names))
        )

        # Rows with more than one tumor ROI name are counted as multi-lesion
        multi_lesion_counter = int((tumor_hits > 1).sum())

        # Keep only the rows that have at least one tumor ROI name
        # (boolean mask instead of dropping rows while iterating over the frame)
        roi_data = roi_data[tumor_hits > 0]

        # Save out the updated ROI dataframe with only the rows with tumor ROIs
        roi_data.to_csv(output_dir / f"{dataset_name}_{data_modality}_tumor_summary.csv", index=False)

        # Save out a yaml file with the set of all unique tumor ROI names found in the dataset
        write_roi_names_yaml(
            output_dir / f"{dataset_name}_{data_modality}_tumor_ROI_names.yaml",
            dataset_name,
            tumor_roi_names,
        )

        pats_w_tumors = len(roi_data)

    # Save out a yaml file with the set of all unique ROI names found in the dataset
    write_roi_names_yaml(
        output_dir / f"{dataset_name}_{data_modality}_all_ROI_names.yaml",
        dataset_name,
        unique_roi_names,
    )

    tracker.append([
        dataset_name,
        data_modality,
        original_file_count,
        len(unique_roi_names),
        len(tumor_roi_names),
        pats_w_tumors,
        multi_lesion_counter,
    ])


tracker_df = pd.DataFrame(
    tracker,
    columns=[
        'Dataset',
        'Modality',
        'Num of files',
        'Num ROIs',
        'Num tumor ROIs',
        'Num files w/ tumor ROIs',
        'Num multi-lesion files',
    ],
)
tracker_df

Unnamed: 0,Dataset,Modality,Num of files,Num ROIs,Num tumor ROIs,Num files w/ tumor ROIs,Num multi-lesion files
0,4D-Lung,RTSTRUCT,800,154,11,800,0
1,ACRIN-6698,SEG,2213,2,2,2213,0
2,Adrenal-ACC-Ki67-Seg,SEG,53,1,1,53,0
3,Advanced-MRI-Breast-Lesions,SEG,99,1,1,99,0
4,Breast-MRI-NACT-Pilot,SEG,756,2,1,378,0
5,C4KC-KiTS,SEG,210,2,1,210,0
6,CC-Radiomics-Phantom-3,RTSTRUCT,275,7,0,0,0
7,CC-Radiomics-Phantom,RTSTRUCT,34,176,0,0,0
8,CC-Tumor-Heterogeneity,RTSTRUCT,68,8,8,68,0
9,CPTAC-CCRCC,RTSTRUCT,636,54,41,590,0


In [162]:
# Write the per-dataset summary table to a single CSV file (row index omitted)
summary_csv_path = "../procdata/ROI_info/ROI_summary_all_datasets.csv"
tracker_df.to_csv(summary_csv_path, index=False)

# Code to print unique ROI names to manually search for tumor ROIs

In [154]:
dataset = "CC-Tumor-Heterogeneity"

dataset_file = sorted(rawdata_path.glob(f'{dataset}*_summary.csv'))[0]
roi_data = pd.read_csv(dataset_file)

modality = roi_data.Modality.unique()[0]
match modality:
    case "RTSTRUCT":
        roi_name_column = "ExtractableROINames"
    case "SEG":
        roi_name_column = "OriginalROINames"
    case _:
        raise ValueError(f"Unsupported modality: {modality}")

# Turn the list of strings into lists
list_cols = ["OriginalROINames", "ExtractableROINames"]
for col in list_cols:
    if col in roi_data.columns:
        roi_data[col] = roi_data[col].apply(literal_eval)

# Matching string for tumor ROIs
#tumor_regex = re.compile('(tumor|gtv|mask|mass)', re.IGNORECASE)

# Get set of unique ROI names for the dataset
unique_roi_names = set(itertools.chain(*roi_data[roi_name_column]))
print(unique_roi_names)
print(len(unique_roi_names))

if "CPTAC" in dataset:
    seed_roi_names = {match for match in unique_roi_names if re.search(SEED_REGEX, match)}
    print(seed_roi_names)
    print(len(seed_roi_names))

    tumor_roi_names = unique_roi_names - seed_roi_names
    
else:
    # Get just the tumor ROI names
    tumor_roi_names = {match for match in unique_roi_names if re.search(TUMOR_REGEX, match)}

print(tumor_roi_names)
print(len(tumor_roi_names))

{'Ut-MRT2-Sag-2', 'Ut-MRT2-Ax-3', 'Ut-MRT2-Sag-3-2', 'Ut-MRT2-Sag-1', 'Ut-MRT2-Sag-3', 'Ut-MRT2-Sag-5', 'Ut-MRT2-Sag-2-1', 'Ut-MRT2-Sag-3-1'}
8
set()
0


In [23]:
# Display the ROI summary table currently held in `roi_data`.
# NOTE(review): the saved output below lists ACRIN-6698 rows, while the preceding cell
# loads CC-Tumor-Heterogeneity - the output looks stale; re-run from a fresh kernel to confirm.
roi_data

Unnamed: 0,PatientID,StudyInstanceUID,SeriesInstanceUID,Modality,OriginalROINames,ReferencedSOPInstanceUIDs,ReferencedSeriesInstanceUID,ReferencedModality
0,ACRIN-6698-900810,1.3.6.1.4.1.14519.5.2.1.7695.4164.213756630771...,1.3.6.1.4.1.14519.5.2.1.7695.4164.196202689395...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.3233315094...,,
1,ACRIN-6698-890372,1.3.6.1.4.1.14519.5.2.1.7695.4164.287511750191...,1.3.6.1.4.1.14519.5.2.1.7695.4164.182441648132...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.1430346490...,,
2,ACRIN-6698-373346,1.3.6.1.4.1.14519.5.2.1.7695.4164.133907539099...,1.3.6.1.4.1.14519.5.2.1.7695.4164.328421782905...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.9980687911...,,
3,ACRIN-6698-870077,1.3.6.1.4.1.14519.5.2.1.7695.4164.323599920248...,1.3.6.1.4.1.14519.5.2.1.7695.4164.178637128214...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.2544212183...,,
4,ACRIN-6698-793197,1.3.6.1.4.1.14519.5.2.1.7695.4164.229353582366...,1.3.6.1.4.1.14519.5.2.1.7695.4164.311293249185...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.9122526245...,,
...,...,...,...,...,...,...,...,...
2208,ACRIN-6698-995480,1.3.6.1.4.1.14519.5.2.1.7695.4164.143590032635...,1.3.6.1.4.1.14519.5.2.1.7695.4164.238059688381...,SEG,[DWI Tumor Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.2290757043...,,
2209,ACRIN-6698-995480,1.3.6.1.4.1.14519.5.2.1.7695.4164.229986390322...,1.3.6.1.4.1.14519.5.2.1.7695.4164.228245020792...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.2481932323...,,
2210,ACRIN-6698-995480,1.3.6.1.4.1.14519.5.2.1.7695.4164.297233534323...,1.3.6.1.4.1.14519.5.2.1.7695.4164.184624988262...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.2248324241...,,
2211,ACRIN-6698-995480,1.3.6.1.4.1.14519.5.2.1.7695.4164.143590032635...,1.3.6.1.4.1.14519.5.2.1.7695.4164.295636270955...,SEG,[VOLSER Analysis Mask],['1.3.6.1.4.1.14519.5.2.1.7695.4164.2706391450...,,


# Multi-lesion summary

In [None]:
# TODO: multi-lesion summary (not implemented yet). Planned approach:
#   1. For each tumor dataset, get the unique patient IDs.
#   2. Get all rows for each patient ID.
#   3. Compare ROI names across those rows - if they don't match, multiple tumors have (possibly) been segmented.