In [1]:
import pandas as pd
from pathlib import Path
import itertools
from ast import literal_eval
import re
import yaml

In [9]:
# Directory holding one "<dataset>_RTSTRUCT_summary.csv" ROI scrape result per dataset
rawdata_path = Path("../rawdata/roi_scrape_results")

# Tumor ROI names start with "tumor"/"tumour" or "GTV" (case-insensitive).
# The previous pattern 'Tumor*|tumor*|GTV*' used glob-style stars; read as a regex
# it meant "Tumo" + any number of "r" / "GT" + any number of "V", so every name
# that merely began with "Tumo" or "GT" was flagged as a tumor ROI.
TUMOR_REGEX = re.compile(r'tumou?r|gtv', re.IGNORECASE)

# CPTAC ROI names containing one of these substrings are excluded as seed ROIs
SEED_REGEX = re.compile('- S| see', re.IGNORECASE)

# Columns that the CSV stores as stringified Python lists
list_cols = ["OriginalROINames", "ExtractableROINames", "ReferencedSOPInstanceUIDs"]


def select_tumor_roi_names(roi_names, dataset_name):
    """Return the subset of ``roi_names`` treated as tumor ROIs for ``dataset_name``.

    Parameters
    ----------
    roi_names : iterable of str
        All unique ROI names found in the dataset.
    dataset_name : str
        Name of the dataset the ROI names belong to.

    Returns
    -------
    set of str
        For datasets whose name contains "CPTAC": every ROI name except those
        in which ``SEED_REGEX`` matches anywhere. For all other datasets: only
        the names that begin with a ``TUMOR_REGEX`` match.
    """
    if "CPTAC" in dataset_name:
        return {name for name in roi_names if not SEED_REGEX.search(name)}
    return {name for name in roi_names if TUMOR_REGEX.match(name)}


for dataset_file in sorted(rawdata_path.glob('*.csv')):
    dataset_name = dataset_file.stem.removesuffix('_RTSTRUCT_summary')

    roi_data = pd.read_csv(dataset_file)

    # Turn the stringified lists back into real Python lists
    for col in list_cols:
        if col in roi_data.columns:
            roi_data[col] = roi_data[col].apply(literal_eval)

    # Get set of unique ROI names for the dataset
    unique_roi_names = set(itertools.chain.from_iterable(roi_data["ExtractableROINames"]))

    # Get just the tumor ROI names
    tumor_roi_names = select_tumor_roi_names(unique_roi_names, dataset_name)

    if tumor_roi_names:
        print(f"Found {len(tumor_roi_names)} tumor ROI names in {dataset_name}")

        # Number of distinct tumor ROI names carried by each sample (row)
        tumor_roi_counts = roi_data["ExtractableROINames"].apply(
            lambda names: len(tumor_roi_names.intersection(names))
        )

        # Samples with more than one tumor ROI name are reported as multi-lesion
        multi_lesion_counter = int((tumor_roi_counts > 1).sum())
        if multi_lesion_counter > 0:
            print(f"Multiple lesions found in {multi_lesion_counter} {dataset_name} samples")

        # Keep only the samples with at least one tumor ROI. A boolean mask is used
        # instead of dropping rows in place while iterating over the same frame.
        roi_data = roi_data[tumor_roi_counts > 0]

        # set up output directory
        output_dir = Path(f"../procdata/GTV_RTSTRUCT_samples/{dataset_name}")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save out the updated ROI dataframe with only the rows with tumor ROIs
        roi_data.to_csv(output_dir / f"{dataset_name}_RTSTRUCT_GTV_summary.csv", index=False)

        # Save out a yaml file with the set of all unique tumor ROI names found in the dataset
        with open(output_dir / f"{dataset_name}_tumor_ROI_names.yaml", 'w') as tumor_output_file:
            dataset_tumor_roi_names = {dataset_name: sorted(tumor_roi_names)}
            yaml.dump(dataset_tumor_roi_names, tumor_output_file, default_flow_style=False)

    else:
        print(f"No tumor ROIs found in {dataset_name}")
        # set up output directory for dataset with no tumor ROIs
        output_dir = Path(f"../procdata/non-GTV_RTSTRUCT_samples/{dataset_name}")
        output_dir.mkdir(parents=True, exist_ok=True)

    # Save out a yaml file with the set of all unique ROI names found in the dataset
    # (written for both tumor and non-tumor datasets, into whichever output_dir was chosen)
    with open(output_dir / f"{dataset_name}_all_ROI_names.yaml", 'w') as all_output_file:
        dataset_all_roi_names = {dataset_name: sorted(unique_roi_names)}
        yaml.dump(dataset_all_roi_names, all_output_file, default_flow_style=False)

    print("---")

Found 11 tumor ROI names in 4D-Lung
---
No tumor ROIs found in CC-Radiomics-Phantom-3
---
No tumor ROIs found in CC-Radiomics-Phantom
---
No tumor ROIs found in CC-Tumor-Heterogeneity
---
Found 41 tumor ROI names in CPTAC-CCRCC
---
Found 62 tumor ROI names in CPTAC-HNSCC
---
Found 43 tumor ROI names in CPTAC-PDA
---
Found 41 tumor ROI names in CPTAC-UCEC
---
Found 8 tumor ROI names in HEAD-NECK-RADIOMICS-HN1
Multiple lesions found in 81 HEAD-NECK-RADIOMICS-HN1 samples
---
Found 24 tumor ROI names in HNSCC-3DCT-RT
Multiple lesions found in 48 HNSCC-3DCT-RT samples
---
Found 99 tumor ROI names in HNSCC
Multiple lesions found in 390 HNSCC samples
---
Found 132 tumor ROI names in Head-Neck-PET-CT
Multiple lesions found in 514 Head-Neck-PET-CT samples
---
No tumor ROIs found in LCTSC
---
Found 20 tumor ROI names in NSCLC-Radiomics-Interobserver1
Multiple lesions found in 21 NSCLC-Radiomics-Interobserver1 samples
---
Found 49 tumor ROI names in NSCLC-Radiomics
Multiple lesions found in 246 N

# Code to print unique ROI names to manually search for tumor ROIs

In [None]:
# Dataset whose ROI names should be inspected by hand
dataset = "CPTAC-CCRCC"
roi_data = pd.read_csv(f"../rawdata/roi_scrape_results/{dataset}_RTSTRUCT_summary.csv")

# Turn the stringified lists back into real Python lists
list_cols = ["OriginalROINames", "ExtractableROINames", "ReferencedSOPInstanceUIDs"]
for col in list_cols:
    if col in roi_data.columns:
        roi_data[col] = roi_data[col].apply(literal_eval)

# Tumor ROI names start with "tumor"/"tumour" or "GTV" (case-insensitive).
# The previous pattern 'Tumor*|tumor*|GTV*' used glob-style stars, which as a regex
# match any name beginning with "Tumo" or "GT", and it was never applied.
tumor_regex = re.compile(r'tumou?r|gtv', re.IGNORECASE)

# Get set of unique ROI names for the dataset
unique_roi_names = set(itertools.chain.from_iterable(roi_data["ExtractableROINames"]))
print(unique_roi_names)
print(len(unique_roi_names))

if "CPTAC" in dataset:
    # Same seed pattern as the main processing cell, so both cells agree on
    # which CPTAC ROI names are excluded as seeds
    SEED_REGEX = re.compile('- S| see', re.IGNORECASE)
    seed_roi_names = {name for name in unique_roi_names if SEED_REGEX.search(name)}
    print(seed_roi_names)
    print(len(seed_roi_names))

    # For CPTAC, every ROI name that is not a seed is a tumor candidate
    tumor_roi_names = unique_roi_names - seed_roi_names
else:
    # For other datasets, keep the names that begin with a tumor/GTV prefix
    tumor_roi_names = {name for name in unique_roi_names if tumor_regex.match(name)}

print(tumor_roi_names)