In [3]:
from pathlib import Path

import pandas as pd
import yaml

In [6]:
# Load in the configuration file
config_file_path = Path("../config/datasets/HEAD-NECK-RADIOMICS-HN1.yaml")
# Explicit encoding so the YAML is decoded the same way on every platform
with config_file_path.open("r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Dataset identifier in the form <DATA_SOURCE>_<DATASET_NAME>; used as the
# per-dataset subdirectory name in every data path below
DATASET_NAME = config["DATA_SOURCE"] + "_" + config["DATASET_NAME"]

# NOTE(review): not referenced by the cells shown here -- confirm it is still needed
RANDOM_SEED = 10

# general data directory path setup (each path is scoped to the dataset)
DATA_DIR_PATH = Path("../data")
RAW_DATA_PATH = DATA_DIR_PATH / "rawdata" / DATASET_NAME
PROC_DATA_PATH = DATA_DIR_PATH / "procdata" / DATASET_NAME
RESULTS_DATA_PATH = DATA_DIR_PATH / "results" / DATASET_NAME

In [7]:
# List of pyradiomic feature sets to combine
feature_sets_to_combine = ['pyradiomics_original_all_features', 'pyradiomics_wavelet_shape_glrlm_only']

# What to call the resulting combined feature set
combined_feature_set_name = "pyradiomics_original_plus_aerts"

# File name suffix shared by every per-image-type feature file
feature_file_suffix = "_features.csv"

# Get the image types from the feature sets being combined only. Searching the
# whole results directory would also pick up files from unrelated feature sets
# and from this cell's own output directory. Sorted so the processing order is
# the same on every run.
image_types = sorted(
    {
        feature_file.name.removesuffix(feature_file_suffix)
        for feature_set in feature_sets_to_combine
        for feature_file in (RESULTS_DATA_PATH / feature_set).glob(f"*{feature_file_suffix}")
    }
)

for image_type in image_types:

    # initialize variable for the combined dataframe
    combined_features = None

    # loop through the feature sets to combine
    for feature_set in feature_sets_to_combine:
        # Load the features
        feature_set_path = RESULTS_DATA_PATH / feature_set / f"{image_type}{feature_file_suffix}"
        features = pd.read_csv(feature_set_path)

        # if this is the first feature set, set as the combined features dataframe
        if combined_features is None:
            combined_features = features
            continue

        # Check that patient count is the same
        if len(combined_features) != len(features):
            raise ValueError(
                f"Patient count mismatch for image type '{image_type}': "
                f"{len(combined_features)} rows combined so far vs "
                f"{len(features)} rows in feature set '{feature_set}'"
            )

        # Only bring over columns that are not already present, plus the merge key
        additional_features = ['ID'] + list(features.columns.difference(combined_features.columns))

        # Merge the features
        expected_row_count = len(combined_features)
        combined_features = pd.merge(combined_features, features[additional_features], how="inner", on="ID")

        # Equal row counts do not guarantee equal IDs: an inner merge silently
        # drops unmatched IDs and multiplies duplicated ones, so verify that
        # every row survived exactly once.
        if len(combined_features) != expected_row_count:
            raise ValueError(
                f"ID mismatch for image type '{image_type}': merging feature set "
                f"'{feature_set}' changed the row count from {expected_row_count} "
                f"to {len(combined_features)}"
            )

    # save out the combined features
    combined_features_path = RESULTS_DATA_PATH / combined_feature_set_name / f"{image_type}{feature_file_suffix}"
    combined_features_path.parent.mkdir(parents=True, exist_ok=True)
    combined_features.to_csv(combined_features_path, index=False)

In [None]:
# Preview the combined features left over from the loop above, i.e. only the
# last image type processed; show the first rows rather than the whole frame
combined_features.head()