 TODO: 
 
 - Step 1: Download data from ORCESTRA
 - Step 2: MultiAssayExperiment + BumpyMatrix deconstructor

In [2]:
from helpers import *
# Explicit imports follow the star import so these names always bind to the real modules,
# even if `helpers` happens to export something with the same name.
import os

import numpy as np
import yaml


# Step X: Set up the configuration for this dataset

In [1]:
# Set dataset name
# This value is interpolated into the config file path and the rawdata paths used in
# this notebook, so it must match those file and directory names exactly.
# Should be in format "RADCURE" or "Head-Neck-Radiomics-HN1"
DATASET_NAME = "Head-Neck-Radiomics-HN1"
# Alternative dataset: uncomment the line below (and comment out the one above) to switch
# DATASET_NAME = "RADCURE"

#### Load configuration settings

In [3]:
# Path to this dataset's YAML configuration, relative to the notebook's working directory
config_file = f"../../config/{DATASET_NAME}.yaml"

# Parse the settings inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open for the kernel's lifetime.
with open(config_file) as config_stream:
    config = yaml.safe_load(config_stream)

# Step X: Load clinical data

In [4]:
# Location of the clinical data CSV for this dataset, relative to the notebook's working directory
clinical_data_path = f"../../../rawdata/{DATASET_NAME}/clinical/{DATASET_NAME}.csv"

# Load clinical data into a pandas dataframe
# NOTE(review): loadFileToDataFrame is not defined in this notebook; presumably it comes from `helpers` — confirm
clinical_data = loadFileToDataFrame(clinical_data_path)
# Reports the number of rows; this assumes one row per patient — TODO confirm
print(f"Clinical data loaded with {len(clinical_data)} patients.")

Clinical data loaded with 137 patients.


# Step X: Clean clinical data
- Remove any patients matching the exclusion variables specified in the config file
- Set up the outcome variable columns, making sure the event column is binary (1 = event, 0 = non-event)

In [5]:
# Get exclusion variable dictionary from config file.
# .get() is used so that a config without an "exclusion_variables" entry is handled the same
# way as an empty entry, instead of raising KeyError before the "not found" message can print.
exclusion_clinical_variables = config.get("exclusion_variables")

if exclusion_clinical_variables:
    print("Will exclude clinical variables:", exclusion_clinical_variables.keys())
    # Drop rows with values in the exclusion variables
    # NOTE(review): subsetDataframe is not defined in this notebook; presumably it comes from `helpers` — confirm
    clinical_data = subsetDataframe(clinical_data, excludeDict=exclusion_clinical_variables)
    print("Clinical data updated, now has", len(clinical_data), "patients.")
else:
    print("No exclusion variables found in config file.")

No exclusion variables found in config file.


### Outcome Variable setup
This data will be used for a Cox Proportional Hazards model, which expects time and event outcome variables. Time must be a continuous variable, and event must be a binary variable.
The event variable must be encoded so that 1 is the event (e.g. death) and 0 is the non-event (e.g. alive).
In this pipeline, the time to event is expected to be in years. If `convert_to_years` is set in the config file, the time column is treated as days and divided by 365 to convert it to years.

In [6]:
# Set up output variable columns for modelling: look up the outcome column names for this dataset
outcome_settings = config["outcome_variables"]
time_column_label = outcome_settings["time_label"]
event_column_label = outcome_settings["event_label"]

# def survivalTimeColumnSetup
# Guard: the time column has to be numeric before it can be copied or rescaled
if not np.issubdtype(clinical_data[time_column_label].dtype, np.number):
    raise ValueError(f"Time column {time_column_label} is not numeric. Please confirm time label in {DATASET_NAME}.yaml is the correct column or convert to numeric.")

print(f"Time column {time_column_label} is numeric. Making copy with standardized column name.")

# Pick the values for the standardized column: rescaled by 365 when the config asks for
# conversion, otherwise the time column unchanged
if outcome_settings["convert_to_years"]:
    print(f"Converting time column {time_column_label} from days to years.")
    survival_time_values = clinical_data[time_column_label] / 365
else:
    survival_time_values = clinical_data[time_column_label]

clinical_data["survival_time_in_years"] = survival_time_values


# def survivalEventColumnSetup
# Determine what type of value is in the event column from its first non-missing entry.
# .iloc is positional, so this works whatever the index labels are (label 0 may be absent
# once rows have been excluded), and skipping missing values keeps a leading NaN from
# making a string or boolean column look numeric.
event_column_observed = clinical_data[event_column_label].dropna()
if event_column_observed.empty:
    raise ValueError(f"Event column {event_column_label} has no non-missing values. Please confirm event label in {DATASET_NAME}.yaml is the correct column.")

event_variable_type = type(event_column_observed.iloc[0])
if np.issubdtype(event_variable_type, np.number):
    # A numeric column can only be copied as-is when it is already encoded as 0/1
    if not event_column_observed.isin([0, 1]).all():
        raise ValueError(f"Event column {event_column_label} is numeric but contains values other than 0 and 1. Please confirm event label in {DATASET_NAME}.yaml is the correct column or recode it so that 1 is the event and 0 is the non-event.")

    print(f"Event column {event_column_label} is binary. Making copy with standardized column name.")
    clinical_data["survival_event_binary"] = clinical_data[event_column_label]

elif np.issubdtype(event_variable_type, np.bool_):
    print(f"Event column {event_column_label} is boolean. Converting to binary.")
    clinical_data["survival_event_binary"] = clinical_data[event_column_label].astype(int)

elif np.issubdtype(event_variable_type, np.str_):
    print(f"Event column {event_column_label} is string. Checking what values are present in the column.")

    # Normalise case and surrounding whitespace so e.g. "Dead " and "dead" count as the same value
    event_column_normalized = clinical_data[event_column_label].str.strip().str.lower()
    event_column_values = event_column_normalized.unique()

    if len(event_column_values) != 2:
        raise ValueError(f"Event column {event_column_label} can only have two values. Please confirm event label in {DATASET_NAME}.yaml is the correct column or update to have only two values.")

    # Check if alive and dead are present in the event column
    if 'alive' in event_column_values and 'dead' in event_column_values:
        print(f"Converting to binary where 0 is alive and 1 is dead.")

        # Map straight to integers; the two checks above guarantee every value is one of the two keys
        clinical_data['survival_event_binary'] = event_column_normalized.map({'alive': 0, 'dead': 1}).astype(int)

    else:
        raise ValueError(f"Event column {event_column_label} doesn't contain any variation of 'alive' and 'dead'. Please confirm event label in {DATASET_NAME}.yaml is the correct column.")

else:
    # Fail loudly rather than continuing without a survival_event_binary column
    raise TypeError(f"Event column {event_column_label} has unsupported value type {event_variable_type.__name__}. Please confirm event label in {DATASET_NAME}.yaml is the correct column.")

Time column overall_survival_in_days is numeric. Making copy with standardized column name.
Converting time column overall_survival_in_days from days to years.
Event column event_overall_survival is binary. Making copy with standardized column name.


### Set patient ID as index

In [7]:
# Find which column of the clinical data holds the patient identifier
# NOTE(review): getPatientIdentifierLabel is not defined in this notebook; presumably it comes from `helpers` — confirm
clinical_patient_identifier = getPatientIdentifierLabel(clinical_data)

# Rebind to the re-indexed frame instead of using inplace=True: the original object is
# left untouched and the result of set_index can be assigned or chained like any other step.
clinical_data = clinical_data.set_index(clinical_patient_identifier)

# Step X: Get list of image types in each image feature folder
Will be looping over each of these for processing

In [8]:
# Directories whose file listings are used to discover the available image types for this
# dataset, relative to the notebook's working directory
# NOTE(review): presumably these hold READII radiomic feature outputs and FMCIB feature outputs respectively — confirm
readii_features_dir = f"../../../rawdata/{DATASET_NAME}/readii_outputs"
fmcib_features_dir = f"../../../rawdata/{DATASET_NAME}/fmcib_outputs"

In [9]:
def _listImageTypes(features_dir, file_prefix, file_suffix):
    """Return the sorted image types found in the feature file names of a directory.

    Only entries named ``<file_prefix><image type><file_suffix>`` are considered, so
    unrelated entries (hidden files, sub-directories, files for another dataset) are
    skipped instead of being reported as image types.

    Args:
        features_dir: Directory to list.
        file_prefix: Leading part of the file name to strip, e.g. 'radiomicfeatures_'.
        file_suffix: Trailing part of the file name to strip, e.g. '_<dataset>.csv'.

    Returns:
        A list of image type strings, sorted so the order does not depend on the
        arbitrary order in which os.listdir returns entries.
    """
    return sorted(
        file_name.removeprefix(file_prefix).removesuffix(file_suffix)
        for file_name in os.listdir(features_dir)
        if file_name.startswith(file_prefix) and file_name.endswith(file_suffix)
    )


# Every feature file name ends with the dataset name and the .csv extension
feature_file_suffix = "_" + DATASET_NAME + ".csv"

# Get list of image types in the radiomic and fmcib feature directories
radiomic_image_types = _listImageTypes(readii_features_dir, 'radiomicfeatures_', feature_file_suffix)

fmcib_image_types = _listImageTypes(fmcib_features_dir, 'fmcibfeatures_', feature_file_suffix)

# For each image type:

# Scratch code

In [None]:
# Scratch check: show the kernel's current working directory, which every relative path
# in this notebook is resolved against.
# NOTE(review): `os` is imported here, in the last cell, although os.listdir is already called in an earlier cell
import os
os.getcwd()