In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from sklearn.linear_model import Lasso

from readii.io.loaders import loadImageDatasetConfig, loadFileToDataFrame
from readii.process import getOnlyPyradiomicsFeatures
from readii.process.label import addOutcomeLabels, eventOutcomeColumnSetup, timeOutcomeColumnSetup
from readii.process.subset import getPatientIntersectionDataframes

### Set up variables from config file for the dataset

In [2]:
# Read the configuration for this dataset from the shared config directory
config = loadImageDatasetConfig("NSCLC-Radiomics", Path("../../config/datasets"))

# Dataset parameters taken straight from the config
CLINICAL_DATA_FILE = config["CLINICAL_FILE"]
OUTCOME_VARIABLES = config["OUTCOME_VARIABLES"]

# Dataset identifier, built as "<DATA_SOURCE>_<DATASET_NAME>"
DATASET_NAME = "_".join((config["DATA_SOURCE"], config["DATASET_NAME"]))

RANDOM_SEED = 10

# Data directory layout: one folder per stage, each with a sub-folder for this dataset
DATA_DIR_PATH = Path("../../data")
RAW_DATA_PATH = DATA_DIR_PATH.joinpath("rawdata", DATASET_NAME)
PROC_DATA_PATH = DATA_DIR_PATH.joinpath("procdata", DATASET_NAME)
RESULTS_DATA_PATH = DATA_DIR_PATH.joinpath("results", DATASET_NAME)

# Name of the feature-extraction settings; used as a sub-directory of the results folder
pyradiomics_settings = "pyradiomics_original_plus_aerts"

# Intersect Clinical and Feature data to get patient subset for analysis

## Clinical data loading and processing

In [3]:
# Load clinical data file
raw_clinical_data = loadFileToDataFrame((RAW_DATA_PATH / "clinical" / CLINICAL_DATA_FILE))

# Load the Med-ImageTools index to use for mapping TCIA IDs to local file names
mit_index = loadFileToDataFrame((RAW_DATA_PATH / "images" / config["MIT_INDEX_FILE"]))

# Build a PatientID -> SampleID lookup.
#   SampleID is local file name
#   PatientID is TCIA ID
# The lookup is a new object built from the unique (PatientID, SampleID) pairs,
# so mit_index itself is never modified in place.
id_map = (
    mit_index[["PatientID", "SampleID"]]
    .drop_duplicates()
    .set_index("PatientID")["SampleID"]
)

# Series.map needs a uniquely indexed lookup; fail early with a readable message
# instead of an opaque reindexing error if a patient maps to several SampleIDs.
if not id_map.index.is_unique:
    ambiguous_patient_ids = id_map.index[id_map.index.duplicated()].unique().tolist()
    raise ValueError(
        f"PatientIDs map to more than one SampleID in the Med-ImageTools index: {ambiguous_patient_ids}"
    )

# Map the SampleIDs to the clinical data and use them as the index for intersection.
# Patients that are absent from the index get a missing (NaN) SampleID.
raw_clinical_data = raw_clinical_data.assign(
    SampleID=raw_clinical_data["PatientID"].map(id_map)
).set_index("SampleID")

## Feature data loading and processing

In [10]:
# Image type whose extracted features are loaded; it selects the features file name
image_type = "full_original"
feature_file_path = RESULTS_DATA_PATH / pyradiomics_settings / f"{image_type}_features.csv"

# Load the features, rename the "ID" column to "SampleID" and index the table by it
raw_feature_data = (
    loadFileToDataFrame(feature_file_path)
    .rename(columns={"ID": "SampleID"})
    .set_index("SampleID")
)

## Intersect clinical and feature data

In [11]:
# Both tables were indexed by SampleID when they were loaded, so the helper is told
# not to set up a patient index on either of them.
clinical_data, pyrad_subset = getPatientIntersectionDataframes(
    raw_clinical_data,
    raw_feature_data,
    need_pat_index_A=False,
    need_pat_index_B=False,
)

# Set up outcome columns in clinical data

In [6]:
# Event outcome: the source column is named in the dataset config; the standardized
# column is "survival_event_binary".
event_column = OUTCOME_VARIABLES["event_label"]
clinical_with_event = eventOutcomeColumnSetup(
    dataframe_with_outcome=clinical_data,
    outcome_column_label=event_column,
    standard_column_label="survival_event_binary",
)

# Time outcome: the source column and the years-conversion flag both come from the
# dataset config; the standardized column is "survival_time_years".
time_column = OUTCOME_VARIABLES["time_label"]
clinical_data = timeOutcomeColumnSetup(
    dataframe_with_outcome=clinical_with_event,
    outcome_column_label=time_column,
    standard_column_label="survival_time_years",
    convert_to_years=OUTCOME_VARIABLES["convert_to_years"],
)

# Feature Selection

## Existing signature

### Load signature from yaml file

In [14]:
# Path to the yaml file in the rawdata directory
radiomic_signature_yaml = RAW_DATA_PATH.parent / "cph_weights_radiomic_signature" / "aerts_original.yaml"

try:
    # Explicit encoding so the file is read the same way on every platform
    with open(radiomic_signature_yaml, 'r', encoding="utf-8") as f:
        yaml_data = yaml.safe_load(f)

    # Validate the parsed content before using it
    if not isinstance(yaml_data, dict):
        raise TypeError("Radiomic signature YAML must contain a dictionary")
    if 'signature' not in yaml_data:
        raise KeyError(f"Radiomic signature YAML is missing the 'signature' key: {radiomic_signature_yaml}")

    # The Series index is used later as the list of feature columns to select;
    # the values are presumably the signature weights (verify against the YAML file).
    radiomic_signature = pd.Series(yaml_data['signature'])
except Exception as e:
    # Print a short context message, then re-raise so the cell still fails visibly
    print(f"Error loading YAML file: {e}")
    raise

### Get just features in radiomic signature

In [15]:
# The signature's index holds the feature names; keep only those columns.
# NOTE(review): this selects from raw_feature_data rather than the intersected
# pyrad_subset, so rows are not restricted to patients with clinical data. Confirm this is intended.
signature_feature_names = radiomic_signature.index
signature_feature_data = raw_feature_data[signature_feature_names]

## LASSO feature selection

In [None]:
def calc_LASSO(X_train, event_status_train, a_lasso):
    """
    Select features by fitting a LASSO model and keeping the non-zero coefficients.

    Parameters
    -----------
    X_train: pd.DataFrame
        Feature table; its column names are the candidate feature names.
    event_status_train: pd.Series or pd.DataFrame
        Event status of each patient, used as the regression target.
    a_lasso: float
        Regularization strength (alpha) passed to the LASSO model.

    Returns
    ----------
    np.ndarray
        Names of the columns of X_train whose fitted coefficient has a
        non-zero absolute value, in column order.
    """
    # Fit the model; fit() returns the fitted estimator itself
    model = Lasso(alpha=a_lasso).fit(X_train, event_status_train)

    # A feature passes when the magnitude of its coefficient is above zero
    is_selected = np.abs(model.coef_) > 0

    feature_names = np.array(X_train.columns.values.tolist())
    return feature_names[is_selected]

In [None]:
# Regularization strength used for the LASSO feature selection below
LASSO_ALPHA = 0.5

# Reduce the intersected feature table with readii's pyradiomics-feature filter
feats_only = getOnlyPyradiomicsFeatures(pyrad_subset)

# Binary event column created during the outcome column setup
event_status_train = clinical_data['survival_event_binary']

# NOTE(review): calc_LASSO pairs rows by position, so feats_only and event_status_train
# must be in the same patient order. Presumably the intersection step guarantees this; confirm.
lasso_selected_features = calc_LASSO(feats_only, event_status_train, LASSO_ALPHA)

# Display the selected feature names as the cell output
lasso_selected_features

# Prediction Modelling

## With existing signature weights

feature_hazards = 