In [2]:
from sklearn.linear_model import Lasso 

import pandas as pd
from readii.io.loaders import loadImageDatasetConfig, loadFileToDataFrame
from readii.process.subset import getPatientIntersectionDataframes
from pathlib import Path
import numpy as np

In [3]:
# Read the dataset-level configuration for NSCLC-Radiomics
config = loadImageDatasetConfig("NSCLC-Radiomics", Path("../../config/datasets"))

# Dataset parameters pulled from the configuration
CLINICAL_DATA_FILE = config["CLINICAL_FILE"]
# Dataset identifier is "<DATA_SOURCE>_<DATASET_NAME>"; it names the per-dataset sub-directories below
DATASET_NAME = "_".join([config["DATA_SOURCE"], config["DATASET_NAME"]])

RANDOM_SEED = 10

# Directory layout: <data root>/<stage>/<dataset identifier>
DATA_DIR_PATH = Path("../../data")
RAW_DATA_PATH = DATA_DIR_PATH.joinpath("rawdata", DATASET_NAME)
PROC_DATA_PATH = DATA_DIR_PATH.joinpath("procdata", DATASET_NAME)
RESULTS_DATA_PATH = DATA_DIR_PATH.joinpath("results", DATASET_NAME)

# Name of the results sub-directory holding the PyRadiomics feature files read later
pyradiomics_settings = "pyradiomics_original_all_features"

In [4]:
# Read the clinical table from <raw data>/clinical/<CLINICAL_DATA_FILE>
clinical_data = loadFileToDataFrame(RAW_DATA_PATH.joinpath("clinical", CLINICAL_DATA_FILE))

In [7]:
# Load the Med-ImageTools index to use for mapping TCIA IDs to local file names
mit_index = loadFileToDataFrame((RAW_DATA_PATH / "images" / config["MIT_INDEX_FILE"]))

# Build a lookup Series: index = PatientID (TCIA ID), values = SampleID (local file name).
# The lookup is derived from a fresh two-column frame instead of re-indexing and
# de-duplicating mit_index['SampleID'] in place, so mit_index is left untouched and
# this cell gives the same result every time it is run.
# Duplicates are dropped on PatientID (the lookup key) because Series.map requires
# the mapping Series to have a unique index; the first SampleID seen per patient is kept.
id_map = (
    mit_index[["PatientID", "SampleID"]]
    .drop_duplicates(subset="PatientID")
    .set_index("PatientID")["SampleID"]
)

# Map the SampleIDs to the clinical data and use them as the index for intersection.
# PatientIDs that are absent from the index file map to NaN.
clinical_data = (
    clinical_data
    .assign(SampleID=clinical_data["PatientID"].map(id_map))
    .set_index("SampleID")
)

In [12]:
image_type = "full_original"

# Read the feature table for this image type, then key its rows by SampleID
# (the file's "ID" column is renamed to "SampleID" before becoming the index).
pyrad_features = (
    loadFileToDataFrame(RESULTS_DATA_PATH / pyradiomics_settings / f"{image_type}_features.csv")
    .rename(columns={"ID": "SampleID"})
    .set_index("SampleID")
)

In [14]:
# Preview the index labels (SampleIDs) that appear in both the clinical table and the feature table
clinical_data.index.intersection(pyrad_features.index)

Index(['NSCLC-Radiomics_001', 'NSCLC-Radiomics_002', 'NSCLC-Radiomics_003',
       'NSCLC-Radiomics_004', 'NSCLC-Radiomics_005', 'NSCLC-Radiomics_006',
       'NSCLC-Radiomics_007', 'NSCLC-Radiomics_008', 'NSCLC-Radiomics_009',
       'NSCLC-Radiomics_010',
       ...
       'NSCLC-Radiomics_413', 'NSCLC-Radiomics_414', 'NSCLC-Radiomics_415',
       'NSCLC-Radiomics_416', 'NSCLC-Radiomics_417', 'NSCLC-Radiomics_418',
       'NSCLC-Radiomics_419', 'NSCLC-Radiomics_420', 'NSCLC-Radiomics_421',
       'NSCLC-Radiomics_422'],
      dtype='object', name='SampleID', length=421)

In [15]:
# Restrict both tables to their shared patients.
# NOTE(review): the need_pat_index_* flags are presumably False because both frames
# are already indexed by SampleID — confirm against the readii documentation.
clinical_subset, pyrad_subset = getPatientIntersectionDataframes(
    clinical_data,
    pyrad_features,
    need_pat_index_A=False,
    need_pat_index_B=False,
)

In [16]:
def calc_LASSO(X_train, event_status_train, a_lasso, max_iter=1000):
    """
    Performs LASSO for feature selection.

    Fits a LASSO regression of the event status on the features and keeps
    every feature whose fitted coefficient is not exactly zero.

    Parameters
    -----------
    X_train: pd.DataFrame
        A dataframe containing only radiomic features; its column names are
        used as the feature names.
    event_status_train: pd.Series or array-like
        The event status of the patients, one value per row of X_train.
        Rows are matched by position, not by index label.
    a_lasso: float
        The regularization parameter alpha to be used when initializing the LASSO model
    max_iter: int, default 1000
        Maximum number of iterations passed to the LASSO solver. The default
        matches scikit-learn's own default; raise it if the fit emits a
        convergence warning.

    Returns
    ----------
    lasso_passed: np.ndarray
        Array of the feature names with non zero coefficients, in the same
        order as the columns of X_train.
    """
    lasso = Lasso(alpha=a_lasso, max_iter=max_iter)
    lasso.fit(X_train, event_status_train)

    # A feature passes when its coefficient was not shrunk to zero
    nonzero_mask = np.abs(lasso.coef_) > 0

    feature_names = np.array(X_train.columns.values.tolist())
    return feature_names[nonzero_mask]

In [18]:
from readii.process import getOnlyPyradiomicsFeatures

# Target: the 'deadstatus.event' column of the clinical subset
event_status_train = clinical_subset['deadstatus.event']
# Predictors: the feature table reduced by getOnlyPyradiomicsFeatures
# (presumably strips non-feature columns — confirm against the readii documentation)
feats_only = getOnlyPyradiomicsFeatures(pyrad_subset)

# Run LASSO selection with alpha = 0.5; the selected feature names are the cell output
calc_LASSO(X_train=feats_only, event_status_train=event_status_train, a_lasso=0.5)

  model = cd_fast.enet_coordinate_descent(


array(['original_shape_VoxelVolume', 'original_shape_SurfaceArea',
       'original_firstorder_Energy',
       'original_firstorder_InterquartileRange',
       'original_firstorder_Maximum', 'original_firstorder_Median',
       'original_firstorder_Minimum', 'original_firstorder_TotalEnergy',
       'original_firstorder_Variance', 'original_glcm_ClusterProminence',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_HighGrayLevelRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_RunLengthNonUniformity',
       'original_glszm_LargeAreaEmphasis',
       'original_glszm_LargeAreaHighGrayLevelEmphasis',
       'original_glszm_SizeZoneNonUniformity',
       'original_glszm_ZoneVariance',
       'original_gldm_DependenceNonUniformity',
       'original_gldm_GrayLevelNonUniformity',
       'original_gldm_LargeDependenceHighGrayLevelEmphasis',
       'original_ngtdm_Complexity'], dtype='<U50')