In [1]:
import os
import pandas as pd
from pathlib import Path
import glob

import numpy as np
import pingouin as pg

from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler

from mirp import extract_features
from mirp.settings.perturbation_parameters import ImagePerturbationSettingsClass
from sklearn.mixture import GaussianMixture

Intraclass correlation coefficient (ICC) analyses (GTR level)

In [None]:
# Load the feature table holding the segmentations used for the inter-reader ICC.
df = pd.read_csv('/path/to/ICC_data')

# An ID containing 'ICC_1' is attributed to reader 1; every other ID to reader 2.
df['reader'] = ['r1' if 'ICC_1' in sample_id else 'r2' for sample_id in df['ID']]

# Only columns whose name starts with one of these prefixes are analysed.
prefixes_to_keep = [
    'original_shape_',
    'original_firstorder_',
    'original_glcm_',
    'original_glrlm_',
    'original_glszm_',
    'original_gldm_',
    'original_ngtdm_',
]
feature_columns = [name for name in df.columns if name.startswith(tuple(prefixes_to_keep))]
columns_to_keep = ['CaseID', 'reader'] + feature_columns

ICC_inter = df[columns_to_keep].astype({'reader': str})

# One ICC table per feature; the first two columns are CaseID and reader.
icc_results = []
for column in ICC_inter.columns[2:]:
    # Pivot to one row per case and one column per reader, then melt back to long
    # form so every case carries an entry (possibly NaN) for both readers.
    wide = (
        ICC_inter[['CaseID', 'reader', column]]
        .pivot(index='CaseID', columns='reader', values=column)
        .reset_index()
    )
    wide.columns.name = None

    long = pd.melt(wide, id_vars='CaseID', value_vars=['r1', 'r2'], var_name='reader', value_name=column)
    icc = (
        pg.intraclass_corr(data=long, targets='CaseID', raters='reader', ratings=column, nan_policy='omit')
        .round(3)
        .assign(feature=column)
    )
    icc_results.append(icc)

icc_results_df = pd.concat(icc_results, ignore_index=True)
icc_results_df.to_csv('/path/to/ICC_result.csv')

print('ICC calculation completed!')

In [None]:
# Features whose ICC3 estimate is at or below 0.75 are flagged for removal.
is_icc3 = icc_results_df['Type'] == 'ICC3'
is_below_threshold = icc_results_df['ICC'] <= 0.75
icc_filtered_df = icc_results_df.loc[is_icc3 & is_below_threshold]

icc_features_remove = icc_filtered_df['feature'].unique()
print(f'Removed features number: {len(icc_features_remove)}')
print(icc_features_remove)

Perturbation (GTR & ITH Level)

In [None]:
def assign_label(row):
    """Map one row of perturbation settings to a pseudo-reader label.

    Returns:
        'r1'    noise level 0, no noise iteration id (NaN/None), and rotation
                and x/y/z translations all equal to 0.
        'r2'    noise iteration id 0 with rotation and translations all 0.5.
        'r3'    noise iteration id 1 with rotation and translations all 0.5.
        'other' any other combination.
    """
    shift_keys = (
        'image_rotation_angle',
        'image_translation_x',
        'image_translation_y',
        'image_translation_z',
    )

    def shifts_all_equal(value):
        # Evaluated lazily, so it short-circuits on the first mismatch.
        return all(row[key] == value for key in shift_keys)

    noise_level = row['image_noise_level']
    noise_id = row['image_noise_iteration_id']

    if noise_level == 0 and pd.isna(noise_id) and shifts_all_equal(0):
        return 'r1'

    for iteration_id, label in ((0, 'r2'), (1, 'r3')):
        if noise_id == iteration_id and shifts_all_equal(0.5):
            return label

    return 'other'

In [None]:
# MIRP configuration file passed to the extraction call below.
settings = '/path/to/config_settings.xml'

# NOTE(review): the discretisation method is 'fixed_bin_number' while a bin *width*
# (16.0) is supplied. Confirm against the MIRP documentation whether
# `base_discretisation_n_bins` or the method 'fixed_bin_size' was intended, and how
# these keyword arguments interact with the XML settings file.
extraction_options = dict(
    image='/path/to/Perturbation_image',
    mask='/path/to/Perturbation_label',
    base_discretisation_method='fixed_bin_number',
    image_modality='CT',
    base_discretisation_bin_width=16.0,
    settings=settings,
)
feature_data = extract_features(**extraction_options)

# Stack the extracted tables into a single frame with a fresh index.
Perturbation_data = pd.concat(feature_data, ignore_index=True)

In [None]:
# Tag every extracted row with its pseudo-reader label (r1 / r2 / r3 / other).
Perturbation_data['reader'] = Perturbation_data.apply(assign_label, axis=1)

# Only columns whose name starts with one of these prefixes are analysed.
prefixes_to_keep = [
    'stat_', 'ivh_', 'morph_', 'ih_', 'cm_',
    'rlm_', 'szm_', 'dzm_', 'ngt_', 'ngl_',
]
feature_columns = [
    name for name in Perturbation_data.columns
    if name.startswith(tuple(prefixes_to_keep))
]
columns_to_keep = ['sample_name', 'reader'] + feature_columns

# Independent copy with both identifier columns forced to string.
ICC_Pert = Perturbation_data[columns_to_keep].astype({'sample_name': str, 'reader': str})

icc_Pert_results = []

In [None]:
# Start from an empty list so that re-running this cell does not append a second
# copy of every result to a list created in an earlier cell.
icc_Pert_results = []

# Only these labels take part in the ICC. Rows that assign_label tagged 'other'
# are removed *before* pivoting: they were discarded by the melt below anyway, but
# more than one 'other' row per sample makes DataFrame.pivot raise ValueError.
perturbation_readers = ['r1', 'r2', 'r3']
ICC_Pert_labelled = ICC_Pert[ICC_Pert['reader'].isin(perturbation_readers)]

# The first two columns are sample_name and reader; the rest are features.
for column in ICC_Pert.columns[2:]:

    # One row per sample, one column per pseudo-reader.
    icc_data = ICC_Pert_labelled[['sample_name', 'reader', column]]
    icc_data = icc_data.pivot(index='sample_name', columns='reader', values=column).reset_index()
    icc_data.columns.name = None

    # Back to long form so every sample has an entry (possibly NaN) per reader.
    icc_data_melted = icc_data.melt(
        id_vars='sample_name',
        value_vars=perturbation_readers,
        var_name='reader',
        value_name=column,
    )

    icc = pg.intraclass_corr(data=icc_data_melted, targets='sample_name', raters='reader', ratings=column, nan_policy='omit').round(3)
    icc['feature'] = column
    icc_Pert_results.append(icc)

per_results_df = pd.concat(icc_Pert_results, ignore_index=True)
per_results_df.to_csv('/path/to/perturbation_result_GTR.csv')
print('Complete!')

# NOTE(review): this filter uses 'ICC3k' whereas the inter-reader cell filters on
# 'ICC3' - confirm the difference is intentional.
per_filtered_df = per_results_df[(per_results_df['Type'] == 'ICC3k') & (per_results_df['ICC'] <= 0.75)]
pert_features_remove = per_filtered_df['feature'].unique()
len(pert_features_remove)
# These features need to align with corresponding pyradiomics features.

In [None]:
# Sorted, de-duplicated union of the features rejected by either ICC analysis.
removed_features = np.union1d(pert_features_remove, icc_features_remove)
Combined_removed_features = pd.DataFrame({'removed_Feature': removed_features})
Combined_removed_features.to_excel('/path/to/Removed_features_GTR', index=False)

Feature Dimensionality Reduction – GTR Level

In [None]:
def standardize_features(df_stand, id_columns):
    """Standardise every non-identifier column with a freshly fitted StandardScaler.

    Args:
        df_stand: DataFrame holding the identifier column(s) plus the columns to
            scale. It is not modified; the work is done on an index-reset copy.
        id_columns: a single column name or an iterable of column names that are
            carried through unscaled.

    Returns:
        (result_df, scaler): ``result_df`` has a fresh 0..n-1 index, the
        identifier columns first and the scaled columns after them, in their
        original order; ``scaler`` is the fitted StandardScaler, so the same
        transform can be applied to other data.
    """
    if isinstance(id_columns, str):
        id_columns = [id_columns]
    else:
        id_columns = list(id_columns)

    # Work on a re-indexed copy instead of resetting the caller's index in place,
    # so calling this function has no side effect on its argument.
    df_reset = df_stand.reset_index(drop=True)
    features_to_scale = df_reset.drop(columns=id_columns)

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features_to_scale)

    # fit_transform returns a plain array; restore the column labels. The default
    # index matches df_reset, so the concat below aligns row by row.
    scaled_features_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)

    id_columns_df = df_reset[id_columns]

    result_df = pd.concat([id_columns_df, scaled_features_df], axis=1)

    return result_df, scaler

In [None]:
# Load the training-cohort feature table. The original assigned the bare path
# string, so the .drop() call below failed with AttributeError.
# NOTE(review): read_excel is used for consistency with the other cohort tables
# in this notebook - switch to pd.read_csv if the file is a CSV.
training_pyradiomics = pd.read_excel('/path/to/training_data')

# Features rejected by the ICC / perturbation analyses (written by an earlier cell).
combined_removed_features_GTR = pd.read_excel('/path/to/Removed_features_GTR')
prefixes_features_GTR_remove = combined_removed_features_GTR['removed_Feature'].tolist()

# drop() is left strict on purpose: a KeyError here means the removal list does
# not match the column names of the training table.
train_set = training_pyradiomics.drop(columns=prefixes_features_GTR_remove).reset_index(drop=True)

Pearson correlation analyses

In [None]:
PCC_inter_standard, training_scaler = standardize_features(train_set, ['ID'])

# Pairwise Pearson correlation between features (identifier column excluded).
corr_matrix = PCC_inter_standard.drop(['ID'], axis=1).corr()

# Row/column positions, within corr_matrix, of every pair correlated above 0.75.
# NOTE(review): only positive correlations are considered; strongly negative
# pairs (r < -0.75) are kept. Use corr_matrix.abs() if that is not intended.
high_corr_var = np.where(corr_matrix > 0.75)

# Greedy pass: for each correlated pair whose first member is still kept,
# mark the second member for removal. The diagonal is skipped.
to_remove = set()

for var in zip(*high_corr_var):
    if var[0] != var[1] and var[0] not in to_remove:
        to_remove.add(var[1])

# The positions in to_remove refer to corr_matrix.columns, which has no 'ID'
# column. Indexing PCC_inter_standard.columns with them (as before) was shifted
# by one and dropped the wrong features, so resolve names via corr_matrix first.
# sorted() keeps the removal order deterministic.
pcc_features_remove = corr_matrix.columns[sorted(to_remove)]
df_filtered = PCC_inter_standard.drop(columns=pcc_features_remove)
len(pcc_features_remove)

Independent-samples t-test

In [None]:
# Load the clinical table. The original assigned the bare path string, so the
# column selection below failed.
# NOTE(review): read_excel is used for consistency with the other tables in this
# notebook - switch to pd.read_csv if the file is a CSV.
df_clinical = pd.read_excel('/path/to/clinical_data')

# Attach the response label to the standardised training features.
df_Ttest = pd.merge(PCC_inter_standard, df_clinical[['ID', 'ORR_RECIST1.1']], on='ID', how='left')

# Column layout: ID, response label, then the features. The original removed 'ID'
# from the list without putting it back, which dropped the identifier and left a
# feature in front of the label; the t-test cell (columns[2:]) and the later
# scaling cell (which selects 'ID') both expect this layout.
feature_columns = [col for col in df_Ttest.columns if col not in ('ID', 'ORR_RECIST1.1')]
columns = ['ID', 'ORR_RECIST1.1'] + feature_columns
df_Ttest = df_Ttest[columns]

# The ICC/perturbation-rejected features were already dropped when train_set was
# built, so only drop what is still present instead of raising KeyError.
columns_to_remove = list(prefixes_features_GTR_remove) + list(pcc_features_remove)
df_Ttest = df_Ttest.drop(columns=[col for col in columns_to_remove if col in df_Ttest.columns])

In [None]:
# Split the table by response label.
response = df_Ttest['ORR_RECIST1.1']
group_0 = df_Ttest.loc[response == 0]
group_1 = df_Ttest.loc[response == 1]

# Everything after the first two columns is treated as a feature.
features = df_Ttest.columns[2:]

# Two-sample t-test per feature; keep those with p < 0.05.
p_values = [ttest_ind(group_0[feature], group_1[feature]).pvalue for feature in features]
significant_features = [
    feature for feature, p_value in zip(features, p_values) if p_value < 0.05
]

print(len(significant_features), 'Significant features:', significant_features)
p_values_df = pd.DataFrame({'Feature': features, 'P-Value': p_values})
print(p_values_df)

In [None]:
# Persist the names of the features that passed the t-test.
final_features_GTR = pd.DataFrame({'Feature': list(significant_features)})
final_features_GTR.to_excel('/path/to/final_features_GTR_list', index=False)

In [None]:
def _scale_with_training_scaler(feature_df, scaler, id_column='ID'):
    """Apply an already-fitted scaler to every column of feature_df except id_column.

    Returns a new frame with a fresh 0..n-1 index: the identifier column first,
    then the transformed columns under their original names.
    """
    feature_only = feature_df.drop(columns=[id_column])
    scaled = pd.DataFrame(scaler.transform(feature_only), columns=feature_only.columns)
    # Reset the identifier's index so the column-wise concat aligns row by row
    # even when feature_df does not carry a default index.
    return pd.concat([feature_df[id_column].reset_index(drop=True), scaled], axis=1)


# # Training set
# Fit the scaler on the RAW training features (train_set). The original fitted it
# on df_Ttest, whose values were already standardised, so the scaler learned
# mean ~0 / sd ~1 and left the other cohorts effectively unscaled.
selected_features = set(final_features_GTR['Feature'])
columns_to_keep = ['ID'] + [col for col in train_set.columns if col in selected_features]
training_pyradiomics_cluster_test = train_set[columns_to_keep].copy()
scaled_train_set, training_scaler = standardize_features(training_pyradiomics_cluster_test, 'ID')

# # Internal validation set
# The original assigned the bare path string here; the table was never read.
# NOTE(review): read_excel mirrors the other cohort loads - switch to pd.read_csv
# if the file is a CSV.
invalset_pyradiomics = pd.read_excel('/path/to/invalset_data')
finalfeature_inval_set = invalset_pyradiomics[columns_to_keep].copy()
scaled_inval_set = _scale_with_training_scaler(finalfeature_inval_set, training_scaler)

# # External test set
test_set_pyradiomics = pd.read_excel('/path/to/testset_data')
test_set_temp_for_srandardized = test_set_pyradiomics[columns_to_keep].copy()
scaled_test_set = _scale_with_training_scaler(test_set_temp_for_srandardized, training_scaler)

# # TCIA--TCGA
TCIA_TCGA_pyradiomics = pd.read_excel('/path/to/TCGAset_data')
TCIA_TCGA_temp_for_srandardized = TCIA_TCGA_pyradiomics[columns_to_keep].copy()
scaled_TCIA_TCGA = _scale_with_training_scaler(TCIA_TCGA_temp_for_srandardized, training_scaler)