# Delta Radiomics

### **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import os
import yaml

### **Import Data**

In [None]:


with open("D:/DSLS/Omics2/integromics/config.yaml", "r") as f:
    config = yaml.safe_load(f)

print(config)


{'data': {'root_dir': 'D:/DSLS/Omics2/modelling/modelling_data', 'subjects': ['005', '006', '007', '008', '009', '010', '011', '013', '014', '015', '022', '026', '028', '031', '046', '047', '048', '050', '055', '061', '068', '070', '077', '095']}, 'clinical': {'root_dir': 'D:/DSLS/Omics2/modelling/clinical_data', 'main_file': '10162025_UMCG_wide_export_Yescarta_infused_for_tFL_study.xlsx'}, 'processing': {'load_images': True, 'load_masks': True, 'image_extension': '.nii.gz', 'mask_extension': '.nii.gz'}, 'output': {'save_features': True, 'output_dir': 'outputs/'}}


In [None]:

def calculate_delta_radiomics(data_folder_path):
    """
    Reads radiomics data from subfolders (Time A and Time B), filters for 'suv2.5' 
    segmentation, calculates the delta (B - A) for numeric features, and stores
    the results in a dictionary per patient.

    Args:
        data_folder_path (str): The path to the main folder containing patient subfolders.

    Returns:
        dict: A dictionary where keys are patient folder names (Patient IDs) 
              and values are dictionaries containing the calculated delta radiomics features.
    """
    all_delta_radiomics = {}
    A_radiomics, B_radiomics = {}, {}

    # 1. Iterate through all items in the main data folder
    for patient_folder_name in os.listdir(data_folder_path):
        patient_path = os.path.join(data_folder_path, patient_folder_name)
        
        # Ensure it is actually a directory (a patient folder)
        if os.path.isdir(patient_path):
            print(f"--- Processing {patient_folder_name} ---")
            
            # Initialize paths for Time A and Time B files
            file_A_path = None
            file_B_path = None
            
            # 2. Find the radiomics files for Time A and Time B in the patient folder
            for filename in os.listdir(patient_path):
                path_excel = os.path.join(patient_path, filename)
                # NOTE: Assuming the files are named consistently and contain 'A' or 'B' 
                # to identify the time point. Adjust this logic if needed.
  
                if '_A' in path_excel.upper() and path_excel.endswith('.xlsx'):
                        file_A_path = path_excel
                elif '_B' in path_excel.upper() and path_excel.endswith('.xlsx'):
                        file_B_path = path_excel
            if file_A_path and file_B_path:
                try:
                    # 3. Read and preprocess the data
                    
                    # Read Excel files and transpose them (assuming features are in columns 
                    # and metadata/values in rows; pandas reads the first row as header)
                    # We assume 'segmentation' is one of the columns after reading.
                    df_A = pd.read_excel(file_A_path)
                    df_B = pd.read_excel(file_B_path)
                    
                    # 4. Filter for the 'suv2.5' segmentation row
                    # NOTE: the column containing 'suv2.5' is named 'Segmentation'
                    # and the feature names are in the other columns.
                    # filtering the columns fro 23 onwards to get only feature values
                    row_A = df_A[df_A['Segmentation'].str.contains('suv2.5')].iloc[0, 23:]
                    row_B = df_B[df_B['Segmentation'].str.contains('suv2.5')].iloc[0, 23:]

                    # Create a Series of only the numeric feature values for A and B
                    
                    # Convert to numeric, coercing errors to NaN (just in case)
                    numeric_A = pd.to_numeric(row_A, errors='coerce')
                    numeric_B = pd.to_numeric(row_B, errors='coerce')

                    # 6. Calculate Delta Radiomics (Time B - Time A)
                    delta_radiomics = numeric_B - numeric_A
                    
                    
                    # Convert the resulting pandas Series into a standard Python dictionary
                    # and store it under the patient's ID
                    # dropna() to remove any features that resulted in NaN
                    all_delta_radiomics[patient_folder_name] = delta_radiomics.dropna().to_dict()
                    A_radiomics[patient_folder_name] = numeric_A.dropna().to_dict()
                    B_radiomics[patient_folder_name] = numeric_B.dropna().to_dict()
                    print(f"Successfully calculated radiomics and delta radiomics for {patient_folder_name}.")

                except Exception as e:
                    print(f"Error processing files for {patient_folder_name}: {e}")
            else:
                print(f"Could not find both A and B files in {patient_folder_name}.")
    A = pd.DataFrame.from_dict(A_radiomics, orient='index')
    B = pd.DataFrame.from_dict(B_radiomics, orient='index')
    all_delta_radiomics = pd.DataFrame.from_dict(all_delta_radiomics, orient='index')

    return all_delta_radiomics, A, B


**The ones kylie did, had 99 features, the others have 43. why?**

In [4]:
# import sys
# print(sys.executable)

In [5]:
# import sys
# !"{sys.executable}" -m pip uninstall -y openpyxl
# !"{sys.executable}" -m pip install openpyxl==3.1.5


In [6]:
# Define the path to main data folder

DATA_DIR = config["data"]["root_dir"]

# Run the function
delta_radiomics_results, a_radiomics, b_radiomics = calculate_delta_radiomics(DATA_DIR)

# Print the results for verification
print("\n--- Final Results Summary ---")
for patient, delta_data in delta_radiomics_results.items():
    print(f"\n{patient} Delta Radiomics ({len(delta_data)} features):")
    # Print the first 5 features as an example
    print(dict(list(delta_data.items())[:5]))

--- Processing 005 ---
Successfully calculated radiomics and delta radiomics for 005.
--- Processing 006 ---
Successfully calculated radiomics and delta radiomics for 006.
--- Processing 007 ---
Successfully calculated radiomics and delta radiomics for 007.
--- Processing 008 ---
Successfully calculated radiomics and delta radiomics for 008.
--- Processing 009 ---
Successfully calculated radiomics and delta radiomics for 009.
--- Processing 010 ---
Successfully calculated radiomics and delta radiomics for 010.
--- Processing 011 ---
Successfully calculated radiomics and delta radiomics for 011.
--- Processing 013 ---
Successfully calculated radiomics and delta radiomics for 013.
--- Processing 014 ---
Successfully calculated radiomics and delta radiomics for 014.
--- Processing 015 ---
Successfully calculated radiomics and delta radiomics for 015.
--- Processing 022 ---
Successfully calculated radiomics and delta radiomics for 022.
--- Processing 026 ---
Successfully calculated radiomi

In [7]:
delta_radiomics_results

Unnamed: 0,MeshVolume (cc),Volume (cc),Compactness1,Compactness2,Elongation,Flatness,LeastAxisLength,MajorAxisLength,Maximum2DDiameterColumn,Maximum2DDiameterRow,...,glrlm_RunVariance,glrlm_ShortRunEmphasis,glrlm_ShortRunHighGrayLevelEmphasis,glrlm_ShortRunLowGrayLevelEmphasis,TLG,Number of lesions,Dmax Patient (mm),Spread Patient (mm),Dmax Bulk (mm),Spread Bulk (mm)
5,179.678833,202.66198,0.000264,0.001121,-0.154568,-0.179412,2.448599,306.989805,-151.000382,230.851363,...,1.119328,-0.098716,-0.101934,-0.097912,1122.099512,13.0,184.614198,10352.503667,160.97647,5849.759928
6,-203.486928,-206.514567,0.004919,0.041521,0.002634,0.002381,0.127861,-11.301956,303.648427,176.881094,...,1.416009,0.032562,0.032562,0.032562,-1902.683739,1.0,547.16608,7671.66298,354.932145,1722.075793
7,-338.856476,-53.558539,-0.000507,-0.00037,0.071425,0.016399,20.010696,30.697467,-81.079674,-33.35876,...,-5.936345,0.076732,0.083133,0.07546,227.653881,3.0,123.012344,2255.654095,130.451805,983.137861
8,-132.17556,-134.14302,0.003246,0.029861,-0.082737,0.030653,3.284378,-55.262061,153.100326,473.795599,...,,,,,-1383.518197,0.0,396.153536,1392.367241,17.553961,-558.529696
9,134.036417,135.082881,-0.003934,-0.035294,0.278651,0.110604,78.870652,25.173805,285.955105,125.184095,...,-1.293857,0.00839,-0.004554,0.012367,623.006859,4.0,553.035805,4706.208333,26.947836,535.824989
10,-1740.405214,-1709.315684,0.000275,0.001942,0.041219,0.041415,-18.618091,-173.882011,-362.968918,-421.96246,...,-19.18255,-0.058513,-0.058513,-0.058513,-12856.820854,2.0,-74.319871,536.33788,-251.560087,181.872939
11,-403.43125,-411.208054,-0.008448,-0.057458,0.184636,0.143643,41.23991,-25.1433,-29.935314,-174.406684,...,-47.635928,0.12576,0.254256,0.103773,-1665.931074,20.0,76.004485,9073.532138,-218.245188,4925.288459
13,282.473562,285.526503,-0.015736,-0.255738,0.442808,-0.060073,26.527685,124.898852,64.273602,104.376945,...,-1.636574,-0.009532,-0.101302,0.013411,901.939401,7.0,340.650929,3034.678995,165.954339,1933.434785
14,-618.581947,-619.5321,-0.001067,-0.007992,0.153009,-0.070289,-34.663123,144.560198,68.070835,63.707567,...,,,,,-865.378541,3.0,53.796518,1726.580754,-80.063723,854.025032
15,49.867963,49.214353,0.016168,0.256642,0.266914,0.405557,-0.581837,-299.663486,-166.887134,-267.583392,...,,,,,321.512895,-3.0,-530.927813,-1122.497606,-530.927813,-1122.497606


In [8]:
# Clean and prepare dataframes
# by dropping columns with any NaN values and resetting index
# to keep only the complete cases (some patients have 99 columns with NaNs, but 43 are always present)
# we'll work with those 43.
for df in [delta_radiomics_results, a_radiomics, b_radiomics]:
    df.dropna(axis=1, how='any', inplace=True)
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'id'}, inplace=True)
    df['id'] = df['id'].astype(int)

In [9]:
# to differentiate the columns of A and B datasets
a_radiomics = a_radiomics.add_suffix('_a')

In [10]:
print(a_radiomics.head())


   id_a  MeshVolume (cc)_a  Volume (cc)_a  Compactness1_a  Compactness2_a  \
0     5         410.103787     398.813580        0.005837        0.012107   
1     6         382.203626     388.922067        0.009420        0.031529   
2     7        2318.279862    2316.219528        0.001282        0.000584   
3     8         657.198795     662.416920        0.011321        0.045535   
4     9         280.358030     284.356359        0.014593        0.075666   

   Elongation_a  Flatness_a  LeastAxisLength_a  MajorAxisLength_a  \
0      0.558487    0.355846         102.570709         288.244831   
1      0.222277    0.119058          75.010564         630.035602   
2      0.286171    0.198725         162.471469         817.568425   
3      0.573857    0.188817          94.938713         502.808344   
4      0.134513    0.093095          62.068863         666.725172   

   Maximum2DDiameterColumn_a  ...  SUV_StandardDeviation_a  SUV_TotalEnergy_a  \
0                 604.608311  ...        

In [11]:
b_radiomics = b_radiomics.add_suffix('_b')

In [12]:
print(b_radiomics.head())

   id_b  MeshVolume (cc)_b  Volume (cc)_b  Compactness1_b  Compactness2_b  \
0     5         589.782620     601.475560        0.006102        0.013228   
1     6         178.716698     182.407500        0.014339        0.073050   
2     7        1979.423386    2262.660989        0.000776        0.000214   
3     8         525.023235     528.273900        0.014567        0.075396   
4     9         414.394448     419.439240        0.010660        0.040372   

   Elongation_b  Flatness_b  LeastAxisLength_b  MajorAxisLength_b  \
0      0.403919    0.176433         105.019308         595.234636   
1      0.224911    0.121439          75.138425         618.733646   
2      0.357596    0.215124         182.482165         848.265892   
3      0.491120    0.219470          98.223091         447.546283   
4      0.413164    0.203700         140.939516         691.898977   

   Maximum2DDiameterColumn_b  ...  SUV_StandardDeviation_b  SUV_TotalEnergy_b  \
0                 453.607929  ...        

In [13]:
for patient, delta_data in delta_radiomics_results.items():
    if len(delta_data) == 99:
        print(patient)

In [14]:
filtered_results = {patient: data for patient, data in delta_radiomics_results.items() if len(data) != 99}

In [15]:
len(filtered_results)

44

In [16]:
for patient, delta_data in delta_radiomics_results.items():
        print(patient)

id
MeshVolume (cc)
Volume (cc)
Compactness1
Compactness2
Elongation
Flatness
LeastAxisLength
MajorAxisLength
Maximum2DDiameterColumn
Maximum2DDiameterRow
Maximum2DDiameterSlice
Maximum3DDiameter
MinorAxisLength
SphericalDisproportion
Sphericity
SurfaceArea
SurfaceVolumeRatio (cc)
SUV_10Percentile
SUV_90Percentile
SUV_Energy
SUV_Entropy
SUV_InterquartileRange
SUV_Kurtosis
SUV_Maximum
SUV_MeanAbsoluteDeviation
SUV_Mean
SUV_Median
SUV_Peak
SUV_Minimum
SUV_Range
SUV_RobustMeanAbsoluteDeviation
SUV_RootMeanSquared
SUV_Skewness
SUV_StandardDeviation
SUV_TotalEnergy
SUV_Uniformity
SUV_Variance
TLG
Number of lesions
Dmax Patient (mm)
Spread Patient (mm)
Dmax Bulk (mm)
Spread Bulk (mm)


Other than Kylies folders, we're missing these:  
**12: missing folder A**  
**19: missing results for A**  
**21: missing folder B**  
**64: missing results for B**  

Therefore we're left with only 14 complete delta radiomics feature sets.

# Clinical Data

In [17]:

clinical_dir = config["clinical"]["root_dir"]      # "D:/DSLS/Omics2/modelling/clinical_data"
clinical_file = config["clinical"]["main_file"]    # "10162025_UMCG_wide_export_Yescarta_infused_for_tFL_study.xlsx"

clinical_path = os.path.join(clinical_dir, clinical_file)
clinic_data = pd.read_excel(clinical_path)


In [18]:
clinic_data.head()

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_spec_2___ne.factor,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin
0,Record ID,Comments,Date 1st tumorboard meeting,Sex,,Age,Height,Weight,BMI (kg/m2),Diagnosis for which there is now a cellular th...,...,,Please specify all subsequent anti-cancer ther...,Date lab results,Hemoglobin in mmol/L,Thrombocytes in 10E9/L,Leukocytes in 10E9/L,Neutrophils in 10E9/L (automated differentiation),LDH in U/L,CRP in mg/L,Ferritin in µg/l
1,FTC-UMCG-0001,splenectomy 2012: total hip links 2015: jich...,2020-05-04,0,Male,68,180,72.6,22,1,...,Unchecked,,2020-04-28,7.1,90,6.3,4.74,169,26,NE
2,FTC-UMCG-0002,> 20 jaar geleden DVT links Longembolie links...,2020-05-07,0,Male,73,190,86,24,2,...,Unchecked,,2020-05-14,64,172,4.3,2.83,NE,47,2847
3,FTC-UMCG-0003,"2019 Nov Grootcellig B-Non-Hodgkin lymfoom,...",2020-05-18,0,Male,59,181,91,28,1,...,Unchecked,Radiotherapy CNS and Korfel 3x response evalua...,2020-05-15,7.4,389,11.9,NE,214,14,1404
4,FTC-UMCG-0004,2015 gehoorverlies 2019 aug: DLBCL ...,2020-05-14,1,Female,61,169,73,26,1,...,Unchecked,,2020-04-21,6.5,159,9.2,6.55,296,3.0,NE


In [19]:
clinic_data

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_spec_2___ne.factor,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin
0,Record ID,Comments,Date 1st tumorboard meeting,Sex,,Age,Height,Weight,BMI (kg/m2),Diagnosis for which there is now a cellular th...,...,,Please specify all subsequent anti-cancer ther...,Date lab results,Hemoglobin in mmol/L,Thrombocytes in 10E9/L,Leukocytes in 10E9/L,Neutrophils in 10E9/L (automated differentiation),LDH in U/L,CRP in mg/L,Ferritin in µg/l
1,FTC-UMCG-0001,splenectomy 2012: total hip links 2015: jich...,2020-05-04,0,Male,68,180,72.6,22,1,...,Unchecked,,2020-04-28,7.1,90,6.3,4.74,169,26,NE
2,FTC-UMCG-0002,> 20 jaar geleden DVT links Longembolie links...,2020-05-07,0,Male,73,190,86,24,2,...,Unchecked,,2020-05-14,64,172,4.3,2.83,NE,47,2847
3,FTC-UMCG-0003,"2019 Nov Grootcellig B-Non-Hodgkin lymfoom,...",2020-05-18,0,Male,59,181,91,28,1,...,Unchecked,Radiotherapy CNS and Korfel 3x response evalua...,2020-05-15,7.4,389,11.9,NE,214,14,1404
4,FTC-UMCG-0004,2015 gehoorverlies 2019 aug: DLBCL ...,2020-05-14,1,Female,61,169,73,26,1,...,Unchecked,,2020-04-21,6.5,159,9.2,6.55,296,3.0,NE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,FTC-UMCG-0088,Hematologische voorgeschiedenis 2013 bi-cy...,2023-09-28,0,Male,54,178,69.8,22,1,...,Unchecked,,2023-09-29,9.1,93,7.0,NE,369,7,1643
65,FTC-UMCG-0089,2013 dec: laaggradig B-NHL stadium IV met s...,2023-10-05,1,Female,70,160,58.7,23,2,...,Unchecked,Epcoritamab monotherapy - 48 mg per injection ...,2023-10-04,8.1,205,5.3,2.97,325,17,204
66,FTC-UMCG-0090,Relevante voorgeschiedenis: 2016 Stadium IV D...,2023-10-12,0,Male,70,170,73,25,1,...,Unchecked,2024-02 recidief diffuus grootcellig B-cel lym...,2023-10-12,9.5,327,6.6,5.06,991,78,669
67,FTC-UMCG-0096,Voorgeschiedenis: Tonsilectomie 2004 IBS ...,2022-11-22,0,Male,62,180,78,24,1,...,Unchecked,,2022-10-11,6.8,109,20.7,NE,475,15,1932


In [20]:
clinic_data.shape

(69, 275)

In [21]:
clinic_data['record_id'].values

array(['Record ID', 'FTC-UMCG-0001', 'FTC-UMCG-0002', 'FTC-UMCG-0003',
       'FTC-UMCG-0004', 'FTC-UMCG-0005', 'FTC-UMCG-0006', 'FTC-UMCG-0007',
       'FTC-UMCG-0008', 'FTC-UMCG-0009', 'FTC-UMCG-0010', 'FTC-UMCG-0011',
       'FTC-UMCG-0012', 'FTC-UMCG-0013', 'FTC-UMCG-0014', 'FTC-UMCG-0015',
       'FTC-UMCG-0016', 'FTC-UMCG-0017', 'FTC-UMCG-0018', 'FTC-UMCG-0019',
       'FTC-UMCG-0020', 'FTC-UMCG-0021', 'FTC-UMCG-0022', 'FTC-UMCG-0023',
       'FTC-UMCG-0024', 'FTC-UMCG-0025', 'FTC-UMCG-0026', 'FTC-UMCG-0027',
       'FTC-UMCG-0028', 'FTC-UMCG-0029', 'FTC-UMCG-0030', 'FTC-UMCG-0031',
       'FTC-UMCG-0046', 'FTC-UMCG-0047', 'FTC-UMCG-0048', 'FTC-UMCG-0049',
       'FTC-UMCG-0050', 'FTC-UMCG-0051', 'FTC-UMCG-0052', 'FTC-UMCG-0053',
       'FTC-UMCG-0054', 'FTC-UMCG-0055', 'FTC-UMCG-0060', 'FTC-UMCG-0061',
       'FTC-UMCG-0064', 'FTC-UMCG-0065', 'FTC-UMCG-0066', 'FTC-UMCG-0067',
       'FTC-UMCG-0068', 'FTC-UMCG-0069', 'FTC-UMCG-0070', 'FTC-UMCG-0075',
       'FTC-UMCG-0076', 'FTC-

In [22]:
clinic_data['id_cleaned'] = [value[-3:] for value in clinic_data['record_id'].values]

In [23]:
clinic_data.head()

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned
0,Record ID,Comments,Date 1st tumorboard meeting,Sex,,Age,Height,Weight,BMI (kg/m2),Diagnosis for which there is now a cellular th...,...,Please specify all subsequent anti-cancer ther...,Date lab results,Hemoglobin in mmol/L,Thrombocytes in 10E9/L,Leukocytes in 10E9/L,Neutrophils in 10E9/L (automated differentiation),LDH in U/L,CRP in mg/L,Ferritin in µg/l,ID
1,FTC-UMCG-0001,splenectomy 2012: total hip links 2015: jich...,2020-05-04,0,Male,68,180,72.6,22,1,...,,2020-04-28,7.1,90,6.3,4.74,169,26,NE,001
2,FTC-UMCG-0002,> 20 jaar geleden DVT links Longembolie links...,2020-05-07,0,Male,73,190,86,24,2,...,,2020-05-14,64,172,4.3,2.83,NE,47,2847,002
3,FTC-UMCG-0003,"2019 Nov Grootcellig B-Non-Hodgkin lymfoom,...",2020-05-18,0,Male,59,181,91,28,1,...,Radiotherapy CNS and Korfel 3x response evalua...,2020-05-15,7.4,389,11.9,NE,214,14,1404,003
4,FTC-UMCG-0004,2015 gehoorverlies 2019 aug: DLBCL ...,2020-05-14,1,Female,61,169,73,26,1,...,,2020-04-21,6.5,159,9.2,6.55,296,3.0,NE,004


In [24]:
clinic_data

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned
0,Record ID,Comments,Date 1st tumorboard meeting,Sex,,Age,Height,Weight,BMI (kg/m2),Diagnosis for which there is now a cellular th...,...,Please specify all subsequent anti-cancer ther...,Date lab results,Hemoglobin in mmol/L,Thrombocytes in 10E9/L,Leukocytes in 10E9/L,Neutrophils in 10E9/L (automated differentiation),LDH in U/L,CRP in mg/L,Ferritin in µg/l,ID
1,FTC-UMCG-0001,splenectomy 2012: total hip links 2015: jich...,2020-05-04,0,Male,68,180,72.6,22,1,...,,2020-04-28,7.1,90,6.3,4.74,169,26,NE,001
2,FTC-UMCG-0002,> 20 jaar geleden DVT links Longembolie links...,2020-05-07,0,Male,73,190,86,24,2,...,,2020-05-14,64,172,4.3,2.83,NE,47,2847,002
3,FTC-UMCG-0003,"2019 Nov Grootcellig B-Non-Hodgkin lymfoom,...",2020-05-18,0,Male,59,181,91,28,1,...,Radiotherapy CNS and Korfel 3x response evalua...,2020-05-15,7.4,389,11.9,NE,214,14,1404,003
4,FTC-UMCG-0004,2015 gehoorverlies 2019 aug: DLBCL ...,2020-05-14,1,Female,61,169,73,26,1,...,,2020-04-21,6.5,159,9.2,6.55,296,3.0,NE,004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,FTC-UMCG-0088,Hematologische voorgeschiedenis 2013 bi-cy...,2023-09-28,0,Male,54,178,69.8,22,1,...,,2023-09-29,9.1,93,7.0,NE,369,7,1643,088
65,FTC-UMCG-0089,2013 dec: laaggradig B-NHL stadium IV met s...,2023-10-05,1,Female,70,160,58.7,23,2,...,Epcoritamab monotherapy - 48 mg per injection ...,2023-10-04,8.1,205,5.3,2.97,325,17,204,089
66,FTC-UMCG-0090,Relevante voorgeschiedenis: 2016 Stadium IV D...,2023-10-12,0,Male,70,170,73,25,1,...,2024-02 recidief diffuus grootcellig B-cel lym...,2023-10-12,9.5,327,6.6,5.06,991,78,669,090
67,FTC-UMCG-0096,Voorgeschiedenis: Tonsilectomie 2004 IBS ...,2022-11-22,0,Male,62,180,78,24,1,...,,2022-10-11,6.8,109,20.7,NE,475,15,1932,096


In [25]:
clinic_data['id_cleaned'].values

array([' ID', '001', '002', '003', '004', '005', '006', '007', '008',
       '009', '010', '011', '012', '013', '014', '015', '016', '017',
       '018', '019', '020', '021', '022', '023', '024', '025', '026',
       '027', '028', '029', '030', '031', '046', '047', '048', '049',
       '050', '051', '052', '053', '054', '055', '060', '061', '064',
       '065', '066', '067', '068', '069', '070', '075', '076', '077',
       '078', '079', '080', '081', '082', '083', '084', '085', '086',
       '087', '088', '089', '090', '096', '104'], dtype=object)

In [26]:
delta_radiomics_results['id']

0      5
1      6
2      7
3      8
4      9
5     10
6     11
7     13
8     14
9     15
10    22
11    26
12    28
13    31
14    46
15    47
16    48
17    50
18    52
19    55
20    61
21    68
22    70
23    77
24    95
Name: id, dtype: int64

In [27]:
patient_ids = clinic_data['id_cleaned'].values[1:].astype(int)

In [28]:
# find patients that are in both datasets
# values starts from 1 to skip the comment row
intercept = [id for id in delta_radiomics_results['id'] if id in patient_ids]

In [29]:
clinic_data['id_cleaned'] = ['ID'] + patient_ids.tolist()

In [30]:
clinic_data_cleaned = clinic_data[clinic_data['id_cleaned'].isin(intercept)]

In [31]:
clinic_data_cleaned.reset_index(drop=True, inplace=True)

In [32]:
clinic_data_cleaned.shape

(24, 276)

Eventually, we have 24 patients with complete clinical and delta radiomics data to work with.

**Note:** patient 95 is missing their clinical data. 

In [33]:
# we now should select features we need for modelling the baseline, without the delta radiomics
clinic_data_cleaned

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned
0,FTC-UMCG-0005,2019 mei: hemicastratie links Hematologische...,1900-01-01,0,Male,62,173,58.0,19,1,...,,2020-05-20,5.3,145,2.4,,NE,0.3,894,5
1,FTC-UMCG-0006,2014 Diffuus Grootcellig B-cel lymfoom st I...,2020-07-02,1,Female,58,173,57.0,19,2,...,"Verdere behandeling, inclusief allo-SCT in UMCU",2020-07-13,7.6,6,5.7,NE,275,9,371,6
2,FTC-UMCG-0007,2020 (feb) Stadium IV high grade B-cel lymfoom...,2020-08-10,0,Male,58,182,99.2,30,5,...,,2020-08-18,64.0,295,50.0,NE,885,47,2570,7
3,FTC-UMCG-0008,"2017 okt: Snel progressief DLBCL, stadium I...",2020-09-21,1,Female,72,169,60.0,21,1,...,Epcoritamab monotherapie,2020-09-18,5.7,321,1.9,NE,250,1.0,NE,8
4,FTC-UMCG-0009,2017 okt gastro- en colonoscopie ivm chroni...,2020-09-02,0,Male,48,186,106.0,31,2,...,,2020-09-02,7.2,324,8.7,NE,283,10,54,9
5,FTC-UMCG-0010,2020 feb nefrostomiekatheter rechts ivm hyd...,2020-10-19,0,Male,54,181,69.0,21,1,...,,2020-10-19,5.4,382,26.3,NE,417,206,4786,10
6,FTC-UMCG-0011,2020-03: koorts zonder lokaliserende klachten....,2020-10-15,0,Male,34,185,86.1,25,5,...,,2020-10-19,6.8,442,5.3,3.87,992,17,485,11
7,FTC-UMCG-0013,Hematologische voorgeschiedenis: 2013 (mei) ...,2020-11-23,0,Male,46,187,97.0,28,2,...,,2020-11-25,5.8,253,5.7,4.49,313,32,559,13
8,FTC-UMCG-0014,020 (mei) Diffuus grootcellig B-cel lymfoom st...,2020-12-17,0,Male,70,190,96.0,27,1,...,,2020-12-15,6.8,497,6.2,NE,484,225,1535,14
9,FTC-UMCG-0015,ematologische voorgeschiedenis: 2005 (dec) s...,2021-01-07,1,Female,66,162,56.0,21,2,...,,2021-01-08,6.4,432,7.6,6.10,235,8,569,15


In [34]:
# dropping columns with all NaN values
clinic_data_cleaned = clinic_data_cleaned.dropna(axis=1, how='all')

In [35]:
clinic_data_cleaned.shape

(24, 266)

In [36]:
# we don't need factor columns for modelling as they are encoded already
factors = [factor for factor in clinic_data_cleaned.columns if 'factor' in factor]

In [37]:
comments = [comm for comm in clinic_data_cleaned.columns if 'comment' in comm]

In [38]:
comments

['medhis_diag_comments', 'post_cart_ther_comment_spec']

In [39]:
locations = [loc for loc in clinic_data_cleaned.columns if 'loc' in loc]

In [40]:
locations

['indication_extran_site_loc___1',
 'indication_extran_site_loc___1.factor',
 'indication_extran_site_loc___2',
 'indication_extran_site_loc___2.factor',
 'indication_extran_site_loc___3',
 'indication_extran_site_loc___3.factor',
 'indication_extran_site_loc___21',
 'indication_extran_site_loc___21.factor',
 'indication_extran_site_loc___4',
 'indication_extran_site_loc___4.factor',
 'indication_extran_site_loc___5',
 'indication_extran_site_loc___5.factor',
 'indication_extran_site_loc___6',
 'indication_extran_site_loc___6.factor',
 'indication_extran_site_loc___7',
 'indication_extran_site_loc___7.factor',
 'indication_extran_site_loc___8',
 'indication_extran_site_loc___8.factor',
 'indication_extran_site_loc___9',
 'indication_extran_site_loc___9.factor',
 'indication_extran_site_loc___10',
 'indication_extran_site_loc___10.factor',
 'indication_extran_site_loc___11',
 'indication_extran_site_loc___11.factor',
 'indication_extran_site_loc___12',
 'indication_extran_site_loc___12.

In [41]:
# these are highly correlated features with bmi
correlated = ['scr_height', 'scr_weight']

* scr_age (continuous) correlates to indication_age_60 (binary), we Keep scr_age (continuous). It retains more information and doesn't arbitrarily cut at 60.  
* indication_ldh_uln: we have the exact value for ldh  
* indication_extran_sites, indication_extran_invol, indication_extranodal_nr	These are highly related. we keep indication_extranodal_nr (exact number). It is the most granular quantitative measure.

In [42]:
indicators = ['indication_ldh_uln','indication_age_60','indication_extran_sites', 'indication_extran_invol']

In [43]:
# cause of death columns are not needed
cause_of_death = [cause for cause in clinic_data_cleaned.columns if '_cause' in cause]

In [44]:
cause_of_death

['surv_death_cause',
 'surv_death_cause.factor',
 'surv_death_cause_oth',
 'surv_death_cause_spec',
 'surv_death_contrib_cause___1',
 'surv_death_contrib_cause___1.factor',
 'surv_death_contrib_cause___2',
 'surv_death_contrib_cause___2.factor',
 'surv_death_contrib_cause___3',
 'surv_death_contrib_cause___3.factor',
 'surv_death_contrib_cause___4',
 'surv_death_contrib_cause___4.factor',
 'surv_death_contrib_cause___5',
 'surv_death_contrib_cause___5.factor',
 'surv_death_contrib_cause___6',
 'surv_death_contrib_cause___6.factor',
 'surv_death_contrib_cause___7',
 'surv_death_contrib_cause___7.factor',
 'surv_death_contrib_cause___8',
 'surv_death_contrib_cause___8.factor',
 'surv_death_contrib_cause___9',
 'surv_death_contrib_cause___9.factor',
 'surv_death_contrib_cause___10',
 'surv_death_contrib_cause___10.factor',
 'surv_death_contrib_cause___11',
 'surv_death_contrib_cause___11.factor',
 'surv_death_contrib_cause___12',
 'surv_death_contrib_cause___12.factor',
 'surv_death_contr

**NOTE:** indication_dis_diagnosis must be one-hot encoded. as the disease is a nominal categorical feature.

In [45]:
disease = pd.get_dummies(clinic_data_cleaned['indication_dis_diagnosis.factor']).astype(int)

In [46]:
disease

Unnamed: 0,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
0,1,0,0,0
1,0,0,0,1
2,0,0,1,0
3,1,0,0,0
4,0,0,0,1
5,1,0,0,0
6,0,0,1,0
7,0,0,0,1
8,1,0,0,0
9,0,0,0,1


In [47]:
drop_columns = cause_of_death + factors + ['record_id','scr_date_tb1stmeeting', 'indication_dis_diagnosis'] + comments + locations + correlated + indicators
clinic_data_cleaned.drop(columns=drop_columns,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinic_data_cleaned.drop(columns=drop_columns,inplace=True)


In [48]:
clinic_data_cleaned.shape

(24, 97)

In [49]:
clinic_data_cleaned = pd.concat([clinic_data_cleaned, disease], axis=1)

In [50]:
clinic_data_cleaned

Unnamed: 0,scr_sex,scr_age,scr_bmi,total_num_priortherapylines_fl,total_num_priortherapylines_aggressive,indication_priorsct,indication_whops,indication_bulkydisease,indication_stage,indication_extranodal_nr,...,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
0,0,62,19,,2,4,0,0,4,3.0,...,2.4,,NE,0.3,894,5,1,0,0,0
1,1,58,19,0.0,2,1,0,0,3,,...,5.7,NE,275,9,371,6,0,0,0,1
2,0,58,30,,2,4,0,0,4,,...,50.0,NE,885,47,2570,7,0,0,1,0
3,1,72,21,,2,4,0,0,4,2.0,...,1.9,NE,250,1.0,NE,8,1,0,0,0
4,0,48,31,2.0,2,4,0,0,4,2.0,...,8.7,NE,283,10,54,9,0,0,0,1
5,0,54,21,,2,4,0,1,2,,...,26.3,NE,417,206,4786,10,1,0,0,0
6,0,34,25,,2,4,0,1,4,2.0,...,5.3,3.87,992,17,485,11,0,0,1,0
7,0,46,28,2.0,2,4,0,0,4,2.0,...,5.7,4.49,313,32,559,13,0,0,0,1
8,0,70,27,,2,4,0,1,4,,...,6.2,NE,484,225,1535,14,1,0,0,0
9,1,66,21,4.0,1,4,0,1,1,,...,7.6,6.10,235,8,569,15,0,0,0,1


In [51]:
clinic_data_cleaned.replace({'NE': np.nan}, inplace=True)

  clinic_data_cleaned.replace({'NE': np.nan}, inplace=True)


In [52]:
clinic_data_cleaned.describe()

Unnamed: 0,id_cleaned,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
count,24.0,24.0,24.0,24.0,24.0
mean,32.458333,0.375,0.083333,0.083333,0.458333
std,23.514989,0.494535,0.28233,0.28233,0.508977
min,5.0,0.0,0.0,0.0,0.0
25%,10.75,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0
75%,50.5,1.0,0.0,0.0,1.0
max,77.0,1.0,1.0,1.0,1.0


In [53]:
nans = clinic_data_cleaned.isna().sum().sort_values(ascending=False)

In [54]:
# columns with more than 12 nans, which is half the data for the patients we have
nans[nans > 12]

indication_dis_lymsubtype_cns_onset    23
post_car_ther_other                    23
surv_death_contrib_infect              23
surv_death_contrib_other               23
tr_car_preaph_bridg_type               22
tr_car_bridg_reg_oth                   22
indication_extranodal_nr               19
ae_summ_crs_start_gr2                  16
ae_summ_icans_start_gr2                14
total_num_priortherapylines_fl         14
post_cart_ther_startdate               13
dtype: int64

In [55]:
drop_nans = nans[nans > 12].index

In [56]:
clinic_data_cleaned = clinic_data_cleaned.drop(columns=drop_nans)

In [57]:
clinic_data_cleaned.shape

(24, 90)

In [58]:
clinic_data_cleaned.select_dtypes(include=['object']).columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___1',
       'tr_car_preaph_bridg_reg___2', 'tr_car_preaph_bridg_reg___3',
       'tr_car_preaph_bridg_reg___4', 'tr_car_preaph_bridg_reg___5',
       'tr_car_preaph_bridg_reg___6', 'tr_car_preaph_bridg_reg___7',
       'tr_car_preaph_bridg_reg___8', 'tr_car_preaph_bridg_reg___9',
       'tr_car_preaph_bridg_reg___10', 'tr_car_preaph_bridg_reg___11',
       'tr_car_preaph_bridg_reg___12', 'tr_car_preaph_bridg_reg___na',
       'tr_car_preaph_bridg_reg___ne', 'tr_car_br', 'tr_car_bridg_type',
       'tr_car_bridg_reg___1', 'tr_car_bridg_reg___2', 'tr_car_bridg_reg___3',
  

In [59]:
clinic_data_cleaned.dtypes

scr_sex                                   object
scr_age                                   object
scr_bmi                                   object
total_num_priortherapylines_aggressive    object
indication_priorsct                       object
                                           ...  
id_cleaned                                 int64
DLBCL                                      int64
HGBCL DH/TH                                int64
HGBCL NOS                                  int64
tFL                                        int64
Length: 90, dtype: object

In [60]:
clinic_data_cleaned.columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___1',
       'tr_car_preaph_bridg_reg___2', 'tr_car_preaph_bridg_reg___3',
       'tr_car_preaph_bridg_reg___4', 'tr_car_preaph_bridg_reg___5',
       'tr_car_preaph_bridg_reg___6', 'tr_car_preaph_bridg_reg___7',
       'tr_car_preaph_bridg_reg___8', 'tr_car_preaph_bridg_reg___9',
       'tr_car_preaph_bridg_reg___10', 'tr_car_preaph_bridg_reg___11',
       'tr_car_preaph_bridg_reg___12', 'tr_car_preaph_bridg_reg___na',
       'tr_car_preaph_bridg_reg___ne', 'tr_car_br', 'tr_car_bridg_type',
       'tr_car_bridg_reg___1', 'tr_car_bridg_reg___2', 'tr_car_bridg_reg___3',
  

In [61]:
# Assuming clinic_data_filtered is the DataFrame you want to convert
date_columns = [date for date in clinic_data_cleaned.columns if ('date' in date) or ('start' in date) or ('stop' in date)]
# 1. Use convert_dtypes() for general automatic inference
# This function automatically converts to best possible dtypes (e.g., object to string, int64 to Int64, float64 to Float64)
# It's particularly useful for handling missing values using pandas' nullable dtypes (e.g., pd.NA).
print("Applying general type conversion...")

# 2. Force remaining object columns that look like numbers to numeric
for col in clinic_data_cleaned.columns:
        if col not in date_columns:
            # Attempt to convert to numeric.
            # this is to fix a typo in columns where , is used instead of .
            if clinic_data_cleaned[col].dtype == 'object':
                clinic_data_cleaned[col] = pd.to_numeric(clinic_data_cleaned[col].str.replace(',','.'), errors='raise')
            print(f"  Converted column '{col}' to numeric.")
        else: 
            clinic_data_cleaned[col] = pd.to_datetime(clinic_data_cleaned[col], errors='coerce')
            print(f"  Converted column '{col}' to datetime.")
        
print("\nAutomatic type conversion complete.")

Applying general type conversion...
  Converted column 'scr_sex' to numeric.
  Converted column 'scr_age' to numeric.
  Converted column 'scr_bmi' to numeric.
  Converted column 'total_num_priortherapylines_aggressive' to numeric.
  Converted column 'indication_priorsct' to numeric.
  Converted column 'indication_whops' to numeric.
  Converted column 'indication_bulkydisease' to numeric.
  Converted column 'indication_stage' to numeric.
  Converted column 'indication_pri_refr' to numeric.
  Converted column 'indication_sec_refr' to numeric.
  Converted column 'indication_res_last_ther' to numeric.
  Converted column 'indication_res_last_ther_spec' to numeric.
  Converted column 'indication_dis_lymsubtype_cns' to numeric.
  Converted column 'indication_ind_date' to datetime.
  Converted column 'tr_car_preaph_br' to numeric.
  Converted column 'tr_car_preaph_bridg_reg___1' to numeric.
  Converted column 'tr_car_preaph_bridg_reg___2' to numeric.
  Converted column 'tr_car_preaph_bridg_reg

In [62]:
clinic_data_cleaned.dtypes

scr_sex                                   int64
scr_age                                   int64
scr_bmi                                   int64
total_num_priortherapylines_aggressive    int64
indication_priorsct                       int64
                                          ...  
id_cleaned                                int64
DLBCL                                     int64
HGBCL DH/TH                               int64
HGBCL NOS                                 int64
tFL                                       int64
Length: 90, dtype: object

In [63]:
variances = clinic_data_cleaned.select_dtypes(include=np.number).var().sort_values()

In [64]:
# zero variance columns are not useful for modelling so I am dropping them
zero_var = variances[variances == 0].index

In [65]:
zero_var

Index(['tr_car_preaph_bridg_reg___2', 'tr_car_preaph_bridg_reg___1',
       'tr_car_preaph_bridg_reg___9', 'tr_car_bridg_reg___1',
       'tr_car_preaph_bridg_reg___ne', 'tr_car_preaph_bridg_reg___na',
       'tr_car_preaph_bridg_reg___12', 'tr_car_bridg_reg___2',
       'tr_car_preaph_bridg_reg___11', 'tr_car_preaph_bridg_reg___10',
       'tr_car_preaph_bridg_reg___4', 'tr_car_preaph_bridg_reg___7',
       'tr_car_preaph_bridg_reg___6', 'tr_car_preaph_bridg_reg___5',
       'tr_car_preaph_bridg_reg___3', 'tr_car_bridg_reg___4',
       'tr_car_bridg_reg___11', 'ae_summ_crs_res_v2',
       'post_cart_ther_spec_2___ne', 'post_cart_ther_spec_2___na',
       'post_cart_ther_spec_2___4', 'ae_summ_icans_res_v2', 'tr_car_ld',
       'tr_car_ld_type', 'tr_car_bridg_reg___10', 'tr_car_bridg_reg___na',
       'tr_car_bridg_reg___ne', 'tr_car_bridg_reg___9', 'tr_car_bridg_reg___5',
       'tr_car_bridg_reg___6'],
      dtype='object')

In [66]:
clinic_data_cleaned = clinic_data_cleaned.drop(columns=zero_var)

In [67]:
clinic_data_cleaned.shape

(24, 60)

In [68]:
clinic_data_cleaned.head()

Unnamed: 0,scr_sex,scr_age,scr_bmi,total_num_priortherapylines_aggressive,indication_priorsct,indication_whops,indication_bulkydisease,indication_stage,indication_pri_refr,indication_sec_refr,...,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
0,0,62,19,2,4,0,0,4,1,1,...,2.4,,,0.3,894.0,5,1,0,0,0
1,1,58,19,2,1,0,0,3,0,0,...,5.7,,275.0,9.0,371.0,6,0,0,0,1
2,0,58,30,2,4,0,0,4,1,1,...,5.0,,885.0,47.0,2570.0,7,0,0,1,0
3,1,72,21,2,4,0,0,4,0,1,...,1.9,,250.0,1.0,,8,1,0,0,0
4,0,48,31,2,4,0,0,4,1,1,...,8.7,,283.0,10.0,54.0,9,0,0,0,1


CRS and ICANS are two of the most significant and potentially life-threatening side effects associated with certain powerful immunotherapies, most notably CAR T-cell therapy (Chimeric Antigen Receptor T-cell therapy).


💥 1. Cytokine Release Syndrome (CRS)
CRS is the more common of the two toxicities and is essentially a massive, systemic inflammatory response.

What it is: When the modified CAR T-cells successfully attack cancer cells, they rapidly multiply and release large amounts of signaling molecules called cytokines into the bloodstream (hence the name "Cytokine Release Syndrome"). This rapid, massive release causes a widespread inflammatory state.

Symptoms: CRS symptoms resemble a severe flu:

High fever (the hallmark symptom)

Chills, muscle aches, and fatigue

Hypotension (low blood pressure)

Hypoxia (low oxygen/trouble breathing)

Fast heart rate (tachycardia)

In severe cases, it can lead to multi-organ failure.

The severity of CRS (often graded 1 to 4/5 by consensus guidelines like the ASTCT criteria) is a crucial prognostic factor.

🧠 2. Immune Effector Cell-Associated Neurotoxicity Syndrome (ICANS)
ICANS is a syndrome involving neurological damage or dysfunction caused by the same immune activation that triggers CRS.

What it is: It is a neurological toxicity that typically occurs after or concurrently with CRS. The exact mechanism is complex but involves the massive cytokine release and the T-cells themselves affecting the central nervous system.

Symptoms: ICANS symptoms can range from mild to life-threatening:

Confusion or disorientation

Aphasia (difficulty speaking or understanding language)

Impaired attention

Headache, tremors, or loss of coordination

In severe cases, it can cause seizures or cerebral edema (brain swelling).

ICANS severity is also graded using specific neurological assessment scores and is a very strong independent predictor of outcomes and morbidity after CAR T-cell therapy.

In [69]:
clinic_data_cleaned.shape

(24, 60)

In [70]:
clinic_data_cleaned.columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___8', 'tr_car_br',
       'tr_car_bridg_type', 'tr_car_bridg_reg___3', 'tr_car_bridg_reg___7',
       'tr_car_bridg_reg___8', 'tr_car_bridg_reg___12', 'tr_car_inf_adm_date',
       'tr_car_ld_start', 'tr_car_inf_date', 'tr_car_inf_discharge_date',
       'ae_summ_start_date_v2', 'ae_summ_crs_v2', 'ae_summ_highestgrade_v2',
       'ae_summ_crs_start_v2', 'ae_summ_crs_stop_v2', 'ae_summ_icans_v2',
       'ae_summ_icans_highestgrade_v2', 'ae_summ_icans_start_v2',
       'ae_summ_icans_stop_v2', 'surv_bestresponse_car',
       'surv_time_bestresponse_car', 'surv_prog_after_ca

In [71]:
# Impute missing values with the median for numeric columns
for col in clinic_data_cleaned.select_dtypes(include=np.number).columns:
    median_value = clinic_data_cleaned[col].median()
    clinic_data_cleaned[col].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clinic_data_cleaned[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clinic_data_cleaned[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [72]:
clinic_data_cleaned.isna().sum().sort_values(ascending=False)

ae_summ_icans_start_v2                    12
ae_summ_icans_stop_v2                     12
surv_prog_date                            10
surv_death_date                           10
ae_summ_crs_stop_v2                        3
ae_summ_crs_start_v2                       3
indication_priorsct                        0
indication_whops                           0
scr_sex                                    0
scr_age                                    0
scr_bmi                                    0
total_num_priortherapylines_aggressive     0
indication_dis_lymsubtype_cns              0
indication_ind_date                        0
tr_car_preaph_br                           0
tr_car_preaph_bridg_reg___8                0
tr_car_br                                  0
tr_car_bridg_type                          0
tr_car_bridg_reg___3                       0
tr_car_bridg_reg___7                       0
tr_car_bridg_reg___8                       0
tr_car_bridg_reg___12                      0
indication

In [73]:
# there are date related column that still have nans, but we will not use them for modelling as we can't impute them easily
# also cli_st_lab_date is not needed
date_columns = ['ae_summ_icans_stop_v2','ae_summ_icans_start_v2', 
                'surv_prog_date', 'surv_death_date', 
                'ae_summ_crs_start_v2', 'ae_summ_crs_stop_v2',
                'cli_st_lab_date']
clinic_data_cleaned.drop(columns=date_columns, inplace=True)

In [74]:
clinic_data_cleaned.shape

(24, 53)

In [75]:
clinic_data_cleaned.isna().sum().sum() # confirming no nans remain

np.int64(0)

In [76]:
clinic_data_cleaned.columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___8', 'tr_car_br',
       'tr_car_bridg_type', 'tr_car_bridg_reg___3', 'tr_car_bridg_reg___7',
       'tr_car_bridg_reg___8', 'tr_car_bridg_reg___12', 'tr_car_inf_adm_date',
       'tr_car_ld_start', 'tr_car_inf_date', 'tr_car_inf_discharge_date',
       'ae_summ_start_date_v2', 'ae_summ_crs_v2', 'ae_summ_highestgrade_v2',
       'ae_summ_icans_v2', 'ae_summ_icans_highestgrade_v2',
       'surv_bestresponse_car', 'surv_time_bestresponse_car',
       'surv_prog_after_car', 'surv_status', 'surv_date', 'post_cart_ther',
       'post_cart_ther_spec_2___1', 'post_cart_ther_spe

----

## Calculate Time-to-Event (T) to prepare for Cox Regression

We need to select one pair of dates to calculate the duration of follow-up (Time, or $T$).  
Choice: The standard time point for post-therapy outcomes is from the date of infusion to the date of follow-up/death.  
Start Date: tr_car_inf_date (Date of CAR-T infusion)  
End Date: surv_date (Date of last follow-up or death)  

In [77]:
# Calculate the Time-to-Event (T) in days
# This is the time from infusion until the event (or censoring)
clinic_data_cleaned['T'] = (
    clinic_data_cleaned['surv_date'] - clinic_data_cleaned['tr_car_inf_date']
).dt.days

In [78]:
clinic_data_cleaned['T']

0       82
1      486
2      105
3     1509
4      183
5     1513
6       14
7     1405
8       15
9      612
10      91
11     411
12    1142
13     327
14     368
15    1041
16     930
17     253
18    1048
19    1004
20     323
21     490
22     808
23     715
Name: T, dtype: int64

## Define the Event Indicator (E)

We need a binary variable (Event, or $E$) that indicates if the event of interest occurred.Choice: The most common target is Overall Survival (OS), where the event is death.Event Variable: surv_status (Assuming this is a 0/1 indicator where 1 = death/event)

In [79]:
# in the dataset, it's the opposite of what we want: 1 means event occurred (death), 0 means censored (alive)
# so we need to invert it
clinic_data_cleaned['surv_status'] = 1 - clinic_data_cleaned['surv_status']

In [80]:
# rename surv_status to E for event
clinic_data_cleaned.rename(columns={'surv_status': 'E'}, inplace=True)

In [81]:
clinic_data_cleaned['E']

0     1
1     1
2     1
3     0
4     0
5     0
6     1
7     0
8     1
9     0
10    1
11    1
12    1
13    1
14    1
15    1
16    0
17    1
18    0
19    0
20    1
21    1
22    0
23    0
Name: E, dtype: int64

In [82]:
# we should separate the possible target variables for modelling now
target_variables = ['surv_time_bestresponse_car', 'surv_prog_after_car']
date_related = ['tr_car_inf_adm_date','tr_car_ld_start', 'tr_car_inf_date', 'tr_car_inf_discharge_date',
       'ae_summ_start_date_v2','surv_date','indication_ind_date']

In [83]:
modelling_data = pd.concat([clinic_data_cleaned, delta_radiomics_results, a_radiomics, b_radiomics], axis=1)

In [84]:
modelling_data.head()

Unnamed: 0,scr_sex,scr_age,scr_bmi,total_num_priortherapylines_aggressive,indication_priorsct,indication_whops,indication_bulkydisease,indication_stage,indication_pri_refr,indication_sec_refr,...,SUV_StandardDeviation_b,SUV_TotalEnergy_b,SUV_Uniformity_b,SUV_Variance_b,TLG_b,Number of lesions_b,Dmax Patient (mm)_b,Spread Patient (mm)_b,Dmax Bulk (mm)_b,Spread Bulk (mm)_b
0,0.0,62.0,19.0,2.0,4.0,0.0,0.0,4.0,1.0,1.0,...,4.5572,38010280.0,0.995566,20.768074,3917.771002,35.0,1196.879586,22496.24366,795.620306,11241.343898
1,1.0,58.0,19.0,2.0,1.0,0.0,0.0,3.0,0.0,0.0,...,1.385548,1335473.0,1.0,1.919742,423.940616,20.0,1374.014576,16735.852,935.974729,7738.145091
2,0.0,58.0,30.0,2.0,4.0,0.0,0.0,4.0,1.0,1.0,...,3.255284,58786350.0,0.996676,10.596874,8874.764809,11.0,1091.509116,6925.147632,626.585808,3097.29928
3,1.0,72.0,21.0,2.0,4.0,0.0,0.0,4.0,0.0,1.0,...,1.118203,4936741.0,1.0,1.250378,1502.998388,6.0,1315.665591,4304.968469,937.066016,2123.989264
4,0.0,48.0,31.0,2.0,4.0,0.0,0.0,4.0,1.0,1.0,...,7.615255,39415060.0,0.904695,57.992107,2515.891962,12.0,1317.910694,7968.471879,780.420733,3480.583802


In [85]:
modelling_data.shape

(25, 186)

In [86]:
# we need to drop the last row, as the patient's clinical data is not available
modelling_data = modelling_data.iloc[:-1,:]

In [87]:
# to find the baseline performance, we don't need id_cleaned as we're not going to
# use this column for adding the delta radiomics yet
X = modelling_data.drop(columns=target_variables + date_related + ['id_a','id_b','id_cleaned','id'])                    

In [88]:
X

Unnamed: 0,scr_sex,scr_age,scr_bmi,total_num_priortherapylines_aggressive,indication_priorsct,indication_whops,indication_bulkydisease,indication_stage,indication_pri_refr,indication_sec_refr,...,SUV_StandardDeviation_b,SUV_TotalEnergy_b,SUV_Uniformity_b,SUV_Variance_b,TLG_b,Number of lesions_b,Dmax Patient (mm)_b,Spread Patient (mm)_b,Dmax Bulk (mm)_b,Spread Bulk (mm)_b
0,0.0,62.0,19.0,2.0,4.0,0.0,0.0,4.0,1.0,1.0,...,4.5572,38010280.0,0.995566,20.768074,3917.771002,35.0,1196.879586,22496.24366,795.620306,11241.343898
1,1.0,58.0,19.0,2.0,1.0,0.0,0.0,3.0,0.0,0.0,...,1.385548,1335473.0,1.0,1.919742,423.940616,20.0,1374.014576,16735.852,935.974729,7738.145091
2,0.0,58.0,30.0,2.0,4.0,0.0,0.0,4.0,1.0,1.0,...,3.255284,58786350.0,0.996676,10.596874,8874.764809,11.0,1091.509116,6925.147632,626.585808,3097.29928
3,1.0,72.0,21.0,2.0,4.0,0.0,0.0,4.0,0.0,1.0,...,1.118203,4936741.0,1.0,1.250378,1502.998388,6.0,1315.665591,4304.968469,937.066016,2123.989264
4,0.0,48.0,31.0,2.0,4.0,0.0,0.0,4.0,1.0,1.0,...,7.615255,39415060.0,0.904695,57.992107,2515.891962,12.0,1317.910694,7968.471879,780.420733,3480.583802
5,0.0,54.0,21.0,2.0,4.0,0.0,1.0,2.0,1.0,1.0,...,1.987194,12405330.0,1.0,3.948941,2293.63417,4.0,562.43254,1173.090291,385.192324,818.62535
6,0.0,34.0,25.0,2.0,4.0,0.0,1.0,4.0,1.0,1.0,...,5.981518,86257520.0,0.964509,35.778557,7210.317554,25.0,726.098719,11287.112665,431.849046,6034.047738
7,0.0,46.0,28.0,2.0,4.0,0.0,0.0,4.0,0.0,1.0,...,3.560358,25470760.0,1.0,12.676147,3110.27828,11.0,602.918708,3715.450001,428.222118,2614.205791
8,0.0,70.0,27.0,2.0,4.0,0.0,1.0,4.0,1.0,1.0,...,5.94256,79292570.0,0.969001,35.314018,6596.153587,12.0,998.195332,6978.814024,539.021819,4064.679621
9,1.0,66.0,21.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,...,0.762871,4260973.0,1.0,0.581973,1055.946947,1.0,0.0,0.0,0.0,0.0


---

# ML modelling

In [89]:
no_delta_radiomics = pd.concat([clinic_data_cleaned, a_radiomics, b_radiomics], axis=1)
no_delta_radiomics = no_delta_radiomics.iloc[:-1,:]

In [90]:
X_with_delta = modelling_data.drop(columns=target_variables + date_related + ['T','E','id_a','id_b','id_cleaned','id']) 

X_without_delta = no_delta_radiomics.drop(columns=target_variables + date_related + ['T', 'E', 'id_a','id_b','id_cleaned'])

In [91]:
y = modelling_data['E']

In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.neighbors import KNeighborsClassifier


In [93]:
RANDOM_STATE = 42

K_BEST_OPTIONS = [5, 10, 20] # a range of k values to try in the grid search

VAR_THRSH = [0.1, 0.5, 1.0] # variance threshold options to try in grid search

In [94]:
scale_radio = a_radiomics.columns[1:].tolist() + b_radiomics.columns[1:].tolist() 

In [95]:
COLUMNS_TO_SCALE_WITH_DELTA = ['scr_age', 'scr_bmi', 'cli_st_trombocytes', 'cli_st_neutrophils','cli_st_ldh', 'cli_st_crp', 'cli_st_ferritin' ] + scale_radio + delta_radiomics_results.columns[1:].tolist()

COLUMNS_TO_SCALE_WITHOUT_DELTA = ['scr_age', 'scr_bmi', 'cli_st_trombocytes', 'cli_st_neutrophils','cli_st_ldh', 'cli_st_crp', 'cli_st_ferritin' ] + scale_radio

COLUMNS_TO_SCALE_NO_RADIO = ['scr_age', 'scr_bmi', 'cli_st_trombocytes', 'cli_st_neutrophils','cli_st_ldh', 'cli_st_crp', 'cli_st_ferritin' ]

COLUMNS_TO_SCALE_ONLY_POINT_A = ['scr_age', 'scr_bmi', 'cli_st_trombocytes', 'cli_st_neutrophils','cli_st_ldh', 'cli_st_crp', 'cli_st_ferritin' ] + a_radiomics.columns[1:].tolist()
COLUMNS_TO_SCALE_ONLY_POINT_B = ['scr_age', 'scr_bmi', 'cli_st_trombocytes', 'cli_st_neutrophils','cli_st_ldh', 'cli_st_crp', 'cli_st_ferritin' ] + b_radiomics.columns[1:].tolist()

In [96]:
only_clinic = clinic_data_cleaned.drop(columns=target_variables + date_related + ['id_cleaned','E','T'])

In [97]:
X_with_a_radiomics = pd.concat([only_clinic, a_radiomics], axis=1).iloc[:-1,:]

In [98]:
X_with_b_radiomics = pd.concat([only_clinic, b_radiomics], axis=1).iloc[:-1,:]

In [99]:
for name, X, COLUMNS_TO_SCALE in [("with delta radiomics features", X_with_delta, COLUMNS_TO_SCALE_WITH_DELTA), 
                                  ("without delta radiomics features", X_without_delta, COLUMNS_TO_SCALE_WITHOUT_DELTA), 
                                  ("only clinical features", only_clinic, COLUMNS_TO_SCALE_NO_RADIO),
                                  ("only clinical + point A radiomics", X_with_a_radiomics, COLUMNS_TO_SCALE_ONLY_POINT_A),
                                  ("only clinical + point B radiomics", X_with_b_radiomics, COLUMNS_TO_SCALE_ONLY_POINT_B)]:
    print("\n" + "="*20)
    print(f"Starting new modelling run: {name}")
    print("="*20 + "\n")

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    # Define the ColumnTransformer for selective scaling
    preprocessor = ColumnTransformer(
        transformers=[
            # Apply StandardScaler only to the specified list of columns
            ('scaling_pipeline', StandardScaler(), COLUMNS_TO_SCALE)
        ],
        # 'remainder='passthrough' is crucial: it keeps all other columns untouched
        remainder='passthrough' 
    )
    # Step 1: Feature Scaling (Crucial for SVMs)
    scaler = StandardScaler()
    # Step 2: Variance Threshold (New Step)
    # Removes features whose variance is below the threshold.
    # This step helps pre-filter non-informative features before SelectKBest.
    variance_filter = VarianceThreshold()
    # Step 3: Feature Selection
    # Use SelectKBest with f_classif (ANOVA F-value)
    feature_selector = SelectKBest(score_func=f_classif)

    # Step 4: Classifier
    # Using a 'linear' kernel is often more stable than RBF when N is small and P is large.
    # Note: The C parameter will be tuned using GridSearchCV.
    classifier = KNeighborsClassifier()
    #SVC(random_state=RANDOM_STATE)

    # Build the pipeline with the new scaling and threshold steps
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),                 # Standardize features first
        ('variance_threshold', variance_filter),
        ('select_kbest', feature_selector), 
        ('classifier', classifier)      
    ])

    print("Pipeline updated, now includes StandardScaler and VarianceThreshold for initial feature filtering.")
    print("-" * 50)


    # --- 3. Hyperparameter Tuning with GridSearchCV ---

    # Define the parameter grid to search over.


    param_grid = {

        'variance_threshold__threshold': VAR_THRSH,
        # Tuning the 'k' parameter of SelectKBest (how many features to select)
        'select_kbest__k': K_BEST_OPTIONS,
        # Tuning the 'C' regularization parameter of the SVC
        'classifier__n_neighbors': [3, 4, 5] 
        }
    
    # Use GridSearchCV with the pipeline and the parameter grid.
    # The inner Cross-Validation (cv=5) ensures stable feature selection.
    # The pipeline ensures feature selection is ONLY fitted on the training folds.
    grid_search = GridSearchCV(
        pipeline, 
        param_grid, 
        cv=5,                 # Use 5-fold cross-validation
        scoring='accuracy',   # Metric to optimize
        n_jobs=-1             # Use all available cores
    )

    print("Starting Grid Search training...")
    grid_search.fit(x_train, y_train)


    # --- 4. Evaluate the Best Model ---

    print("\nGrid Search Complete.")
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best cross-validation score (Training Set): {grid_search.best_score_:.4f}")

    # Predict on the held-out test data using the best estimator found by GridSearchCV
    y_pred = grid_search.predict(x_test)

    # Evaluate performance on the test set
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy (unseen data): {accuracy:.4f}")


    # --- 5. Inspecting the Feature Selection Results of the Best Model ---

    best_selector = grid_search.best_estimator_['select_kbest']
    scores = best_selector.scores_
    p_values = best_selector.pvalues_
    k_best = grid_search.best_params_['select_kbest__k']
    all_features = x_train.columns.tolist()

    # 5.1 Determine the order of features after the ColumnTransformer
    # ColumnTransformer puts the transformed columns first, then the remainder.
    passthrough_features = [col for col in all_features if col not in COLUMNS_TO_SCALE]
    full_feature_names_after_preprocessor = COLUMNS_TO_SCALE + passthrough_features

    # 5.2 Apply the variance mask to the ordered list of feature names
    best_variance_filter = grid_search.best_estimator_['variance_threshold']
    variance_mask = best_variance_filter.get_support()
    features_after_variance_filter = np.array(full_feature_names_after_preprocessor)[variance_mask].tolist()

    # 5.3 Create a DataFrame to sort and display all feature scores from SelectKBest
    feature_ranking = pd.DataFrame({
        'Feature': features_after_variance_filter,
        'F_Score': scores,
        'P_Value': p_values
    })

    # Sort by F_Score (highest first) and print the top K
    feature_ranking = feature_ranking.sort_values(by='F_Score', ascending=False)
    top_k_features = feature_ranking.head(k_best)


    print("\n--- Feature Ranking by SelectKBest (Best Model) ---")
    print(f"Optimal Variance Threshold used: {grid_search.best_params_['variance_threshold__threshold']:.2f}")
    print(f"Number of features remaining after Variance Threshold: {len(features_after_variance_filter)}")
    print(f"Optimal number of features (k) chosen by SelectKBest: {k_best}")

    # Display the top features, scores, and p-values
    print(top_k_features.to_string(float_format="%.4f"))




Starting new modelling run: with delta radiomics features

Pipeline updated, now includes StandardScaler and VarianceThreshold for initial feature filtering.
--------------------------------------------------
Starting Grid Search training...

Grid Search Complete.
Best parameters found: {'classifier__n_neighbors': 3, 'select_kbest__k': 5, 'variance_threshold__threshold': 0.1}
Best cross-validation score (Training Set): 0.5833
Test Accuracy (unseen data): 0.6000

--- Feature Ranking by SelectKBest (Best Model) ---
Optimal Variance Threshold used: 0.10
Number of features remaining after Variance Threshold: 160
Optimal number of features (k) chosen by SelectKBest: 5
                      Feature  F_Score  P_Value
157         cli_st_leukocytes   5.3871   0.0330
98                   Flatness   5.2627   0.0348
146                 tr_car_br   4.4042   0.0511
60   Maximum2DDiameterSlice_b   4.3661   0.0520
57          MajorAxisLength_b   4.1785   0.0567

Starting new modelling run: without de