# Delta Radiomics

In [1]:
import pandas as pd
import numpy as np
import os
import yaml

In [2]:
def calculate_delta_radiomics(data_folder_path):
    """
    Reads radiomics data from subfolders (Time A and Time B), filters for 'suv2.5' 
    segmentation, calculates the delta (B - A) for numeric features, and stores
    the results in a dictionary per patient.

    Args:
        data_folder_path (str): The path to the main folder containing patient subfolders.

    Returns:
        (pd.DataFrame, pd.DataFrame, pd.DataFrame):
            delta_df: Delta radiomics (B - A), patients as index, features as columns.
            A_df: Radiomics at time A, same shape.
            B_df: Radiomics at time B, same shape.
    """
    all_delta_radiomics = {}
    A_radiomics, B_radiomics = {}, {}

    # 1. Iterate through all items in the main data folder
    for patient_folder_name in os.listdir(data_folder_path):
        patient_path = os.path.join(data_folder_path, patient_folder_name)
        
        # Ensure it is actually a directory (a patient folder)
        if os.path.isdir(patient_path):
            print(f"--- Processing {patient_folder_name} ---")
            
            # Initialize paths for Time A and Time B files
            file_A_path = None
            file_B_path = None
            
            # 2. Find the radiomics files for Time A and Time B in the patient folder
            for filename in os.listdir(patient_path):
                path_excel = os.path.join(patient_path, filename)

                # Assuming filenames contain '_A' or '_B' (case-insensitive) + .xlsx
                upper_name = path_excel.upper()
                if '_A' in upper_name and path_excel.endswith('.xlsx'):
                    file_A_path = path_excel
                elif '_B' in upper_name and path_excel.endswith('.xlsx'):
                    file_B_path = path_excel

            if file_A_path and file_B_path:
                try:
                    # 3. Read and preprocess the data
                    df_A = pd.read_excel(file_A_path)
                    df_B = pd.read_excel(file_B_path)
                    
                    # 4. Filter for the 'suv2.5' segmentation row, take columns from 23 onwards
                    row_A = df_A[df_A['Segmentation'].str.contains('suv2.5')].iloc[0, 23:]
                    row_B = df_B[df_B['Segmentation'].str.contains('suv2.5')].iloc[0, 23:]

                    # 5. Convert to numeric, coercing errors to NaN
                    numeric_A = pd.to_numeric(row_A, errors='coerce')
                    numeric_B = pd.to_numeric(row_B, errors='coerce')

                    # 6. Calculate Delta Radiomics (Time B - Time A)
                    delta_radiomics = numeric_B - numeric_A
                    
                    # Store as dicts, dropping NaNs
                    all_delta_radiomics[patient_folder_name] = delta_radiomics.dropna().to_dict()
                    A_radiomics[patient_folder_name] = numeric_A.dropna().to_dict()
                    B_radiomics[patient_folder_name] = numeric_B.dropna().to_dict()

                    print(f"Successfully calculated radiomics and delta radiomics for {patient_folder_name}.")

                except Exception as e:
                    print(f"Error processing files for {patient_folder_name}: {e}")
            else:
                print(f"Could not find both A and B files in {patient_folder_name}.")

    # Convert dicts to DataFrames (patients = rows, features = columns)
    A_df = pd.DataFrame.from_dict(A_radiomics, orient='index')
    B_df = pd.DataFrame.from_dict(B_radiomics, orient='index')
    delta_df = pd.DataFrame.from_dict(all_delta_radiomics, orient='index')

    return delta_df, A_df, B_df

In [3]:
with open("config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

# Extract the actual path STRING from the config
data_folder_path = cfg["paths"]["data_folder"]

# Run the function with a string path, NOT the whole dict
delta_radiomics_results, a_radiomics, b_radiomics = calculate_delta_radiomics(data_folder_path)

# ---- PRINT SUMMARY ----
print("\n--- Final Results Summary ---")
for patient, row in delta_radiomics_results.iterrows():
    # row is a Series of feature values for that patient
    non_na = row.dropna()
    print(f"\n{patient} Delta Radiomics ({len(non_na)} features):")
    print(non_na.head().to_dict())  # first 5 features

--- Processing 024 ---
Successfully calculated radiomics and delta radiomics for 024.
--- Processing 023 ---
Successfully calculated radiomics and delta radiomics for 023.
--- Processing 015 ---
Successfully calculated radiomics and delta radiomics for 015.
--- Processing 046 ---
Successfully calculated radiomics and delta radiomics for 046.
--- Processing 048 ---
Successfully calculated radiomics and delta radiomics for 048.
--- Processing 077 ---
Successfully calculated radiomics and delta radiomics for 077.
--- Processing 070 ---
Successfully calculated radiomics and delta radiomics for 070.
--- Processing 013 ---
Successfully calculated radiomics and delta radiomics for 013.
--- Processing 014 ---
Successfully calculated radiomics and delta radiomics for 014.
--- Processing 022 ---
Successfully calculated radiomics and delta radiomics for 022.
--- Processing 047 ---
Successfully calculated radiomics and delta radiomics for 047.
--- Processing 007 ---
Successfully calculated radiomi

In [4]:
delta_radiomics_results

Unnamed: 0,MeshVolume (cc),Volume (cc),Compactness1,Compactness2,Elongation,Flatness,LeastAxisLength,MajorAxisLength,Maximum2DDiameterColumn,Maximum2DDiameterRow,...,glrlm_LongRunLowGrayLevelEmphasis,glrlm_LowGrayLevelRunEmphasis,glrlm_RunEntropy,glrlm_RunLengthNonUniformity,glrlm_RunLengthNonUniformityNormalized,glrlm_RunPercentage,glrlm_RunVariance,glrlm_ShortRunEmphasis,glrlm_ShortRunHighGrayLevelEmphasis,glrlm_ShortRunLowGrayLevelEmphasis
24,-1350.192459,-1349.633052,-0.000854,-0.004856,-0.071998,-0.065654,-27.970122,3.548268,-51.767337,-44.872591,...,,,,,,,,,,
23,-219.73116,-219.58596,-0.001445,-0.016234,-0.053647,-0.079159,-8.566525,17.394789,-17.549948,30.855676,...,,,,,,,,,,
15,49.867963,49.214353,0.016168,0.256642,0.266914,0.405557,-0.581837,-299.663486,-166.887134,-267.583392,...,,,,,,,,,,
46,81.359002,81.531557,0.00445,0.05048,0.116233,0.059981,-53.057164,-718.377143,-106.43608,-167.299933,...,,,,,,,,,,
48,132.95964,134.90532,-0.003025,-0.029388,-0.000437,0.012482,16.392367,101.549356,0.102906,24.886956,...,,,,,,,,,,
77,62.073205,61.188505,0.012091,0.27319,0.263203,0.349453,19.774698,-36.960332,-28.05052,-31.14318,...,91.455757,0.0,1.217388,26.828424,-0.119355,-0.197984,13.049563,-0.193916,-0.193916,-0.193916
70,-1857.982951,-1865.534346,-0.00259,-0.022411,0.178294,0.071478,18.907248,-127.680835,-720.751501,-693.996017,...,-137.743053,-0.012458,-0.989757,-8.304674,0.060624,0.093988,-50.938508,0.045758,0.081561,0.036807
13,282.473562,285.526503,-0.015736,-0.255738,0.442808,-0.060073,26.527685,124.898852,64.273602,104.376945,...,-7.80773,0.042759,-0.268261,153.580232,0.007379,0.014061,-1.636574,-0.009532,-0.101302,0.013411
14,-618.581947,-619.5321,-0.001067,-0.007992,0.153009,-0.070289,-34.663123,144.560198,68.070835,63.707567,...,,,,,,,,,,
22,-1760.492863,-1745.137872,-0.000683,-0.003,0.113102,0.155583,-7.834522,-259.682419,-251.945347,-282.016102,...,,,,,,,,,,


In [5]:
# Clean and prepare dataframes
# by dropping columns with any NaN values and resetting index
# to keep only the complete cases (some patients have 99 columns with NaNs, but 43 are always present)
# we'll work with those 43.
for df in [delta_radiomics_results, a_radiomics, b_radiomics]:
    df.dropna(axis=1, how='any', inplace=True)
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'id'}, inplace=True)
    df['id'] = df['id'].astype(int)

In [6]:
# to differentiate the columns of A and B datasets
a_radiomics = a_radiomics.add_suffix('_a')

In [7]:
a_radiomics.head()

Unnamed: 0,id_a,MeshVolume (cc)_a,Volume (cc)_a,Compactness1_a,Compactness2_a,Elongation_a,Flatness_a,LeastAxisLength_a,MajorAxisLength_a,Maximum2DDiameterColumn_a,...,SUV_StandardDeviation_a,SUV_TotalEnergy_a,SUV_Uniformity_a,SUV_Variance_a,TLG_a,Number of lesions_a,Dmax Patient (mm)_a,Spread Patient (mm)_a,Dmax Bulk (mm)_a,Spread Bulk (mm)_a
0,24,3236.101787,3249.393552,0.008427,0.025231,0.679259,0.379251,167.996974,442.970946,558.036287,...,2.364306,60455620.0,0.996192,5.589943,11722.728508,13.0,740.204182,5251.85917,615.445828,3999.313134
1,23,1236.71559,1240.8066,0.016529,0.097071,0.679058,0.575951,125.204304,217.387132,320.449684,...,6.980068,212479400.0,0.966459,48.721356,13734.421779,3.0,304.656578,593.294918,304.656578,346.037247
2,15,221.214992,222.556487,0.014253,0.072181,0.473212,0.140572,56.523435,402.096359,273.123144,...,0.670973,2523821.0,1.0,0.450205,734.434052,4.0,530.927813,1122.497606,530.927813,1122.497606
3,46,16.306867,17.72892,0.013737,0.067051,0.120633,0.108154,122.121675,1129.145457,582.706796,...,2.084291,365293.3,1.0,4.344271,71.489771,11.0,1091.820444,7430.718599,1091.820444,7430.718599
4,48,110.32296,112.73328,0.015185,0.08193,0.187392,0.09539,41.559987,435.685464,552.831991,...,2.631478,3952316.0,1.0,6.924674,597.957639,13.0,1063.58772,9799.746563,1063.58772,4069.22703


In [8]:
b_radiomics = b_radiomics.add_suffix('_b')

In [9]:
b_radiomics.head()

Unnamed: 0,id_b,MeshVolume (cc)_b,Volume (cc)_b,Compactness1_b,Compactness2_b,Elongation_b,Flatness_b,LeastAxisLength_b,MajorAxisLength_b,Maximum2DDiameterColumn_b,...,SUV_StandardDeviation_b,SUV_TotalEnergy_b,SUV_Uniformity_b,SUV_Variance_b,TLG_b,Number of lesions_b,Dmax Patient (mm)_b,Spread Patient (mm)_b,Dmax Bulk (mm)_b,Spread Bulk (mm)_b
0,24,1885.909327,1899.7605,0.007573,0.020375,0.607262,0.313596,140.026852,446.519213,506.26895,...,3.385329,68879090.0,0.998717,11.460455,9460.019445,3.0,365.156905,713.74416,348.587255,430.360471
1,23,1016.98443,1021.22064,0.015083,0.080836,0.625411,0.496792,116.637779,234.781921,302.899736,...,3.141397,49782750.0,1.0,9.868377,6367.69374,5.0,344.845302,1017.806614,344.845302,1017.806614
2,15,271.082955,271.77084,0.030421,0.328823,0.740126,0.546129,55.941599,102.432873,106.236011,...,0.762871,4260973.0,1.0,0.581973,1055.946947,1.0,0.0,0.0,0.0,0.0
3,46,97.66587,99.260477,0.018188,0.11753,0.236866,0.168135,69.06451,410.768314,476.270716,...,9.492439,25502770.0,0.734631,90.106402,1282.041848,7.0,741.104645,2741.335065,395.708306,1335.079507
4,48,243.2826,247.6386,0.012161,0.052543,0.186955,0.107872,57.952354,537.23482,552.934897,...,2.792084,11997230.0,1.0,7.79573,1578.893332,13.0,1060.465092,8903.999307,1060.465092,4926.875652


In [10]:
for patient, delta_data in delta_radiomics_results.items():
    if len(delta_data) == 99:
        print(patient)

In [11]:
filtered_results = {patient: data for patient, data in delta_radiomics_results.items() if len(data) != 99}

In [12]:
len(filtered_results)

44

In [13]:
for patient, delta_data in delta_radiomics_results.items():
        print(patient)

id
MeshVolume (cc)
Volume (cc)
Compactness1
Compactness2
Elongation
Flatness
LeastAxisLength
MajorAxisLength
Maximum2DDiameterColumn
Maximum2DDiameterRow
Maximum2DDiameterSlice
Maximum3DDiameter
MinorAxisLength
SphericalDisproportion
Sphericity
SurfaceArea
SurfaceVolumeRatio (cc)
SUV_10Percentile
SUV_90Percentile
SUV_Energy
SUV_Entropy
SUV_InterquartileRange
SUV_Kurtosis
SUV_Maximum
SUV_MeanAbsoluteDeviation
SUV_Mean
SUV_Median
SUV_Peak
SUV_Minimum
SUV_Range
SUV_RobustMeanAbsoluteDeviation
SUV_RootMeanSquared
SUV_Skewness
SUV_StandardDeviation
SUV_TotalEnergy
SUV_Uniformity
SUV_Variance
TLG
Number of lesions
Dmax Patient (mm)
Spread Patient (mm)
Dmax Bulk (mm)
Spread Bulk (mm)


# Clinical Data

In [14]:
with open("config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

data_folder_path = cfg["paths"]["data_folder"]
clinical_path = cfg["paths"]["clinical_data"]

clinic_data = pd.read_excel(clinical_path)

In [15]:
clinic_data.head()

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_spec_2___ne.factor,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin
0,Record ID,Comments,Date 1st tumorboard meeting,Sex,,Age,Height,Weight,BMI (kg/m2),Diagnosis for which there is now a cellular th...,...,,Please specify all subsequent anti-cancer ther...,Date lab results,Hemoglobin in mmol/L,Thrombocytes in 10E9/L,Leukocytes in 10E9/L,Neutrophils in 10E9/L (automated differentiation),LDH in U/L,CRP in mg/L,Ferritin in µg/l
1,FTC-UMCG-0001,splenectomy 2012: total hip links 2015: jich...,2020-05-04,0,Male,68,180,72.6,22,1,...,Unchecked,,2020-04-28,7.1,90,6.3,4.74,169,26,NE
2,FTC-UMCG-0002,> 20 jaar geleden DVT links Longembolie links...,2020-05-07,0,Male,73,190,86,24,2,...,Unchecked,,2020-05-14,64,172,4.3,2.83,NE,47,2847
3,FTC-UMCG-0003,"2019 Nov Grootcellig B-Non-Hodgkin lymfoom,...",2020-05-18,0,Male,59,181,91,28,1,...,Unchecked,Radiotherapy CNS and Korfel 3x response evalua...,2020-05-15,7.4,389,11.9,NE,214,14,1404
4,FTC-UMCG-0004,2015 gehoorverlies 2019 aug: DLBCL ...,2020-05-14,1,Female,61,169,73,26,1,...,Unchecked,,2020-04-21,6.5,159,9.2,6.55,296,3.0,NE


In [16]:
clinic_data.shape

(69, 275)

In [17]:
clinic_data['record_id'].values

array(['Record ID', 'FTC-UMCG-0001', 'FTC-UMCG-0002', 'FTC-UMCG-0003',
       'FTC-UMCG-0004', 'FTC-UMCG-0005', 'FTC-UMCG-0006', 'FTC-UMCG-0007',
       'FTC-UMCG-0008', 'FTC-UMCG-0009', 'FTC-UMCG-0010', 'FTC-UMCG-0011',
       'FTC-UMCG-0012', 'FTC-UMCG-0013', 'FTC-UMCG-0014', 'FTC-UMCG-0015',
       'FTC-UMCG-0016', 'FTC-UMCG-0017', 'FTC-UMCG-0018', 'FTC-UMCG-0019',
       'FTC-UMCG-0020', 'FTC-UMCG-0021', 'FTC-UMCG-0022', 'FTC-UMCG-0023',
       'FTC-UMCG-0024', 'FTC-UMCG-0025', 'FTC-UMCG-0026', 'FTC-UMCG-0027',
       'FTC-UMCG-0028', 'FTC-UMCG-0029', 'FTC-UMCG-0030', 'FTC-UMCG-0031',
       'FTC-UMCG-0046', 'FTC-UMCG-0047', 'FTC-UMCG-0048', 'FTC-UMCG-0049',
       'FTC-UMCG-0050', 'FTC-UMCG-0051', 'FTC-UMCG-0052', 'FTC-UMCG-0053',
       'FTC-UMCG-0054', 'FTC-UMCG-0055', 'FTC-UMCG-0060', 'FTC-UMCG-0061',
       'FTC-UMCG-0064', 'FTC-UMCG-0065', 'FTC-UMCG-0066', 'FTC-UMCG-0067',
       'FTC-UMCG-0068', 'FTC-UMCG-0069', 'FTC-UMCG-0070', 'FTC-UMCG-0075',
       'FTC-UMCG-0076', 'FTC-

In [18]:
clinic_data['id_cleaned'] = [value[-3:] for value in clinic_data['record_id'].values]

In [19]:
clinic_data.head()

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned
0,Record ID,Comments,Date 1st tumorboard meeting,Sex,,Age,Height,Weight,BMI (kg/m2),Diagnosis for which there is now a cellular th...,...,Please specify all subsequent anti-cancer ther...,Date lab results,Hemoglobin in mmol/L,Thrombocytes in 10E9/L,Leukocytes in 10E9/L,Neutrophils in 10E9/L (automated differentiation),LDH in U/L,CRP in mg/L,Ferritin in µg/l,ID
1,FTC-UMCG-0001,splenectomy 2012: total hip links 2015: jich...,2020-05-04,0,Male,68,180,72.6,22,1,...,,2020-04-28,7.1,90,6.3,4.74,169,26,NE,001
2,FTC-UMCG-0002,> 20 jaar geleden DVT links Longembolie links...,2020-05-07,0,Male,73,190,86,24,2,...,,2020-05-14,64,172,4.3,2.83,NE,47,2847,002
3,FTC-UMCG-0003,"2019 Nov Grootcellig B-Non-Hodgkin lymfoom,...",2020-05-18,0,Male,59,181,91,28,1,...,Radiotherapy CNS and Korfel 3x response evalua...,2020-05-15,7.4,389,11.9,NE,214,14,1404,003
4,FTC-UMCG-0004,2015 gehoorverlies 2019 aug: DLBCL ...,2020-05-14,1,Female,61,169,73,26,1,...,,2020-04-21,6.5,159,9.2,6.55,296,3.0,NE,004


In [20]:
clinic_data['id_cleaned'].values

array([' ID', '001', '002', '003', '004', '005', '006', '007', '008',
       '009', '010', '011', '012', '013', '014', '015', '016', '017',
       '018', '019', '020', '021', '022', '023', '024', '025', '026',
       '027', '028', '029', '030', '031', '046', '047', '048', '049',
       '050', '051', '052', '053', '054', '055', '060', '061', '064',
       '065', '066', '067', '068', '069', '070', '075', '076', '077',
       '078', '079', '080', '081', '082', '083', '084', '085', '086',
       '087', '088', '089', '090', '096', '104'], dtype=object)

In [21]:
delta_radiomics_results['id']

0     24
1     23
2     15
3     46
4     48
5     77
6     70
7     13
8     14
9     22
10    47
11     7
12     9
13    31
14    52
15    55
16     8
17     6
18    18
19    11
20    16
21    17
22    28
23    10
24    26
25    95
26    61
27    50
28    68
29     5
Name: id, dtype: int64

In [22]:
patient_ids = clinic_data['id_cleaned'].values[1:].astype(int)

In [23]:
# find patients that are in both datasets
# values starts from 1 to skip the comment row
intercept = [id for id in delta_radiomics_results['id'] if id in patient_ids]

In [24]:
clinic_data['id_cleaned'] = ['ID'] + patient_ids.tolist()

In [25]:
clinic_data_cleaned = clinic_data[clinic_data['id_cleaned'].isin(intercept)]

In [26]:
clinic_data_cleaned.reset_index(drop=True, inplace=True)

In [27]:
clinic_data_cleaned.shape

(29, 276)

Eventually, we have 24 patients with complete clinical and delta radiomics data to work with.

**Note:** patient 95 is missing their clinical data. 

In [28]:
# we now should select features we need for modelling the baseline, without the delta radiomics
clinic_data_cleaned

Unnamed: 0,record_id,medhis_diag_comments,scr_date_tb1stmeeting,scr_sex,scr_sex.factor,scr_age,scr_height,scr_weight,scr_bmi,indication_dis_diagnosis,...,post_cart_ther_comment_spec,cli_st_lab_date,cli_st_hemoglobin,cli_st_trombocytes,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned
0,FTC-UMCG-0005,2019 mei: hemicastratie links Hematologische...,1900-01-01,0,Male,62,173.0,58.0,19,1,...,,2020-05-20,5.3,145,2.4,,NE,0.3,894,5
1,FTC-UMCG-0006,2014 Diffuus Grootcellig B-cel lymfoom st I...,2020-07-02,1,Female,58,173.0,57.0,19,2,...,"Verdere behandeling, inclusief allo-SCT in UMCU",2020-07-13,7.6,6,5.7,NE,275,9,371,6
2,FTC-UMCG-0007,2020 (feb) Stadium IV high grade B-cel lymfoom...,2020-08-10,0,Male,58,182.0,99.2,30,5,...,,2020-08-18,64.0,295,50.0,NE,885,47,2570,7
3,FTC-UMCG-0008,"2017 okt: Snel progressief DLBCL, stadium I...",2020-09-21,1,Female,72,169.0,60.0,21,1,...,Epcoritamab monotherapie,2020-09-18,5.7,321,1.9,NE,250,1.0,NE,8
4,FTC-UMCG-0009,2017 okt gastro- en colonoscopie ivm chroni...,2020-09-02,0,Male,48,186.0,106.0,31,2,...,,2020-09-02,7.2,324,8.7,NE,283,10,54,9
5,FTC-UMCG-0010,2020 feb nefrostomiekatheter rechts ivm hyd...,2020-10-19,0,Male,54,181.0,69.0,21,1,...,,2020-10-19,5.4,382,26.3,NE,417,206,4786,10
6,FTC-UMCG-0011,2020-03: koorts zonder lokaliserende klachten....,2020-10-15,0,Male,34,185.0,86.1,25,5,...,,2020-10-19,6.8,442,5.3,3.87,992,17,485,11
7,FTC-UMCG-0013,Hematologische voorgeschiedenis: 2013 (mei) ...,2020-11-23,0,Male,46,187.0,97.0,28,2,...,,2020-11-25,5.8,253,5.7,4.49,313,32,559,13
8,FTC-UMCG-0014,020 (mei) Diffuus grootcellig B-cel lymfoom st...,2020-12-17,0,Male,70,190.0,96.0,27,1,...,,2020-12-15,6.8,497,6.2,NE,484,225,1535,14
9,FTC-UMCG-0015,ematologische voorgeschiedenis: 2005 (dec) s...,2021-01-07,1,Female,66,162.0,56.0,21,2,...,,2021-01-08,6.4,432,7.6,6.10,235,8,569,15


In [29]:
# dropping columns with all NaN values
clinic_data_cleaned = clinic_data_cleaned.dropna(axis=1, how='all')

In [30]:
clinic_data_cleaned.shape

(29, 266)

In [31]:
# we don't need factor columns for modelling as they are encoded already
factors = [factor for factor in clinic_data_cleaned.columns if 'factor' in factor]

In [32]:
comments = [comm for comm in clinic_data_cleaned.columns if 'comment' in comm]

In [33]:
comments

['medhis_diag_comments', 'post_cart_ther_comment_spec']

In [34]:
locations = [loc for loc in clinic_data_cleaned.columns if 'loc' in loc]

In [35]:
locations

['indication_extran_site_loc___1',
 'indication_extran_site_loc___1.factor',
 'indication_extran_site_loc___2',
 'indication_extran_site_loc___2.factor',
 'indication_extran_site_loc___3',
 'indication_extran_site_loc___3.factor',
 'indication_extran_site_loc___21',
 'indication_extran_site_loc___21.factor',
 'indication_extran_site_loc___4',
 'indication_extran_site_loc___4.factor',
 'indication_extran_site_loc___5',
 'indication_extran_site_loc___5.factor',
 'indication_extran_site_loc___6',
 'indication_extran_site_loc___6.factor',
 'indication_extran_site_loc___7',
 'indication_extran_site_loc___7.factor',
 'indication_extran_site_loc___8',
 'indication_extran_site_loc___8.factor',
 'indication_extran_site_loc___9',
 'indication_extran_site_loc___9.factor',
 'indication_extran_site_loc___10',
 'indication_extran_site_loc___10.factor',
 'indication_extran_site_loc___11',
 'indication_extran_site_loc___11.factor',
 'indication_extran_site_loc___12',
 'indication_extran_site_loc___12.

In [36]:
locations

['indication_extran_site_loc___1',
 'indication_extran_site_loc___1.factor',
 'indication_extran_site_loc___2',
 'indication_extran_site_loc___2.factor',
 'indication_extran_site_loc___3',
 'indication_extran_site_loc___3.factor',
 'indication_extran_site_loc___21',
 'indication_extran_site_loc___21.factor',
 'indication_extran_site_loc___4',
 'indication_extran_site_loc___4.factor',
 'indication_extran_site_loc___5',
 'indication_extran_site_loc___5.factor',
 'indication_extran_site_loc___6',
 'indication_extran_site_loc___6.factor',
 'indication_extran_site_loc___7',
 'indication_extran_site_loc___7.factor',
 'indication_extran_site_loc___8',
 'indication_extran_site_loc___8.factor',
 'indication_extran_site_loc___9',
 'indication_extran_site_loc___9.factor',
 'indication_extran_site_loc___10',
 'indication_extran_site_loc___10.factor',
 'indication_extran_site_loc___11',
 'indication_extran_site_loc___11.factor',
 'indication_extran_site_loc___12',
 'indication_extran_site_loc___12.

In [37]:
# these are highly correlated features with bmi
correlated = ['scr_height', 'scr_weight']

* scr_age (continuous) correlates to indication_age_60 (binary), we Keep scr_age (continuous). It retains more information and doesn't arbitrarily cut at 60.  
* indication_ldh_uln: we have the exact value for ldh  
* indication_extran_sites, indication_extran_invol, indication_extranodal_nr	These are highly related. we keep indication_extranodal_nr (exact number). It is the most granular quantitative measure.

In [38]:
indicators = ['indication_ldh_uln','indication_age_60','indication_extran_sites', 'indication_extran_invol']

In [39]:
# cause of death columns are not needed
cause_of_death = [cause for cause in clinic_data_cleaned.columns if '_cause' in cause]

In [40]:
cause_of_death

['surv_death_cause',
 'surv_death_cause.factor',
 'surv_death_cause_oth',
 'surv_death_cause_spec',
 'surv_death_contrib_cause___1',
 'surv_death_contrib_cause___1.factor',
 'surv_death_contrib_cause___2',
 'surv_death_contrib_cause___2.factor',
 'surv_death_contrib_cause___3',
 'surv_death_contrib_cause___3.factor',
 'surv_death_contrib_cause___4',
 'surv_death_contrib_cause___4.factor',
 'surv_death_contrib_cause___5',
 'surv_death_contrib_cause___5.factor',
 'surv_death_contrib_cause___6',
 'surv_death_contrib_cause___6.factor',
 'surv_death_contrib_cause___7',
 'surv_death_contrib_cause___7.factor',
 'surv_death_contrib_cause___8',
 'surv_death_contrib_cause___8.factor',
 'surv_death_contrib_cause___9',
 'surv_death_contrib_cause___9.factor',
 'surv_death_contrib_cause___10',
 'surv_death_contrib_cause___10.factor',
 'surv_death_contrib_cause___11',
 'surv_death_contrib_cause___11.factor',
 'surv_death_contrib_cause___12',
 'surv_death_contrib_cause___12.factor',
 'surv_death_contr

**NOTE:** indication_dis_diagnosis must be one-hot encoded. as the disease is a nominal categorical feature.

In [41]:
disease = pd.get_dummies(clinic_data_cleaned['indication_dis_diagnosis.factor']).astype(int)

In [42]:
disease

Unnamed: 0,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
0,1,0,0,0
1,0,0,0,1
2,0,0,1,0
3,1,0,0,0
4,0,0,0,1
5,1,0,0,0
6,0,0,1,0
7,0,0,0,1
8,1,0,0,0
9,0,0,0,1


In [43]:
drop_columns = cause_of_death + factors + ['record_id','scr_date_tb1stmeeting', 'indication_dis_diagnosis'] + comments + locations + correlated + indicators
clinic_data_cleaned.drop(columns=drop_columns,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinic_data_cleaned.drop(columns=drop_columns,inplace=True)


In [44]:
clinic_data_cleaned.shape

(29, 97)

In [45]:
clinic_data_cleaned = pd.concat([clinic_data_cleaned, disease], axis=1)

In [46]:
clinic_data_cleaned

Unnamed: 0,scr_sex,scr_age,scr_bmi,total_num_priortherapylines_fl,total_num_priortherapylines_aggressive,indication_priorsct,indication_whops,indication_bulkydisease,indication_stage,indication_extranodal_nr,...,cli_st_leukocytes,cli_st_neutrophils,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
0,0,62,19,,2,4,0,0,4,3.0,...,2.4,,NE,0.3,894,5,1,0,0,0
1,1,58,19,0.0,2,1,0,0,3,,...,5.7,NE,275,9,371,6,0,0,0,1
2,0,58,30,,2,4,0,0,4,,...,50.0,NE,885,47,2570,7,0,0,1,0
3,1,72,21,,2,4,0,0,4,2.0,...,1.9,NE,250,1.0,NE,8,1,0,0,0
4,0,48,31,2.0,2,4,0,0,4,2.0,...,8.7,NE,283,10,54,9,0,0,0,1
5,0,54,21,,2,4,0,1,2,,...,26.3,NE,417,206,4786,10,1,0,0,0
6,0,34,25,,2,4,0,1,4,2.0,...,5.3,3.87,992,17,485,11,0,0,1,0
7,0,46,28,2.0,2,4,0,0,4,2.0,...,5.7,4.49,313,32,559,13,0,0,0,1
8,0,70,27,,2,4,0,1,4,,...,6.2,NE,484,225,1535,14,1,0,0,0
9,1,66,21,4.0,1,4,0,1,1,,...,7.6,6.10,235,8,569,15,0,0,0,1


In [47]:
clinic_data_cleaned.replace({'NE': np.nan}, inplace=True)

  clinic_data_cleaned.replace({'NE': np.nan}, inplace=True)


In [48]:
clinic_data_cleaned.describe()

Unnamed: 0,id_cleaned,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
count,29.0,29.0,29.0,29.0,29.0
mean,30.241379,0.37931,0.103448,0.103448,0.413793
std,21.921378,0.493804,0.309934,0.309934,0.50123
min,5.0,0.0,0.0,0.0,0.0
25%,13.0,0.0,0.0,0.0,0.0
50%,23.0,0.0,0.0,0.0,0.0
75%,48.0,1.0,0.0,0.0,1.0
max,77.0,1.0,1.0,1.0,1.0


In [49]:
nans = clinic_data_cleaned.isna().sum().sort_values(ascending=False)

In [50]:
# columns with more than 12 nans, which is half the data for the patients we have
nans[nans > 12]

post_car_ther_other                    28
surv_death_contrib_other               28
indication_dis_lymsubtype_cns_onset    28
surv_death_contrib_infect              28
tr_car_preaph_bridg_type               27
tr_car_bridg_reg_oth                   27
indication_extranodal_nr               22
total_num_priortherapylines_fl         18
ae_summ_crs_start_gr2                  18
post_cart_ther_startdate               16
ae_summ_icans_start_gr2                16
ae_summ_icans_stop_v2                  14
ae_summ_icans_res_v2                   14
ae_summ_icans_start_v2                 14
ae_summ_icans_highestgrade_v2          14
cli_st_neutrophils                     13
surv_death_date                        13
dtype: int64

In [51]:
drop_nans = nans[nans > 12].index

In [52]:
clinic_data_cleaned = clinic_data_cleaned.drop(columns=drop_nans)

In [53]:
clinic_data_cleaned.shape

(29, 84)

In [54]:
clinic_data_cleaned.select_dtypes(include=['object']).columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___1',
       'tr_car_preaph_bridg_reg___2', 'tr_car_preaph_bridg_reg___3',
       'tr_car_preaph_bridg_reg___4', 'tr_car_preaph_bridg_reg___5',
       'tr_car_preaph_bridg_reg___6', 'tr_car_preaph_bridg_reg___7',
       'tr_car_preaph_bridg_reg___8', 'tr_car_preaph_bridg_reg___9',
       'tr_car_preaph_bridg_reg___10', 'tr_car_preaph_bridg_reg___11',
       'tr_car_preaph_bridg_reg___12', 'tr_car_preaph_bridg_reg___na',
       'tr_car_preaph_bridg_reg___ne', 'tr_car_br', 'tr_car_bridg_type',
       'tr_car_bridg_reg___1', 'tr_car_bridg_reg___2', 'tr_car_bridg_reg___3',
  

In [55]:
clinic_data_cleaned.dtypes

scr_sex                                   object
scr_age                                   object
scr_bmi                                   object
total_num_priortherapylines_aggressive    object
indication_priorsct                       object
                                           ...  
id_cleaned                                 int64
DLBCL                                      int64
HGBCL DH/TH                                int64
HGBCL NOS                                  int64
tFL                                        int64
Length: 84, dtype: object

In [56]:
clinic_data_cleaned.columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___1',
       'tr_car_preaph_bridg_reg___2', 'tr_car_preaph_bridg_reg___3',
       'tr_car_preaph_bridg_reg___4', 'tr_car_preaph_bridg_reg___5',
       'tr_car_preaph_bridg_reg___6', 'tr_car_preaph_bridg_reg___7',
       'tr_car_preaph_bridg_reg___8', 'tr_car_preaph_bridg_reg___9',
       'tr_car_preaph_bridg_reg___10', 'tr_car_preaph_bridg_reg___11',
       'tr_car_preaph_bridg_reg___12', 'tr_car_preaph_bridg_reg___na',
       'tr_car_preaph_bridg_reg___ne', 'tr_car_br', 'tr_car_bridg_type',
       'tr_car_bridg_reg___1', 'tr_car_bridg_reg___2', 'tr_car_bridg_reg___3',
  

In [57]:
# Assuming clinic_data_filtered is the DataFrame you want to convert
date_columns = [date for date in clinic_data_cleaned.columns if ('date' in date) or ('start' in date) or ('stop' in date)]
# 1. Use convert_dtypes() for general automatic inference
# This function automatically converts to best possible dtypes (e.g., object to string, int64 to Int64, float64 to Float64)
# It's particularly useful for handling missing values using pandas' nullable dtypes (e.g., pd.NA).
print("Applying general type conversion...")

# 2. Force remaining object columns that look like numbers to numeric
for col in clinic_data_cleaned.columns:
        if col not in date_columns:
            # Attempt to convert to numeric.
            # this is to fix a typo in columns where , is used instead of .
            if clinic_data_cleaned[col].dtype == 'object':
                clinic_data_cleaned[col] = pd.to_numeric(clinic_data_cleaned[col].str.replace(',','.'), errors='raise')
            print(f"  Converted column '{col}' to numeric.")
        else: 
            clinic_data_cleaned[col] = pd.to_datetime(clinic_data_cleaned[col], errors='coerce')
            print(f"  Converted column '{col}' to datetime.")
        
print("\nAutomatic type conversion complete.")

Applying general type conversion...
  Converted column 'scr_sex' to numeric.
  Converted column 'scr_age' to numeric.
  Converted column 'scr_bmi' to numeric.
  Converted column 'total_num_priortherapylines_aggressive' to numeric.
  Converted column 'indication_priorsct' to numeric.
  Converted column 'indication_whops' to numeric.
  Converted column 'indication_bulkydisease' to numeric.
  Converted column 'indication_stage' to numeric.
  Converted column 'indication_pri_refr' to numeric.
  Converted column 'indication_sec_refr' to numeric.
  Converted column 'indication_res_last_ther' to numeric.
  Converted column 'indication_res_last_ther_spec' to numeric.
  Converted column 'indication_dis_lymsubtype_cns' to numeric.
  Converted column 'indication_ind_date' to datetime.
  Converted column 'tr_car_preaph_br' to numeric.
  Converted column 'tr_car_preaph_bridg_reg___1' to numeric.
  Converted column 'tr_car_preaph_bridg_reg___2' to numeric.
  Converted column 'tr_car_preaph_bridg_reg

In [58]:
clinic_data_cleaned.dtypes

scr_sex                                   int64
scr_age                                   int64
scr_bmi                                   int64
total_num_priortherapylines_aggressive    int64
indication_priorsct                       int64
                                          ...  
id_cleaned                                int64
DLBCL                                     int64
HGBCL DH/TH                               int64
HGBCL NOS                                 int64
tFL                                       int64
Length: 84, dtype: object

In [59]:
variances = clinic_data_cleaned.select_dtypes(include=np.number).var().sort_values()

In [60]:
# zero variance columns are not useful for modelling so I am dropping them
zero_var = variances[variances == 0].index

In [61]:
zero_var

Index(['tr_car_preaph_bridg_reg___11', 'tr_car_bridg_reg___1',
       'tr_car_bridg_reg___4', 'tr_car_bridg_reg___5',
       'tr_car_preaph_bridg_reg___ne', 'tr_car_preaph_bridg_reg___na',
       'tr_car_preaph_bridg_reg___12', 'ae_summ_crs_res_v2',
       'tr_car_preaph_bridg_reg___10', 'tr_car_preaph_bridg_reg___9',
       'tr_car_preaph_bridg_reg___7', 'tr_car_preaph_bridg_reg___6',
       'tr_car_preaph_bridg_reg___5', 'tr_car_preaph_bridg_reg___4',
       'tr_car_bridg_reg___2', 'tr_car_preaph_bridg_reg___3',
       'tr_car_preaph_bridg_reg___1', 'tr_car_ld_type', 'tr_car_bridg_reg___6',
       'tr_car_bridg_reg___9', 'post_cart_ther_spec_2___4', 'tr_car_ld',
       'tr_car_bridg_reg___10', 'tr_car_bridg_reg___11',
       'tr_car_preaph_bridg_reg___2', 'post_cart_ther_spec_2___na',
       'tr_car_bridg_reg___na', 'post_cart_ther_spec_2___ne',
       'tr_car_bridg_reg___ne'],
      dtype='object')

In [62]:
clinic_data_cleaned = clinic_data_cleaned.drop(columns=zero_var)

In [63]:
clinic_data_cleaned.shape

(29, 55)

In [64]:
clinic_data_cleaned.head()

Unnamed: 0,scr_sex,scr_age,scr_bmi,total_num_priortherapylines_aggressive,indication_priorsct,indication_whops,indication_bulkydisease,indication_stage,indication_pri_refr,indication_sec_refr,...,cli_st_trombocytes,cli_st_leukocytes,cli_st_ldh,cli_st_crp,cli_st_ferritin,id_cleaned,DLBCL,HGBCL DH/TH,HGBCL NOS,tFL
0,0,62,19,2,4,0,0,4,1,1,...,145,2.4,,0.3,894.0,5,1,0,0,0
1,1,58,19,2,1,0,0,3,0,0,...,6,5.7,275.0,9.0,371.0,6,0,0,0,1
2,0,58,30,2,4,0,0,4,1,1,...,295,5.0,885.0,47.0,2570.0,7,0,0,1,0
3,1,72,21,2,4,0,0,4,0,1,...,321,1.9,250.0,1.0,,8,1,0,0,0
4,0,48,31,2,4,0,0,4,1,1,...,324,8.7,283.0,10.0,54.0,9,0,0,0,1


In [65]:
clinic_data_cleaned.shape

(29, 55)

In [66]:
clinic_data_cleaned.columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___8', 'tr_car_br',
       'tr_car_bridg_type', 'tr_car_bridg_reg___3', 'tr_car_bridg_reg___7',
       'tr_car_bridg_reg___8', 'tr_car_bridg_reg___12', 'tr_car_inf_adm_date',
       'tr_car_ld_start', 'tr_car_inf_date', 'tr_car_inf_discharge_date',
       'ae_summ_start_date_v2', 'ae_summ_crs_v2', 'ae_summ_highestgrade_v2',
       'ae_summ_crs_start_v2', 'ae_summ_crs_stop_v2', 'ae_summ_icans_v2',
       'surv_bestresponse_car', 'surv_time_bestresponse_car',
       'surv_prog_after_car', 'surv_prog_date', 'surv_status', 'surv_date',
       'post_cart_ther', 'post_cart_ther_

In [67]:
# Impute missing values with the median for numeric columns
for col in clinic_data_cleaned.select_dtypes(include=np.number).columns:
    median_value = clinic_data_cleaned[col].median()
    clinic_data_cleaned[col].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clinic_data_cleaned[col].fillna(median_value, inplace=True)


In [68]:
clinic_data_cleaned.isna().sum().sort_values(ascending=False)

surv_prog_date                            12
ae_summ_crs_start_v2                       3
ae_summ_crs_stop_v2                        3
post_cart_ther_spec_2___3                  0
ae_summ_icans_v2                           0
surv_bestresponse_car                      0
surv_time_bestresponse_car                 0
surv_prog_after_car                        0
surv_status                                0
surv_date                                  0
post_cart_ther                             0
post_cart_ther_spec_2___1                  0
post_cart_ther_spec_2___2                  0
scr_sex                                    0
ae_summ_highestgrade_v2                    0
cli_st_lab_date                            0
cli_st_hemoglobin                          0
cli_st_trombocytes                         0
cli_st_leukocytes                          0
cli_st_ldh                                 0
cli_st_crp                                 0
cli_st_ferritin                            0
id_cleaned

In [69]:
clinic_data_cleaned.columns

Index(['scr_sex', 'scr_age', 'scr_bmi',
       'total_num_priortherapylines_aggressive', 'indication_priorsct',
       'indication_whops', 'indication_bulkydisease', 'indication_stage',
       'indication_pri_refr', 'indication_sec_refr',
       'indication_res_last_ther', 'indication_res_last_ther_spec',
       'indication_dis_lymsubtype_cns', 'indication_ind_date',
       'tr_car_preaph_br', 'tr_car_preaph_bridg_reg___8', 'tr_car_br',
       'tr_car_bridg_type', 'tr_car_bridg_reg___3', 'tr_car_bridg_reg___7',
       'tr_car_bridg_reg___8', 'tr_car_bridg_reg___12', 'tr_car_inf_adm_date',
       'tr_car_ld_start', 'tr_car_inf_date', 'tr_car_inf_discharge_date',
       'ae_summ_start_date_v2', 'ae_summ_crs_v2', 'ae_summ_highestgrade_v2',
       'ae_summ_crs_start_v2', 'ae_summ_crs_stop_v2', 'ae_summ_icans_v2',
       'surv_bestresponse_car', 'surv_time_bestresponse_car',
       'surv_prog_after_car', 'surv_prog_date', 'surv_status', 'surv_date',
       'post_cart_ther', 'post_cart_ther_

In [70]:
# there are date related column that still have nans, but we will not use them for modelling as we can't impute them easily
# also cli_st_lab_date is not needed
date_columns = [
    'indication_ind_date',
    'tr_car_inf_adm_date',
    'tr_car_ld_start',
    'tr_car_inf_date',
    'tr_car_inf_discharge_date',
    'ae_summ_start_date_v2',
    'ae_summ_crs_start_v2',
    'ae_summ_crs_stop_v2',
    'surv_prog_date',
    'surv_date',
    'cli_st_lab_date'
]

clinic_data_cleaned.drop(columns=date_columns, inplace=True)


In [71]:
clinic_data_cleaned.shape

(29, 44)

In [72]:
clinic_data_cleaned.isna().sum().sum() # confirming no nans remain

0

# Model training and evaluation

## Define target (y) and predictor matrix (X)

We formulate the problem as a **binary classification task** where the outcome variable is:

- **`y = surv_status`**
  - `0` = alive / censored  
  - `1` = death event  

### Important considerations

- **Identifier columns** (e.g. `id_cleaned`) are removed from the feature matrix `X` because they do not carry clinical information and may introduce spurious patterns.
- **Other survival- or response-related variables** are also excluded from `X` to avoid *conceptual leakage*.  
  These variables represent outcomes or follow-up information and would artificially inflate model performance if used as predictors.

After this step:
- `y` contains only the target labels.
- `X` contains baseline predictor variables only and may still include missing values, which will be handled later within a modeling pipeline.


In [73]:
# define y(label) and X
y = clinic_data_cleaned["surv_status"].astype(int)
exclude_from_X = [
    "surv_status",
    "id_cleaned",
    "surv_bestresponse_car",
    "surv_time_bestresponse_car",
    "surv_prog_after_car",
]

# Keep only columns that actually exist
exclude_from_X = [c for c in exclude_from_X if c in clinic_data_cleaned.columns]
X = clinic_data_cleaned.drop(columns=exclude_from_X)

# Sanity checks
assert y.isna().sum() == 0, "Target contains missing values (labels must not be imputed)."
assert X.shape[0] == y.shape[0], "X and y row counts do not match."


In [74]:
print("X shape:", X.shape)                 # expect: (29, number_of_features)
print("y distribution:\n", y.value_counts())
print("object columns:", X.select_dtypes(include="object").shape[1])
print("Total NaNs in X:", X.isna().sum().sum())


X shape: (29, 39)
y distribution:
 surv_status
0    16
1    13
Name: count, dtype: int64
object columns: 0
Total NaNs in X: 0


- **`object columns: 0`**  
  There are no remaining non-numeric (object/string) columns in X.  
  This is required because models like SVM (and preprocessing steps like StandardScaler) expect numeric input.

- **`Total NaNs in X: 20`**  
  The feature matrix still contains 20 missing values across all predictors.  
  This is expected at this stage because we do not impute missing values globally (which could cause data leakage).  
  Instead, missing values will be handled later using an imputer inside the scikit-learn Pipeline, fitted only on the training data.


In [75]:
# split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# quick checks
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts())
print("y_test distribution:\n", y_test.value_counts())

Train shape: (23, 39) Test shape: (6, 39)
y_train distribution:
 surv_status
0    13
1    10
Name: count, dtype: int64
y_test distribution:
 surv_status
0    3
1    3
Name: count, dtype: int64


In [76]:
# making pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svm_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("svm", SVC(
        kernel="rbf",
        class_weight="balanced",
        random_state=42
    ))
])

In [77]:
# fitting (set baseline)
svm_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [78]:
# test
y_pred = svm_pipeline.predict(X_test)

In [79]:
y_pred

array([0, 1, 0, 1, 1, 0])

In [80]:
# evaluation
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6666666666666666
Balanced accuracy: 0.6666666666666666
Confusion matrix:
 [[2 1]
 [1 2]]
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.67      0.67      0.67         3

    accuracy                           0.67         6
   macro avg       0.67      0.67      0.67         6
weighted avg       0.67      0.67      0.67         6



Due to the very small test set size, these results have high variance and should be interpreted with caution.

## GridSearchCV

Next, we tune the SVM hyperparameters using **GridSearchCV** with **stratified cross-validation**.  
This searches over a small, predefined set of `C` and `gamma` values and selects the configuration that maximizes **balanced accuracy** on the training folds (to avoid bias and reduce data leakage).

In [81]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Stratified CV is important for small datasets to preserve class ratios in each fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    "svm__kernel": ["rbf"],
    "svm__C": [0.1, 1, 10, 100],
    "svm__gamma": ["scale", 0.01, 0.1, 1],
}

grid = GridSearchCV(
    estimator=svm_pipeline,        
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    refit=True
)

# fit the grid on the training set only
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV balanced accuracy (train CV):", grid.best_score_)

best_model = grid.best_estimator_

# Evaluate best model on the held-out test set
y_pred_grid = best_model.predict(X_test)

print("Test accuracy (tuned):", accuracy_score(y_test, y_pred_grid))
print("Test balanced accuracy (tuned):", balanced_accuracy_score(y_test, y_pred_grid))
print("Confusion matrix (tuned):\n", confusion_matrix(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid))

Best params: {'svm__C': 1, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Best CV balanced accuracy (train CV): 0.55
Test accuracy (tuned): 0.6666666666666666
Test balanced accuracy (tuned): 0.6666666666666666
Confusion matrix (tuned):
 [[2 1]
 [1 2]]
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.67      0.67      0.67         3

    accuracy                           0.67         6
   macro avg       0.67      0.67      0.67         6
weighted avg       0.67      0.67      0.67         6



Hyperparameter tuning via GridSearchCV did not improve performance. This suggests that the limitation lies in the formulation of survival as a binary classification problem and the small sample size, rather than suboptimal model parameters

The SVM results provide an initial baseline, but performance may be influenced by the specific assumptions of a margin-based model. To assess whether the observed behavior is model-dependent, we next evaluate a Random Forest classifier, which relies on a fundamentally different learning mechanism and can capture non-linear feature interactions.

In [82]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("rf", RandomForestClassifier(
        n_estimators=100,
        max_depth=3,          # with 29 patients, I used depth=3
        class_weight="balanced",
        random_state=42
    ))
])

In [83]:
rf_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('imputer', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [85]:
y_pred_rf = rf_pipeline.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 0])

In [86]:
print("Accuracy (RF):", accuracy_score(y_test, y_pred_rf))
print("Balanced accuracy (RF):", balanced_accuracy_score(y_test, y_pred_rf))
print("Confusion matrix (RF):\n", confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy (RF): 0.6666666666666666
Balanced accuracy (RF): 0.6666666666666666
Confusion matrix (RF):
 [[2 1]
 [1 2]]
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.67      0.67      0.67         3

    accuracy                           0.67         6
   macro avg       0.67      0.67      0.67         6
weighted avg       0.67      0.67      0.67         6



## Interpretation of classification results

Two classifiers with fundamentally different modeling assumptions were evaluated. Despite these differences, both models achieved similar and only moderate performance. This consistency suggests that the observed limitation is unlikely to be driven by the specific choice of classifier. Instead, it indicates that the current formulation of the task—using a binary survival label—may not fully capture the available information. This motivates the consideration of alternative modeling strategies that can more directly account for time-to-event characteristics.