
# Preprocessing and preparing the data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
# Load the survival table from CSV and display summary statistics for it.
survival = pd.read_csv('data/survival.csv')
survival.describe()

Unnamed: 0,state,Days
count,1538.0,1538.0
mean,0.172302,568.007152
std,0.377765,605.285336
min,0.0,15.0
25%,0.0,242.0
50%,0.0,407.0
75%,0.0,686.75
max,1.0,7126.0


# LUNG Dataset
## Numerical features

In [38]:

# Load the numerical lung features (first CSV column becomes the index) and
# discard the two centre-identifying columns in the same step.
n_lung = pd.read_csv('data/Lung/numerical.csv', index_col=0).drop(
    columns=['center_id', 'center_name']
)
print(n_lung.describe())
# Alphabetical list of the remaining column names.
sorted(n_lung.columns)

       Lung_Ipsi_Already_Subtracted_V5Gy   LU_DVH_28  TotalNumberOfNotes  \
count                         100.000000  628.000000          773.000000   
mean                           65.962470    7.146262            9.058215   
std                            17.942499    3.565954            2.946936   
min                            13.856340    0.014666            0.000000   
25%                            55.873377    4.736149            7.000000   
50%                            67.745435    6.597207            9.000000   
75%                            77.869534    9.116760           11.000000   
max                           100.000000   28.990862           27.000000   

       Lung_Contra_Already_Subtracted_V20Gy  \
count                             15.000000   
mean                              16.102849   
std                                5.708704   
min                                4.585702   
25%                               13.074970   
50%                              

['LU_DVH_23',
 'LU_DVH_24',
 'LU_DVH_25',
 'LU_DVH_26',
 'LU_DVH_27',
 'LU_DVH_28',
 'LungEsophagitisTotal',
 'LungEsophagitisWithGrade',
 'LungPneumonitisTotal',
 'LungPneumonitisWithGrade',
 'Lung_Contra_Already_Subtracted_DMean',
 'Lung_Contra_Already_Subtracted_V20Gy',
 'Lung_Contra_Already_Subtracted_V5Gy',
 'Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_DMean',
 'Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_V20Gy',
 'Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_V5Gy',
 'Lung_Ipsi_Already_Subtracted_DMean',
 'Lung_Ipsi_Already_Subtracted_V20Gy',
 'Lung_Ipsi_Already_Subtracted_V5Gy',
 'NumberOfNotesWithToxicityInitialized',
 'TotalNumberOfNotes',
 'cancer_type',
 'vha_id']

In [39]:
# Fraction of missing values per column. The mean of the boolean mask divides
# by the actual number of rows, so no hard-coded row count is needed.
n_lung.isnull().mean()

Lung_Ipsi_Already_Subtracted_V5Gy                                                                                      0.870634
LU_DVH_28                                                                                                              0.187581
TotalNumberOfNotes                                                                                                     0.000000
Lung_Contra_Already_Subtracted_V20Gy                                                                                   0.980595
Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_DMean    0.256145
LungEsophagitisWithGrade                                                                                               0.000000
LU_DVH_24                                                                                                              0.187581
NumberOfNotesWithToxicityInitialized                                                                    

In [40]:
# Drop columns whose share of missing values is above a threshold (80% by default)

def dropping_cols(df, p=80):
    """Return ``df`` without the columns that are more than ``p`` percent NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter. It is not modified; a new frame is returned.
    p : int or float, default 80
        Threshold in percent. A column is removed only when its percentage
        of missing values is strictly greater than ``p``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the sparse columns removed.
    """
    missing_pct = df.isna().sum() / len(df) * 100
    sparse_cols = [name for name, pct in missing_pct.items() if pct > p]
    return df.drop(columns=sparse_cols)

# Remove the numerical columns that are more than 80% missing.
n_lung = dropping_cols(n_lung, 80)
# Remaining fraction of missing values per column. The mean of the boolean
# mask replaces the hard-coded row count, so it stays correct if the data changes.
n_lung.isnull().mean()

LU_DVH_28                                                                                                              0.187581
TotalNumberOfNotes                                                                                                     0.000000
Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_DMean    0.256145
LungEsophagitisWithGrade                                                                                               0.000000
LU_DVH_24                                                                                                              0.187581
NumberOfNotesWithToxicityInitialized                                                                                   0.000000
LU_DVH_25                                                                                                              0.253558
LungPneumonitisTotal                                                                                    

In [41]:
# Fill the missing numeric values with k-nearest-neighbours imputation (k=3)
from sklearn.impute import KNNImputer
numeric_df = n_lung.select_dtypes(include='number')
imputer = KNNImputer(n_neighbors=3)
# fit_transform returns a bare ndarray, so the row index has to be passed back
# explicitly; otherwise the frame gets a fresh 0..n-1 index and any
# index-aligned assignment or join against n_lung could pair the wrong rows.
imputed_df = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)
# NOTE(review): if 'vha_id' is stored as a number it is already part of
# numeric_df, i.e. it is imputed here and would enter any distance computed on
# imputed_df - confirm whether it should be excluded before imputation.
# imputed_df['vha_id'] = n_lung['vha_id']
print(imputed_df)

     LU_DVH_28  TotalNumberOfNotes  \
0     6.858694                 8.0   
1     6.677710                 9.0   
2     4.935014                10.0   
3     5.636330                 9.0   
4     7.276093                10.0   
..         ...                 ...   
768   1.298097                 9.0   
769   4.887026                 4.0   
770   6.826124                 6.0   
771   3.590745                 6.0   
772   2.795710                10.0   

     Lung_Ipsi + Lung_Contra - Lung_Subtraction_Structure ((Lung_Ipsi + Lung_Contra) - Lung_Subtraction_Structure)_DMean  \
0                                            15.807393                                                                     
1                                            14.487703                                                                     
2                                            16.569222                                                                     
3                                            14

## Similarity matrix

In [47]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between the rows of imputed_df (condensed form)
distances = pdist(imputed_df, metric='euclidean')

# Convert the distances to a square similarity matrix via 1 / (1 + d):
# a distance of 0 maps to 1 and larger distances map towards 0
SM = 1 / (1 + squareform(distances))

# Print the matrix type, write the matrix to CSV, and display its shape
print(type(SM))
pd.DataFrame(SM).to_csv('data/Lung/SM/numericalSM.csv')
SM.shape

<class 'numpy.ndarray'>


(773, 773)

# Processing Categorical DF

In [52]:
# Quality-measure columns that are removed from the categorical table below
Lung_QMs = [
    'QualityMeasure1', 'QualityMeasure10', 'QualityMeasure11', 'QualityMeasure12',
    'QualityMeasure13', 'QualityMeasure14', 'QualityMeasure15', 'QualityMeasure15Chemo',
    'QualityMeasure15RT', 'QualityMeasure15Surgery', 'QualityMeasure16', 'QualityMeasure17',
    'QualityMeasure18', 'QualityMeasure19', 'QualityMeasure19_color', 'QualityMeasure2',
    'QualityMeasure20', 'QualityMeasure21A', 'QualityMeasure21B', 'QualityMeasure22',
    'QualityMeasure23', 'QualityMeasure24', 'QualityMeasure27', 'QualityMeasure3',
    'QualityMeasure4', 'QualityMeasure5', 'QualityMeasure6', 'QualityMeasure7',
    'QualityMeasure8A', 'QualityMeasure8B', 'QualityMeasure9',
]

# Load the categorical lung features; the first CSV column becomes the index
c_lung = pd.read_csv('data/Lung/categorical.csv', index_col=0)

# Remove the quality measures plus 'center_name', 'cancer_type' and 'vha_id'
c_lung = c_lung.drop(columns=Lung_QMs + ['center_name', 'cancer_type', 'vha_id'])

# Work on a copy and list the distinct values found in every remaining feature
df = c_lung.copy()
for column in df:
    unique_values = df[column].unique()
    print(f"Feature '{column}': {unique_values}")

Feature 'Lung_Contra - Lung_Subtraction_Structure (Lung_Contra - Lung_Subtraction_Structure)_V20Gy': [nan 'Red' 'Green' 'Yellow']
Feature 'Lung_Contra - Lung_Subtraction_Structure (Lung_Contra - Lung_Subtraction_Structure)_DMean': [nan 'Green' 'Red' 'Yellow']
Feature 'SpinalCord_DMax': ['Green' 'Yellow' nan 'Red']
Feature 'Esophagus_DMean': ['Green' 'Yellow' 'Red' nan]
Feature 'Lung_Ipsi_Already_Subtracted + Lung_Contra_Already_Subtracted (Lung_Ipsi_Already_Subtracted + Lung_Contra_Already_Subtracted)_V5Gy': [nan 'Red' 'Green' 'Yellow']
Feature 'Lung_Total - Lung_Subtraction_Structure (Lung_Total - Lung_Subtraction_Structure)_V5Gy': ['Yellow' nan 'Green' 'Red']
Feature 'BrachialPlexus_DMax': [nan 'Green' 'Yellow' 'Red']
Feature 'Esophagus_DMax': ['Green' 'Yellow' 'Red' nan]
Feature 'Esophagus_D0.035cc': ['Green' 'Yellow' 'Red' nan]
Feature 'Heart_V45Gy': ['Green' 'Yellow' 'Red' nan]
Feature 'BrachialPlexus_D0.035cc': [nan 'Green' 'Yellow' 'Red']
Feature 'Esophagus_V60Gy': ['Green' 'Red

In [49]:
# Fraction of missing values per column. The mean of the boolean mask divides
# by the actual number of rows, so no hard-coded row count is needed.
c_lung.isnull().mean()

Lung_Contra - Lung_Subtraction_Structure (Lung_Contra - Lung_Subtraction_Structure)_V20Gy                                              0.187581
Lung_Contra - Lung_Subtraction_Structure (Lung_Contra - Lung_Subtraction_Structure)_DMean                                              0.187581
SpinalCord_DMax                                                                                                                        0.007762
Esophagus_DMean                                                                                                                        0.108668
Lung_Ipsi_Already_Subtracted + Lung_Contra_Already_Subtracted (Lung_Ipsi_Already_Subtracted + Lung_Contra_Already_Subtracted)_V5Gy     0.980595
Lung_Total - Lung_Subtraction_Structure (Lung_Total - Lung_Subtraction_Structure)_V5Gy                                                 0.468305
BrachialPlexus_DMax                                                                                                                    0

In [50]:
# Remove the categorical columns that are more than 80% missing.
c_lung = dropping_cols(c_lung, 80)
# Remaining fraction of missing values per column. The mean of the boolean
# mask replaces the hard-coded row count, so it stays correct if the data changes.
c_lung.isnull().mean()

Lung_Contra - Lung_Subtraction_Structure (Lung_Contra - Lung_Subtraction_Structure)_V20Gy    0.187581
Lung_Contra - Lung_Subtraction_Structure (Lung_Contra - Lung_Subtraction_Structure)_DMean    0.187581
SpinalCord_DMax                                                                              0.007762
Esophagus_DMean                                                                              0.108668
Lung_Total - Lung_Subtraction_Structure (Lung_Total - Lung_Subtraction_Structure)_V5Gy       0.468305
Esophagus_DMax                                                                               0.108668
Esophagus_D0.035cc                                                                           0.108668
Heart_V45Gy                                                                                  0.025873
Esophagus_V60Gy                                                                              0.108668
SpinalCord_D0.035cc                                                               