In [46]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [47]:
# Executes the helper notebook inside this kernel's namespace.
# The helpers called later in this notebook but not defined in it
# (find_missing_values, replace_nan_with_zero, remove_prefix, get_string_columns)
# are presumably defined there — TODO confirm.
%run verifica.ipynb

# Loading Features

### Datasets with low, medium and high standard deviation

In [48]:
# Load the feature tables for the low / medium / high standard-deviation variants.
low_2d_sd= pd.read_csv("data_0.5.csv")
medium_2d_sd= pd.read_csv("data_1.0.csv")
# NOTE(review): this reads the same file as medium_2d_sd ("data_1.0.csv"), so the
# "high" frame starts out identical to the "medium" one. This looks like a
# copy-paste slip — confirm the intended file name for the high dataset.
high_2d_sd= pd.read_csv("data_1.0.csv")

### Renaming the columns


In [49]:
# The original headers repeat the same two verbose, multi-line captions for each
# of the five nodules; only the nodule number changes, so the rename map is
# generated from two templates instead of being spelled out ten times.
_diagnosis_header = (
    'Nodule {n}\nDiagnosis at the Nodule Level \n0=Unknown\n'
    '1=benign or non-malignant disease\n2= malignant, primary lung cancer\n'
    '3 = malignant metastatic)\n'
)
_method_header = (
    'Nodule {n}\nDiagnosis Method at the Nodule Level\n0 = unknown\n'
    '1 = review of radiological images to show 2 years of stable nodule\n'
    '2 = biopsy\n3 = surgical resection\n4 = progression or response\n'
)

# Maps each verbose header to a short name: 'Diagnosis Nodule N' /
# 'Diagnosis Method Nodule N' for N = 1..5.
new_column_names = {}
for _n in range(1, 6):
    new_column_names[_diagnosis_header.format(n=_n)] = f'Diagnosis Nodule {_n}'
    new_column_names[_method_header.format(n=_n)] = f'Diagnosis Method Nodule {_n}'

# Apply the same renaming, in place, to all three datasets.
for _frame in (low_2d_sd, medium_2d_sd, high_2d_sd):
    _frame.rename(columns=new_column_names, inplace=True)

## Processing Data

### Handling Missing Data:

In [50]:
# Drop every row whose patient-level diagnosis is missing.
# Each dataset is filtered from ITS OWN rows. Previously the medium and high
# frames were re-derived from low_2d_sd, which silently replaced both of them
# with copies of the low standard-deviation dataset.
# NOTE(review): a later cell assigns one fixed-length label list to all three
# frames, so it relies on the frames having the same number of rows after this
# filter — confirm that holds for the three input files.
patient_diagnosis_column = 'Diagnosis at the Patient Level\n0=Unknown\n1=benign or non-malignant disease\n2= malignant, primary lung cancer\n3 = malignant metastatic\n'

low_2d_sd = low_2d_sd.dropna(subset=[patient_diagnosis_column])
medium_2d_sd = medium_2d_sd.dropna(subset=[patient_diagnosis_column])
high_2d_sd = high_2d_sd.dropna(subset=[patient_diagnosis_column])


In [51]:
# Lift pandas' display truncation so that all rows and all columns are shown.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Boolean frame: True wherever low_2d_sd holds a missing value.
missing_values = pd.isna(low_2d_sd)

In [52]:
# Boolean mask of the missing cells in the low standard-deviation dataset.
missing_values = low_2d_sd.isna()
# The same mask reshaped by stack() into a single long Series.
missing_indices = missing_values.stack()

# find_missing_values is not defined in this notebook (presumably provided by the
# %run of verifica.ipynb). The reporting loops further down iterate its result
# with .items() as (column, rows-with-missing-values) pairs.
missing_values_dict = find_missing_values(low_2d_sd)

### Replacing NaN values with zeros

In [56]:
# Run every dataset through replace_nan_with_zero (a helper that is not defined
# in this notebook) and rebind each name to the helper's result.
low_2d_sd, medium_2d_sd, high_2d_sd = [
    replace_nan_with_zero(frame)
    for frame in (low_2d_sd, medium_2d_sd, high_2d_sd)
]

### Checking if there are still missing values for all datasets

In [57]:
# Re-scan the dataset with the lowest standard deviation for missing values.
missing_values_dict = find_missing_values(low_2d_sd)

# One report per affected column: the column, its rows, then a blank separator line.
for column, missing_rows in missing_values_dict.items():
    print(
        f"Column with missing values: {column}\n"
        f"Rows with missing values in this column: {missing_rows}\n"
    )

No missing values found in the DataFrame.


In [58]:
# Re-scan the dataset with the medium standard deviation for missing values.
missing_values_dict1 = find_missing_values(medium_2d_sd)

# One report per affected column: the column, its rows, then a blank separator line.
for column, missing_rows in missing_values_dict1.items():
    print(
        f"Column with missing values: {column}\n"
        f"Rows with missing values in this column: {missing_rows}\n"
    )

No missing values found in the DataFrame.


In [59]:
# Re-scan the dataset with the highest standard deviation for missing values.
missing_values_dict2 = find_missing_values(high_2d_sd)

# One report per affected column: the column, its rows, then a blank separator line.
for column, missing_rows in missing_values_dict2.items():
    print(
        f"Column with missing values: {column}\n"
        f"Rows with missing values in this column: {missing_rows}\n"
    )

No missing values found in the DataFrame.


## Encoding Categorical Data

### Mapping non-numeric values to numeric values in 'Malignancy' column

In [61]:
# Malignancy rating labels in ascending order; a label's position (counted from 1)
# is the numeric code it is mapped to.
_malignancy_levels = (
    '1-Highly Unlikely',
    '2-Moderately Unlikely',
    '3-Indeterminate',
    '4-Moderately Suspicious',
    '5-Highly Suspicious',
)
malignancy_mapping = {
    label: code for code, label in enumerate(_malignancy_levels, start=1)
}

# Replace the text ratings with their numeric codes in every dataset.
# Series.map yields NaN for any value that is not a key of the mapping.
for _frame in (low_2d_sd, medium_2d_sd, high_2d_sd):
    _frame['Malignancy'] = _frame['Malignancy'].map(malignancy_mapping)

### Standardizing the labels

In [62]:
# Hard-coded primary-tumor-site labels. The list is assigned positionally to the
# datasets below, so it must contain exactly one entry per row, in row order.
labels = [
    'Head, Neck', 'Carcinoma', 'Prostate', 'Prostate', 'Prostate', 'Colon', 'Colon ', 'Colon', 'Colon', 'Colon',
    'Colon', 'Colon', 'Colon', 'Colon', 'Colon', 'Colon', 'Carcinoma', 'Lung', 'Sarcoma', 'Sarcoma',
    'Sarcoma', 'Sarcoma', 'Sarcoma', 'Sarcoma', 'Pancreatic', 'Gallbladder', 'Colorectal', 'Colorectal', 'Vaginal', 'Lung',
    'Lymphoma', 'Lymphoma', 'Lymphoma', 'Gallbladder', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung',
    'Lung', 'Lymphoma', 'Carcinoma', 'Carcinoma', 'Lung', 'Lung', 'Lung', 'Lung', 'Leukemia', 'Leukemia','Ovarian',
    'Ovarian', 'Ovarian', 'Ovarian', 'Rectal', 'Rectal', 'Rectal', 'Rectal', 'Sarcoma', 'Melanoma', 'Lung', 'Lung',
    'Breast', 'Breast', 'Rectal', 'Cervix', 'Leukemia', 'Leukemia', 'Sarcoma', 'Prostate', 'Prostate ', 'Laryngeal',
    'Multiple Primary Cancers', 'Lung', 'Lung', 'Sarcoma', 'Head, Neck', 'Esophageal', 'Leukemia', 'Melanoma', 'Lymphoma', 'Sarcoma',
    'Renal', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Breast', 'Cervical', 'Mesothelial', 'Mesothelial', 'Carcinoma', 'Lung', 'Fibrosis', 'Fibrosis', 'Fibrosis',
    'Fibrosis', 'Lung', 'Histoplasmosis', 'Histoplasmosis', 'Histoplasmosis', 'Histoplasmosis', 'Histoplasmosis', 'Histoplasmosis', 'Carcinoma', 'Metastatic',
    'Metastatic', 'Carcinoma', 'Carcinoma', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung','Lung','Lymphoma', 'Carcinoma'
]

# Strip surrounding whitespace before encoding. Without this, entries such as
# 'Colon ' and 'Prostate ' (trailing space) are treated by LabelEncoder as classes
# distinct from 'Colon' and 'Prostate', which defeats the purpose of uniform labels.
labels = [label.strip() for label in labels]

# Create a LabelEncoder
label_encoder = LabelEncoder()

# Fit on the cleaned labels and convert them to integer codes
# (LabelEncoder numbers the sorted unique classes from 0).
encoded_labels = label_encoder.fit_transform(labels)

# Substitute the labels in the datasets. The assignment is positional and raises
# ValueError if a frame's row count differs from len(labels).
low_2d_sd['Primary tumor site for metastatic disease'] = encoded_labels
medium_2d_sd['Primary tumor site for metastatic disease'] = encoded_labels
high_2d_sd['Primary tumor site for metastatic disease'] = encoded_labels

### Removing the patient ID prefix (to obtain an integer value)

In [63]:
# For each dataset: call the remove_prefix helper (not defined in this notebook;
# its return value is not used, so it presumably edits the frame in place — TODO
# confirm) on 'Patient_id' with the 'LIDC-IDRI-' prefix, then convert the column
# to a numeric dtype.
for _frame in (low_2d_sd, medium_2d_sd, high_2d_sd):
    remove_prefix(_frame, 'Patient_id', 'LIDC-IDRI-')
    _frame['Patient_id'] = pd.to_numeric(_frame['Patient_id'])

### Drop unnecessary columns

In [64]:
# Columns discarded from every dataset: one patient-id column, the
# 'diagnostics_*' columns, and two 'original_shape_*' columns.
columns_to_remove = [
    'TCIA Patient ID',
    'diagnostics_Configuration_EnabledImageTypes',
    'diagnostics_Configuration_Settings',
    'diagnostics_Image-original_Dimensionality',
    'diagnostics_Image-original_Hash',
    'diagnostics_Image-original_Maximum',
    'diagnostics_Image-original_Mean',
    'diagnostics_Image-original_Minimum',
    'diagnostics_Image-original_Size',
    'diagnostics_Image-original_Spacing',
    'diagnostics_Mask-original_BoundingBox',
    'diagnostics_Mask-original_CenterOfMass',
    'diagnostics_Mask-original_CenterOfMassIndex',
    'diagnostics_Mask-original_Hash',
    'diagnostics_Mask-original_Size',
    'diagnostics_Mask-original_Spacing',
    'diagnostics_Mask-original_VolumeNum',
    'diagnostics_Mask-original_VoxelNum',
    'diagnostics_Versions_Numpy',
    'diagnostics_Versions_PyRadiomics',
    'diagnostics_Versions_PyWavelet',
    'diagnostics_Versions_Python',
    'diagnostics_Versions_SimpleITK',
    'original_shape_Maximum3DDiameter',
    'original_shape_MeshVolume',
]

# errors='ignore' skips any listed column that a frame does not contain
# instead of raising.
low_2d_sd, medium_2d_sd, high_2d_sd = [
    frame.drop(columns=columns_to_remove, errors='ignore')
    for frame in (low_2d_sd, medium_2d_sd, high_2d_sd)
]

### Check if there are still string columns

In [65]:
# get_string_columns is not defined in this notebook (presumably provided by the
# %run of verifica.ipynb).
# NOTE(review): only low_2d_sd is checked here; the medium and high frames are
# not inspected by this cell.
string_columns = get_string_columns(low_2d_sd)  
print("Columns with string values:", string_columns)

Columns with string values: Index([], dtype='object')


### Remove unknown (all-zero) columns

In [66]:
# Check if any columns have all zeros
columns_with_zeros = (low_2d_sd == 0).all(axis=0)

# Get the column names with all zeros
columns_with_zeros = low_2d_sd.columns[columns_with_zeros]

if not columns_with_zeros.empty:
    print("Columns with all zeros (representing 'unknown'):")
    print(columns_with_zeros)
else:
    print("No columns with all zeros found.")

Columns with all zeros (representing 'unknown'):
Index(['original_firstorder_InterquartileRange',
       'original_firstorder_Kurtosis',
       'original_firstorder_MeanAbsoluteDeviation',
       'original_firstorder_Range',
       'original_firstorder_RobustMeanAbsoluteDeviation',
       'original_firstorder_Skewness', 'original_firstorder_Variance',
       'original_glcm_ClusterProminence', 'original_glcm_ClusterShade',
       'original_glcm_ClusterTendency', 'original_glcm_Contrast',
       'original_glcm_DifferenceAverage', 'original_glcm_DifferenceVariance',
       'original_glcm_Imc1', 'original_glcm_Imc2',
       'original_glcm_InverseVariance', 'original_glcm_SumSquares',
       'original_glrlm_GrayLevelVariance', 'original_glszm_GrayLevelVariance',
       'original_gldm_GrayLevelVariance', 'Diagnosis Nodule 3',
       'Diagnosis Method Nodule 3', 'Diagnosis Nodule 4',
       'Diagnosis Method Nodule 4', 'Diagnosis Nodule 5',
       'Diagnosis Method Nodule 5'],
      dtype='object')

In [67]:
# Feature columns reported as all-zero by the previous cell.
# The variable was previously named `list`, which shadowed the built-in `list`
# type for the rest of the kernel session; it now has a descriptive name.
# The all-zero 'Diagnosis ... Nodule 3/4/5' columns from that report are not in
# this list and are therefore kept.
all_zero_feature_columns = [
    'original_firstorder_InterquartileRange',
    'original_firstorder_Kurtosis',
    'original_firstorder_MeanAbsoluteDeviation',
    'original_firstorder_Range',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_firstorder_Skewness', 'original_firstorder_Variance',
    'original_glcm_ClusterProminence', 'original_glcm_ClusterShade',
    'original_glcm_ClusterTendency', 'original_glcm_Contrast',
    'original_glcm_DifferenceAverage', 'original_glcm_DifferenceVariance',
    'original_glcm_Imc1', 'original_glcm_Imc2',
    'original_glcm_InverseVariance', 'original_glcm_SumSquares',
    'original_glrlm_GrayLevelVariance', 'original_glszm_GrayLevelVariance',
    'original_gldm_GrayLevelVariance',
]

# No errors='ignore' here: a KeyError is raised if any listed column is absent.
low_2d_sd = low_2d_sd.drop(columns=all_zero_feature_columns)
medium_2d_sd = medium_2d_sd.drop(columns=all_zero_feature_columns)
high_2d_sd = high_2d_sd.drop(columns=all_zero_feature_columns)