In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder


In [20]:
#change dir here(use ml_labelled files)
file = r'C:\Users\atrip\Research\Model-Interpretation-main\Data\ml_labelled_HN_P2C_20241212.csv'

df = pd.read_csv(file)
print(df.columns)
# selected_columns = ['Tob_label', 'Alc_label', 'HPVstatus', 'ML_label']
# data = df[selected_columns]

  df = pd.read_csv(file)


Index(['SubjectID', 'IRB', 'Instrument', 'PrimaryAnatomy',
       'Patient_LevelInclude_Exclude', 'Setup', 'Age', 'Sex', 'Ethnicity',
       'Race',
       ...
       'Phasor_SH3_ch2_1', 'Phasor_GH1_ch3_1', 'Phasor_SH1_ch3_1',
       'Phasor_GH2_ch3_1', 'Phasor_SH2_ch3_1', 'Phasor_GH3_ch3_1',
       'Phasor_SH3_ch3_1', 'Tob_label', 'Alc_label', 'ML_label'],
      dtype='object', length=125)


In [18]:
print(list(df.columns))

['SubjectID', 'IRB', 'Instrument', 'PrimaryAnatomy', 'Patient_LevelInclude_Exclude', 'Setup', 'Age', 'Sex', 'Ethnicity', 'Race', 'HighestDiagnosisLevel', 'HPVStatus', 'TobaccoUse', 'SmokelessTobaccoUse', 'AlcoholUse', 'OccultPrimaryTumor', 'TumorFocality', 'GrossTumorSize_GreatestDimensionInCentimeters_', 'TumorDepthOfInvasion_DOI__mm_', 'Margins_DistanceFromClosestMargin_mm_', 'StandardUptakeValue_SUV_FromPET_CT', 'NumberOfLymphNodesExamined', 'NumberOfInvolvedLymphNodes', 'SizeOfLargestMetastaticDeposit_cm_', 'LateralityOfLymphNodesInvolved', 'PrimaryTumor_pT_', 'RegionalLymphNodes_pN_', 'TNMDescriptors', 'Chemo_RadiationTherapy', 'AfterSurgery', 'Recurrence', 'Local_days_', 'Regional_days_', 'Distant_days_', 'Death_days_', 'Surgeon', 'Comments_', 'x1', 'Run', 'ScanContext', 'DataChannelsUsed', 'indices_1', 'lifet_avg_ch1_1', 'lifet_avg_ch2_1', 'lifet_avg_ch3_1', 'spec_int_ch1_1', 'spec_int_ch2_1', 'spec_int_ch3_1', 'int_ratio_ch1_1', 'int_ratio_ch2_1', 'int_ratio_ch3_1', 'Laguerre_c

In [21]:
#checks for unique values in a column, used for additional data filtration
print(list(df['ML_label'].unique()))

['Healthy', 'Cancer', 'Ignore']


In [22]:
#additional filtering, removes tags and other data that is not important. 

'''notes:

1) remove run, ReferenceWLI and AnnotationFile?
2) Removed 'Comments_', has no real impact on data prediction
3) Removed Surgeon, does not have impact on study 

'''
df = df.drop(['Instrument', 'SubjectID','error_line','error','IRB','x1','Comments_', 'Surgeon', 'Patient_LevelInclude_Exclude','Run','DataChannelsUsed', 'StandardUptakeValue_SUV_FromPET_CT'], axis=1)


#remove all datapoints with the labelling of 'ignore'

df = df[df['ML_label'] != 'Ignore']

In [23]:

print(df['ML_label'].unique())

['Healthy' 'Cancer']


In [24]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_columns)
df_encoded = df.drop(categorical_columns, axis=1)

['PrimaryAnatomy', 'Setup', 'Sex', 'Ethnicity', 'Race', 'HighestDiagnosisLevel', 'HPVStatus', 'TobaccoUse', 'SmokelessTobaccoUse', 'AlcoholUse', 'OccultPrimaryTumor', 'TumorFocality', 'GrossTumorSize_GreatestDimensionInCentimeters_', 'Margins_DistanceFromClosestMargin_mm_', 'LateralityOfLymphNodesInvolved', 'PrimaryTumor_pT_', 'RegionalLymphNodes_pN_', 'TNMDescriptors', 'Chemo_RadiationTherapy', 'AfterSurgery', 'Recurrence', 'Local_days_', 'Regional_days_', 'Distant_days_', 'Death_days_', 'ScanContext', 'AnnotationFile', 'RefrenceWLI', 'Tob_label', 'Alc_label', 'ML_label']


In [25]:
# # #ENCODING
# # '''Temporarily using One-Hot, will be computationally expensive, but initial testing'''

categorical_columns = ['Sex','Race','HighestDiagnosisLevel','HPVStatus','TobaccoUse', 'AlcoholUse','Tob_label','Alc_label','ML_label']

categorical_columns2 = df.select_dtypes(include=['object']).columns.tolist()
# print(categorical_columns)


encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([df, one_hot_df], axis=1)

# Drop the original categorical columns
df_encoded = df_encoded.drop(categorical_columns2, axis=1)

df_encoded = one_hot_df


In [None]:
#CORRELATION HEATMAP
corr_matrix = df_encoded.corr()
#additional filtering to make the data more readable.
 
high_corr = corr_matrix[corr_matrix > 0.5]
high_corr = corr_matrix[corr_matrix < -0.5]

heatmap = sns.heatmap(high_corr, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)
plt.figure(figsize=(20, 20))
plt.show()

In [29]:
#Numerical codes:
threshold = 0.5
significant_corrs = corr_matrix[(corr_matrix.abs() >= threshold) & (corr_matrix.abs() < 1.0)]

correlation_pairs = significant_corrs.unstack().dropna().reset_index()
correlation_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# Filter out lower/upper triangular duplicates
correlation_pairs = correlation_pairs.loc[correlation_pairs['Feature_1'] < correlation_pairs['Feature_2']]


correlation_pairs = correlation_pairs.sort_values(by='Correlation', key=abs, ascending=False)

# Display the top correlations
print(correlation_pairs.head(100))  # Adjust the number to display as needed

#laguerre coeffs

                              Feature_1  \
330                          Sex_Female   
167               Laguerre_coeffs_ch2_7   
258               Laguerre_coeffs_ch3_7   
239               Laguerre_coeffs_ch3_5   
146               Laguerre_coeffs_ch2_5   
..                                  ...   
284              Laguerre_coeffs_ch3_10   
214              Laguerre_coeffs_ch2_12   
103              Laguerre_coeffs_ch1_11   
358                    Tob_label_Former   
338  HighestDiagnosisLevel_LG Dysplasia   

                                  Feature_2  Correlation  
330                                Sex_Male    -1.000000  
167                   Laguerre_coeffs_ch2_9     0.985841  
258                   Laguerre_coeffs_ch3_9     0.981855  
239                   Laguerre_coeffs_ch3_7     0.981495  
146                   Laguerre_coeffs_ch2_7     0.979072  
..                                      ...          ...  
284                   Laguerre_coeffs_ch3_9     0.695583  
214        