In [116]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, mutual_info_classif, SelectKBest
from sklearn.linear_model import Lasso
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.base import BaseEstimator



In [117]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [118]:
%run pre_processing2.ipynb

Missing values in Diagnosis at the Patient Level
0=Unknown
1=benign or non-malignant disease
2= malignant, primary lung cancer
3 = malignant metastatic
: 847
Missing values in Diagnosis Method
0 = unknown
1 = review of radiological images to show 2 years of stable nodule
2 = biopsy
3 = surgical resection
4 = progression or response: 847
Missing values in Primary tumor site for metastatic disease: 847
Missing values in Diagnosis Nodule 1: 865
Missing values in Diagnosis Method Nodule 1: 865
Missing values in Diagnosis Nodule 2: 954
Missing values in Diagnosis Method Nodule 2: 954
Missing values in Diagnosis Nodule 3: 966
Missing values in Diagnosis Method Nodule 3: 966
Missing values in Diagnosis Nodule 4: 966
Missing values in Diagnosis Method Nodule 4: 966
Missing values in Diagnosis Nodule 5: 966
Missing values in Diagnosis Method Nodule 5: 966
Missing values in Diagnosis at the Patient Level
0=Unknown
1=benign or non-malignant disease
2= malignant, primary lung cancer
3 = malignant 

# Cross-Validation

#### Using a set of manually chosen features

###### Feature Set 1

In [119]:
group_I= ['Calcification', 'Diagnosis Method Nodule 1',
'Diagnosis Method Nodule 2',
'Diagnosis Method Nodule 3',
'Diagnosis Method Nodule 4',
'Diagnosis Method Nodule 5',
'Diagnosis Nodule 1',
'Diagnosis Nodule 2',
'Diagnosis Nodule 3',
'Diagnosis Nodule 4',
'Diagnosis Nodule 5',
'Diagnosis at the Patient Level\n0=Unknown\n1=benign or non-malignant disease\n2= malignant, primary lung cancer\n3 = malignant metastatic\n',
'Internalstructure', 
'Lobulation',
'Malignancy',
'Margin',
'Nodule_id',
'Patient_id',
'Primary tumor site for metastatic disease',
'Sphericity',
'Spiculation',
'Subtlety',
'Texture',
'original_firstorder_10Percentile',
'original_firstorder_90Percentile',
'original_firstorder_Energy',
'original_firstorder_Entropy',
'original_firstorder_InterquartileRange',
'original_firstorder_Kurtosis',
'original_firstorder_Maximum',
'original_firstorder_Mean',
'original_firstorder_MeanAbsoluteDeviation',
'original_firstorder_Median',
'original_firstorder_Minimum',
'original_firstorder_Range',
'original_firstorder_RobustMeanAbsoluteDeviation',
'original_firstorder_RootMeanSquared',
'original_firstorder_Skewness',
'original_firstorder_TotalEnergy',
'original_firstorder_Uniformity',
'original_firstorder_Variance',
'original_glcm_Autocorrelation',
'original_glcm_ClusterProminence',
'original_glcm_ClusterShade',
'original_glcm_ClusterTendency',
'original_glcm_Contrast',
'original_glcm_Correlation',
'original_glcm_InverseVariance',
'original_glcm_JointAverage',
'original_glcm_JointEntropy',
'original_glcm_SumEntropy',
'original_glcm_SumSquares',
'original_gldm_DependenceEntropy',
'original_gldm_DependenceNonUniformity',
'original_gldm_DependenceNonUniformityNormalized',
'original_gldm_DependenceVariance',
'original_gldm_GrayLevelNonUniformity',
'original_gldm_GrayLevelVariance',
'original_gldm_HighGrayLevelEmphasis',
'original_gldm_LargeDependenceEmphasis',
'original_gldm_LargeDependenceHighGrayLevelEmphasis',
'original_gldm_LargeDependenceLowGrayLevelEmphasis',
'original_gldm_LowGrayLevelEmphasis',
'original_gldm_SmallDependenceEmphasis',
'original_gldm_SmallDependenceHighGrayLevelEmphasis',
'original_gldm_SmallDependenceLowGrayLevelEmphasis',
'original_shape_Elongation',
'original_shape_Flatness',
'original_shape_LeastAxisLength',
'original_shape_MajorAxisLength',
'original_shape_Maximum2DDiameterColumn',
'original_shape_Maximum2DDiameterRow',
'original_shape_Maximum2DDiameterSlice',
'original_shape_MinorAxisLength',
'original_shape_Sphericity',
'original_shape_SurfaceArea',
'original_shape_SurfaceVolumeRatio',
'original_shape_VoxelVolume']

###### Feature Set 2

In [120]:
group_II = ['Calcification', 
'Diagnosis Method Nodule 1',
'Diagnosis Method Nodule 2',
'Diagnosis Method Nodule 3',
'Diagnosis Method Nodule 4',
'Diagnosis Method Nodule 5',
'Diagnosis Nodule 1',
'Diagnosis Nodule 2',
'Diagnosis Nodule 3',
'Diagnosis Nodule 4',
'Diagnosis Nodule 5',
'Diagnosis at the Patient Level\n0=Unknown\n1=benign or non-malignant disease\n2= malignant, primary lung cancer\n3 = malignant metastatic\n',
'Internalstructure', 
'Lobulation',
'Malignancy',
'Margin',
'Nodule_id',
'Patient_id',
'Primary tumor site for metastatic disease',
'Sphericity',
'Spiculation',
'Subtlety',
'Texture',
'original_glcm_Autocorrelation',
'original_glcm_ClusterProminence',
'original_glcm_ClusterShade',
'original_glcm_ClusterTendency',
'original_glcm_Contrast',
'original_glcm_Correlation',
'original_glcm_InverseVariance',
'original_glcm_JointAverage',
'original_glcm_JointEntropy',
'original_glcm_SumEntropy',
'original_glcm_SumSquares',
'original_gldm_DependenceEntropy',
'original_gldm_DependenceNonUniformity',
'original_gldm_DependenceNonUniformityNormalized',
'original_gldm_DependenceVariance',
'original_gldm_GrayLevelNonUniformity',
'original_gldm_GrayLevelVariance',
'original_gldm_HighGrayLevelEmphasis',
'original_gldm_LargeDependenceEmphasis',
'original_gldm_LargeDependenceHighGrayLevelEmphasis',
'original_gldm_LargeDependenceLowGrayLevelEmphasis',
'original_gldm_LowGrayLevelEmphasis',
'original_gldm_SmallDependenceEmphasis',
'original_gldm_SmallDependenceHighGrayLevelEmphasis',
'original_gldm_SmallDependenceLowGrayLevelEmphasis',
'original_shape_Elongation',
'original_shape_Flatness',
'original_shape_LeastAxisLength',
'original_shape_MajorAxisLength',
'original_shape_Maximum2DDiameterColumn',
'original_shape_Maximum2DDiameterRow',
'original_shape_Maximum2DDiameterSlice',
'original_shape_MinorAxisLength',
'original_shape_Sphericity',
'original_shape_SurfaceArea',
'original_shape_SurfaceVolumeRatio',
'original_shape_VoxelVolume']

###### Feature Set 3

In [121]:
group_III = ['Calcification', 
'Diagnosis Method Nodule 1',
'Diagnosis Method Nodule 2',
'Diagnosis Method Nodule 3',
'Diagnosis Method Nodule 4',
'Diagnosis Method Nodule 5',
'Diagnosis Nodule 1',
'Diagnosis Nodule 2',
'Diagnosis Nodule 3',
'Diagnosis Nodule 4',
'Diagnosis Nodule 5',
'Diagnosis at the Patient Level\n0=Unknown\n1=benign or non-malignant disease\n2= malignant, primary lung cancer\n3 = malignant metastatic\n',
'Internalstructure', 
'Lobulation',
'Malignancy',
'Margin',
'Nodule_id',
'Patient_id',
'Primary tumor site for metastatic disease',
'Sphericity',
'Spiculation',
'Subtlety',
'Texture',
'original_firstorder_10Percentile',
'original_firstorder_90Percentile',
'original_firstorder_Energy',
'original_firstorder_Entropy',
'original_firstorder_InterquartileRange',
'original_firstorder_Kurtosis',
'original_firstorder_Maximum',
'original_firstorder_Mean',
'original_firstorder_MeanAbsoluteDeviation',
'original_firstorder_Median',
'original_firstorder_Minimum',
'original_firstorder_Range',
'original_firstorder_RobustMeanAbsoluteDeviation',
'original_firstorder_RootMeanSquared',
'original_firstorder_Skewness',
'original_firstorder_TotalEnergy',
'original_firstorder_Uniformity',
'original_firstorder_Variance',
'original_glcm_Autocorrelation',
'original_glcm_ClusterProminence',
'original_glcm_ClusterShade',
'original_glcm_ClusterTendency',
'original_glcm_Contrast',
'original_glcm_Correlation',
'original_gldm_DependenceEntropy',
'original_gldm_DependenceNonUniformity',
'original_gldm_DependenceNonUniformityNormalized',
'original_gldm_DependenceVariance',
'original_glrlm_GrayLevelNonUniformity',
'original_glrlm_GrayLevelNonUniformityNormalized',
'original_glrlm_GrayLevelVariance',
'original_glrlm_HighGrayLevelRunEmphasis',
'original_glrlm_LongRunEmphasis',
'original_glrlm_LongRunHighGrayLevelEmphasis',
'original_glrlm_LongRunLowGrayLevelEmphasis',
'original_glrlm_LowGrayLevelRunEmphasis',
'original_glrlm_RunEntropy',
'original_glrlm_RunLengthNonUniformity',
'original_glrlm_RunLengthNonUniformityNormalized',
'original_glrlm_RunPercentage',
'original_glrlm_RunVariance',
'original_glrlm_ShortRunEmphasis',
'original_glrlm_ShortRunHighGrayLevelEmphasis',
'original_glrlm_ShortRunLowGrayLevelEmphasis',
'original_shape_Elongation',
'original_shape_Flatness',
'original_shape_LeastAxisLength',
'original_shape_MajorAxisLength',
'original_shape_Maximum2DDiameterColumn',
'original_shape_Maximum2DDiameterRow',
'original_shape_Maximum2DDiameterSlice',
'original_shape_MinorAxisLength',
'original_shape_Sphericity',
'original_shape_SurfaceArea',
'original_shape_SurfaceVolumeRatio',
'original_shape_VoxelVolume']


###### Feature Set 4

In [122]:
group_IV = ['Calcification', 
'Diagnosis Method Nodule 1',
'Diagnosis Method Nodule 2',
'Diagnosis Method Nodule 3',
'Diagnosis Method Nodule 4',
'Diagnosis Method Nodule 5',
'Diagnosis Nodule 1',
'Diagnosis Nodule 2',
'Diagnosis Nodule 3',
'Diagnosis Nodule 4',
'Diagnosis Nodule 5',
'Diagnosis at the Patient Level\n0=Unknown\n1=benign or non-malignant disease\n2= malignant, primary lung cancer\n3 = malignant metastatic\n',
'Internalstructure', 
'Lobulation',
'Malignancy',
'Margin',
'Nodule_id',
'Patient_id',
'Primary tumor site for metastatic disease',
'Sphericity',
'Spiculation',
'Subtlety',
'Texture',
'original_firstorder_10Percentile',
'original_firstorder_90Percentile',
'original_firstorder_Energy',
'original_firstorder_Entropy',
'original_firstorder_InterquartileRange',
'original_firstorder_Kurtosis',
'original_firstorder_Maximum',
'original_firstorder_Mean',
'original_firstorder_MeanAbsoluteDeviation',
'original_firstorder_Median',
'original_firstorder_Minimum',
'original_firstorder_Range',
'original_firstorder_RobustMeanAbsoluteDeviation',
'original_firstorder_RootMeanSquared',
'original_firstorder_Skewness',
'original_firstorder_TotalEnergy',
'original_firstorder_Uniformity',
'original_firstorder_Variance',
'original_glcm_Autocorrelation',
'original_glcm_ClusterProminence',
'original_glcm_ClusterShade',
'original_glcm_ClusterTendency',
'original_glcm_Contrast',
'original_glcm_Correlation',
'original_gldm_DependenceEntropy',
'original_gldm_DependenceNonUniformity',
'original_gldm_DependenceNonUniformityNormalized',
'original_gldm_DependenceVariance',
'original_glrlm_GrayLevelNonUniformity',
'original_glrlm_GrayLevelNonUniformityNormalized',
'original_glrlm_GrayLevelVariance',
'original_glrlm_HighGrayLevelRunEmphasis',
'original_glrlm_LongRunEmphasis',
'original_glrlm_LongRunHighGrayLevelEmphasis',
'original_glrlm_LongRunLowGrayLevelEmphasis',
'original_glrlm_LowGrayLevelRunEmphasis',
'original_glrlm_RunEntropy',
'original_glrlm_RunLengthNonUniformity',
'original_glrlm_RunLengthNonUniformityNormalized',
'original_glrlm_RunPercentage',
'original_glrlm_RunVariance',
'original_glrlm_ShortRunEmphasis',
'original_glrlm_ShortRunHighGrayLevelEmphasis',
'original_glrlm_ShortRunLowGrayLevelEmphasis',
'original_shape_Elongation',
'original_shape_Flatness',
'original_shape_LeastAxisLength',
'original_shape_MajorAxisLength',
'original_shape_Maximum2DDiameterColumn',
'original_shape_Maximum2DDiameterRow',
'original_shape_Maximum2DDiameterSlice',
'original_shape_MinorAxisLength',
'original_shape_Sphericity',
'original_shape_SurfaceArea',
'original_shape_SurfaceVolumeRatio',
'original_shape_VoxelVolume']


##### Applying Bagging and Random Subspaces

In [123]:
def cross_validation(df, group, target_column):
    X = df[group]
    Y = df[target_column]
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    
    model = RandomForestClassifier(random_state=42)
    cv_scores_group = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    model.fit(X_train, y_train)
    y_pred_group = model.predict(X_test)
    accuracy_group = accuracy_score(y_test, y_pred_group)
    
    return cv_scores_group.mean(), accuracy_group

result = cross_validation(low_2d_sd, group_I, 'Malignancy')
print("Group I, Low 2D SD")
print("Cross-Validation Score:", result[0])
print("Test Accuracy:", result[1])
print()
result1 = cross_validation(low_2d_sd, group_II, 'Malignancy')
print("Group II, Low 2D SD")
print("Cross-Validation Score:", result1[0])
print("Test Accuracy:", result1[1])
print()
result2 = cross_validation(low_2d_sd, group_III, 'Malignancy')
print("Group III, Low 2D SD")
print("Cross-Validation Score:", result2[0])
print("Test Accuracy:", result2[1])
print()
result3 = cross_validation(low_2d_sd, group_IV, 'Malignancy')
print("Group IV, Low 2D SD")
print("Cross-Validation Score:", result3[0])
print("Test Accuracy:", result3[1])
print()
result4 = cross_validation(medium_2d_sd, group_I, 'Malignancy')
print("Group I, Medium 2D SD")
print("Cross-Validation Score:", result4[0])
print("Test Accuracy:", result4[1])
print()
result5 = cross_validation(medium_2d_sd, group_II, 'Malignancy')
print("Group II, Medium 2D SD")
print("Cross-Validation Score:", result5[0])
print("Test Accuracy:", result5[1])
print()
result6 = cross_validation(medium_2d_sd, group_III, 'Malignancy')
print("Group III, Medium 2D SD")
print("Cross-Validation Score:", result6[0])
print("Test Accuracy:", result6[1])
print()
result7 = cross_validation(medium_2d_sd, group_IV, 'Malignancy')
print("Group IV, Medium 2D SD")
print("Cross-Validation Score:", result7[0])
print("Test Accuracy:", result7[1])
print()
result8 = cross_validation(high_2d_sd, group_I, 'Malignancy')
print("Group I, High 2D SD")
print("Cross-Validation Score:", result8[0])
print("Test Accuracy:", result8[1])
print()
result9 = cross_validation(high_2d_sd, group_II, 'Malignancy')
print("Group II, High 2D SD")
print("Cross-Validation Score:", result9[0])
print("Test Accuracy:", result9[1])
print()
result10 = cross_validation(high_2d_sd, group_III, 'Malignancy')
print("Group III, High 2D SD")
print("Cross-Validation Score:", result10[0])
print("Test Accuracy:", result10[1])
print()
result11 = cross_validation(high_2d_sd, group_IV, 'Malignancy')
print("Group IV, High 2D SD")
print("Cross-Validation Score:", result11[0])
print("Test Accuracy:", result11[1])




Group I, Low 2D SD
Cross-Validation Score: 0.7684210526315789
Test Accuracy: 0.8333333333333334





Group II, Low 2D SD
Cross-Validation Score: 0.7894736842105263
Test Accuracy: 0.875





Group III, Low 2D SD
Cross-Validation Score: 0.8
Test Accuracy: 0.875





Group IV, Low 2D SD
Cross-Validation Score: 0.8
Test Accuracy: 0.875





Group I, Medium 2D SD
Cross-Validation Score: 0.7684210526315789
Test Accuracy: 0.8333333333333334





Group II, Medium 2D SD
Cross-Validation Score: 0.7894736842105263
Test Accuracy: 0.875





Group III, Medium 2D SD
Cross-Validation Score: 0.8
Test Accuracy: 0.875





Group IV, Medium 2D SD
Cross-Validation Score: 0.8
Test Accuracy: 0.875





Group I, High 2D SD
Cross-Validation Score: 0.7684210526315789
Test Accuracy: 0.8333333333333334





Group II, High 2D SD
Cross-Validation Score: 0.7894736842105263
Test Accuracy: 0.875





Group III, High 2D SD
Cross-Validation Score: 0.8
Test Accuracy: 0.875





Group IV, High 2D SD
Cross-Validation Score: 0.8
Test Accuracy: 0.875


### Cross-Validation Plot

In [124]:
# import matplotlib.pyplot as plt
# from sklearn.impute import SimpleImputer

# # Filter numeric columns from the DataFrame
# numeric_columns = low_2d_sd.select_dtypes(include=['number'])

# # Create empty lists to store feature names and their corresponding cross-validation scores
# feature_names = []
# avg_scores = []

# model = RandomForestClassifier()

# # Create a SimpleImputer to handle missing values (replace with mean)
# imputer = SimpleImputer(strategy='mean')

# for feature in numeric_columns.columns:
#     X_subset = numeric_columns[[feature]]
#     y = low_2d_sd['Malignancy']

#     # Apply the imputer to handle missing values
#     X_subset = imputer.fit_transform(X_subset)

#     scores = cross_val_score(model, X_subset, y, cv=5, scoring='accuracy')
#     feature_names.append(feature)
#     avg_scores.append(scores.mean())
# #
# # Create a bar plot
# plt.figure(figsize=(12, 6))
# plt.bar(feature_names, avg_scores)
# plt.xticks(rotation=90)
# plt.xlabel('Feature Subset')
# plt.ylabel('Average Cross-Validation Score')
# plt.title('Cross-Validation Scores for Different Feature Subsets')
# plt.show()


# Recursive Feature Elimination (RFE)

In [125]:

# Assuming 'Malignancy' is your target column
target_column = low_2d_sd['Malignancy']

# Check if it's a Series
if isinstance(target_column, pd.Series) and target_column.ndim == 1:
    print("'Malignancy' is a 1-dimensional Series.")
else:
    print("'Malignancy' is not a 1-dimensional Series.")


'Malignancy' is a 1-dimensional Series.


In [126]:

def rfe(dataframe, target_column, num_features_to_select=5):
    # Split the DataFrame into X (features) and y (target)
    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]
    
    # Create the RFE model with a RandomForestClassifier (you can use any other estimator)
    model = RandomForestClassifier()
    
    # Create an RFE model with cross-validation
    rfe = RFECV(model, step=1, cv=5, scoring='accuracy', min_features_to_select=num_features_to_select)
    
    # Fit the RFE model to the data
    rfe.fit(X, y)
    
    # Get the selected features
    selected_features = X.columns[rfe.support_]
    
    return selected_features


In [127]:
selected_features = rfe(low_2d_sd, 'Malignancy')
print("Selected Features:", selected_features)



Selected Features: Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Internalstructure',
       'Calcification', 'Sphericity', 'Margin', 'Lobulation', 'Spiculation',
       'Texture', 'original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_Sphericity',
       'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy', 'original_glcm_Autocorrelation',
       'original_glcm_DifferenceVariance',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glr

In [128]:
selected_features1 = rfe(medium_2d_sd, 'Malignancy')
print("Selected Features:", selected_features)



Selected Features: Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Internalstructure',
       'Calcification', 'Sphericity', 'Margin', 'Lobulation', 'Spiculation',
       'Texture', 'original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_Sphericity',
       'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy', 'original_glcm_Autocorrelation',
       'original_glcm_DifferenceVariance',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glr

In [129]:
selected_features2 = rfe(high_2d_sd, 'Malignancy')
print("Selected Features:", selected_features)



Selected Features: Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Internalstructure',
       'Calcification', 'Sphericity', 'Margin', 'Lobulation', 'Spiculation',
       'Texture', 'original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_Sphericity',
       'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy', 'original_glcm_Autocorrelation',
       'original_glcm_DifferenceVariance',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glr

# Feature Importance from Trees

In [130]:

# def feature_select_trees(threshold, dataframe, target_column):
#     X = dataframe.drop(columns=[target_column])
#     y = dataframe[target_column]
#     model = RandomForestClassifier()
#     model.fit(X, y)
#     feature_importances = model.feature_importances_
#     selected_features = X.columns[feature_importances > threshold]

#     return selected_features

def feature_select_trees(threshold, dataframe, target_column):
    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    feature_importances = model.feature_importances_
    selected_features = X.columns[feature_importances > threshold]
    return selected_features  # Return the selected features



In [131]:
# Print selected features for different thresholds
thresholds = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
for threshold in thresholds:
    selected1 = feature_select_trees(threshold, low_2d_sd, 'Malignancy')
    selected2 = feature_select_trees(threshold, medium_2d_sd, 'Malignancy')
    selected3 = feature_select_trees(threshold, high_2d_sd, 'Malignancy')
    print(f"Threshold = {threshold}")
    print("Low 2D SD:")
    print(selected1)
    print("Medium 2D SD:")
    print(selected2)
    print("High 2D SD:")
    print(selected3)
    print()


Threshold = 0.01
Low 2D SD:
Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Calcification', 'Sphericity',
       'Margin', 'Lobulation', 'Spiculation', 'original_shape_Elongation',
       'original_shape_Flatness', 'original_shape_LeastAxisLength',
       'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_Sphericity',
       'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glrlm_RunEntropy', 'original_glrlm_RunLengthNonUniformity',
       'original_glrlm_RunLengthNonUniform

# Variance Threshold

In [132]:
def variance_threshold(threshold, dataframe, target_column):
    X = dataframe.drop(columns=[target_column])
    selector = VarianceThreshold(threshold=threshold)
    X_high_variance = selector.fit_transform(X)
    return X_high_variance


In [133]:
# Print selected features for different thresholds
thresholds = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]
for threshold in thresholds:
    selected1 = variance_threshold(threshold, low_2d_sd, 'Malignancy')
    selected2 = variance_threshold(threshold, medium_2d_sd, 'Malignancy')
    selected3 = variance_threshold(threshold, high_2d_sd, 'Malignancy')
    print(f"Threshold = {threshold}")
    print("Low 2D SD:")
    print(selected1)
    print("Medium 2D SD:")
    print(selected2)
    print("High 2D SD:")
    print(selected3)
    print()


Threshold = 0.01
Low 2D SD:
[[2.210e+02 6.800e+01 4.000e+00 ... 4.000e+00 0.000e+00 0.000e+00]
 [2.710e+02 9.100e+01 3.000e+00 ... 4.000e+00 0.000e+00 0.000e+00]
 [3.290e+02 1.180e+02 5.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [2.626e+03 1.004e+03 4.000e+00 ... 3.000e+00 0.000e+00 0.000e+00]
 [2.646e+03 1.010e+03 3.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.650e+03 1.011e+03 1.000e+00 ... 2.000e+00 3.000e+00 4.000e+00]]
Medium 2D SD:
[[2.210e+02 6.800e+01 4.000e+00 ... 4.000e+00 0.000e+00 0.000e+00]
 [2.710e+02 9.100e+01 3.000e+00 ... 4.000e+00 0.000e+00 0.000e+00]
 [3.290e+02 1.180e+02 5.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [2.626e+03 1.004e+03 4.000e+00 ... 3.000e+00 0.000e+00 0.000e+00]
 [2.646e+03 1.010e+03 3.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.650e+03 1.011e+03 1.000e+00 ... 2.000e+00 3.000e+00 4.000e+00]]
High 2D SD:
[[2.210e+02 6.800e+01 4.000e+00 ... 4.000e+00 0.000e+00 0.000e+00]
 [2.710e+02 9.100e+01 3.000e+00 ... 4.000e+00 0.000e+00 0.000e+

# SelectKBest and SelectPercentile

In [134]:
def KBest_Percentile(dataframe, target_column):
    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]
    model = Lasso(alpha=0.01)
    model.fit(X, y)
    selected_features = X.columns[model.coef_ != 0]
    return selected_features



In [135]:
KBest_Percentile(low_2d_sd, 'Malignancy')

  model = cd_fast.enet_coordinate_descent(


Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Internalstructure',
       'Calcification', 'Sphericity', 'Margin', 'Lobulation',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_SurfaceArea',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glrlm_RunLengthNonUniformity',
       'original_glszm_LargeAreaEmphasis',
       'original_glszm_LargeAreaHighGrayLevelEmphasis',
       'original_glszm_LargeAreaLowGrayLevelEmphasis',
       'original_glszm_ZoneVariance', 'original_gldm_DependenceNonUniformity',
       'original_gldm_DependenceVariance',
  

In [136]:
KBest_Percentile(medium_2d_sd, 'Malignancy')

  model = cd_fast.enet_coordinate_descent(


Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Internalstructure',
       'Calcification', 'Sphericity', 'Margin', 'Lobulation',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_SurfaceArea',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glrlm_RunLengthNonUniformity',
       'original_glszm_LargeAreaEmphasis',
       'original_glszm_LargeAreaHighGrayLevelEmphasis',
       'original_glszm_LargeAreaLowGrayLevelEmphasis',
       'original_glszm_ZoneVariance', 'original_gldm_DependenceNonUniformity',
       'original_gldm_DependenceVariance',
  

In [137]:
KBest_Percentile(high_2d_sd, 'Malignancy')

  model = cd_fast.enet_coordinate_descent(


Index(['Nodule_id', 'Patient_id', 'Subtlety', 'Internalstructure',
       'Calcification', 'Sphericity', 'Margin', 'Lobulation',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_MinorAxisLength', 'original_shape_SurfaceArea',
       'original_shape_VoxelVolume', 'original_firstorder_Energy',
       'original_firstorder_TotalEnergy',
       'original_glrlm_GrayLevelNonUniformity',
       'original_glrlm_LongRunEmphasis',
       'original_glrlm_LongRunHighGrayLevelEmphasis',
       'original_glrlm_LongRunLowGrayLevelEmphasis',
       'original_glrlm_RunLengthNonUniformity',
       'original_glszm_LargeAreaEmphasis',
       'original_glszm_LargeAreaHighGrayLevelEmphasis',
       'original_glszm_LargeAreaLowGrayLevelEmphasis',
       'original_glszm_ZoneVariance', 'original_gldm_DependenceNonUniformity',
       'original_gldm_DependenceVariance',
  

# SFS/SBS

In [138]:


# def sequential_feature_selection(dataframe, target_column, k=5, forward=True, scoring='accuracy'):
#     X = dataframe.drop(columns=[target_column])
#     y = dataframe[target_column]
#     # Create the classifier
#     classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1)

#     # Initialize the SequentialFeatureSelector
#     sfs = SequentialFeatureSelector(classifier,
#                                     k_features=k,
#                                     forward=forward,
#                                     floating=False,
#                                     verbose=2,
#                                     scoring=scoring,
#                                     cv=5)

#     sfs = sfs.fit(X, y)

#     # Get the selected feature indices
#     selected_feature_indices = sfs.k_feature_idx_
#     # Get the selected feature names
#     selected_features = X.columns[selected_feature_indices]

#     return selected_features


In [139]:
# sequential_feature_selection(dataframe=low_2d_sd, target_column='Malignancy', k=5, forward=True, scoring='accuracy')