In [1]:
import sys
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
#import statsmodels.api as sm
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, roc_auc_score, roc_curve
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

In [2]:
# Read the pre-split train and test CSVs (paths are relative to the notebook's
# working directory) into DataFrames in a single pass.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train_df.csv", "test_df.csv")
)

In [3]:
# Split the training frame into the feature matrix and the 'Cancer' target.
X_train = train_df.drop(columns='Cancer')  # Features for training
y_train = train_df['Cancer']  # Target variable for training

# Same split for the testing frame; it is not resampled below.
X_test = test_df.drop(columns='Cancer')  # Features for testing
y_test = test_df['Cancer']  # Target variable for testing

# Balance the training split with SMOTE (Synthetic Minority Over-sampling
# Technique). random_state is pinned to the same seed the RandomForest models
# use, so the synthetic rows -- and every feature ranking computed from them --
# are reproducible on Restart & Run All instead of changing on each run.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Select top 10 features

In [16]:
# Base estimator handed to RFE; seeded for repeatable results.
model = RandomForestClassifier(random_state=42)

# Recursive feature elimination down to 10 features; X_new is the training
# data restricted to the columns that survived.
rfe = RFE(estimator=model, n_features_to_select=10)
X_new = rfe.fit_transform(X_train, y_train)

# Fit the forest on the reduced matrix so it exposes one importance per
# retained column.
model.fit(X_new, y_train)
feature_importances = model.feature_importances_

# Columns kept by RFE are the ones ranked 1; recover their names.
selected_features = X_train.columns[rfe.ranking_ == 1].tolist()

# Pair each retained column name with its importance (positional pairing;
# assumes X_new keeps the original column order -- TODO confirm).
feature_importance_dict = {
    name: score for name, score in zip(selected_features, feature_importances)
}

print(selected_features)

['Age Group', 'Num of Bad Mental Health Days', 'Hours of Sleeping', 'Arthritis', 'Days Drinking', 'Pneumonia Vaccine', 'Tetanus Last 10 Years', 'Income', 'BMI', 'Ethnicity_White']


In [17]:
# Rebuild the name -> importance mapping ordered from most to least important.
sorted_feature_importance = {
    name: feature_importance_dict[name]
    for name in sorted(feature_importance_dict, key=feature_importance_dict.get, reverse=True)
}

# Report one "feature: importance" line per selected feature.
print("Feature importances of the top 10 selected features:")
for feature, importance in sorted_feature_importance.items():
    print(f"{feature}: {importance}")

Feature importances of the top 10 selected features:
Age Group: 0.32339643954707764
BMI: 0.12812359330495932
Income: 0.09230940471631065
Hours of Sleeping: 0.08201984829731047
Days Drinking: 0.07808895783089995
Pneumonia Vaccine: 0.07494320798612081
Num of Bad Mental Health Days: 0.06104203828854451
Ethnicity_White: 0.05633050772751841
Tetanus Last 10 Years: 0.05402100742186446
Arthritis: 0.049724994879393955


# Select top 20 features

In [18]:
# Base estimator handed to RFE; seeded for repeatable results.
model = RandomForestClassifier(random_state=42)

# Recursive feature elimination down to 20 features; X_new is the training
# data restricted to the columns that survived.
rfe = RFE(estimator=model, n_features_to_select=20)
X_new = rfe.fit_transform(X_train, y_train)

# Fit the forest on the reduced matrix so it exposes one importance per
# retained column.
model.fit(X_new, y_train)
feature_importances = model.feature_importances_

# Columns kept by RFE are the ones ranked 1; recover their names.
selected_features = X_train.columns[rfe.ranking_ == 1].tolist()

# Pair each retained column name with its importance (positional pairing;
# assumes X_new keeps the original column order -- TODO confirm).
feature_importance_dict = {
    name: score for name, score in zip(selected_features, feature_importances)
}

print(selected_features)

['Gender', 'Age Group', 'Num of Bad Mental Health Days', 'Years Since Last Checkup', 'Hours of Sleeping', 'Arthritis', 'Married', 'Deaf', 'Age Started Smoking', 'Cigarettes per Day', 'Days Drinking', 'Flu Vaccine', 'Pneumonia Vaccine', 'Tetanus Last 10 Years', 'Had COVID', 'Metropolitan', 'Income', 'Insulin', 'BMI', 'Ethnicity_White']


In [19]:
# Rebuild the name -> importance mapping ordered from most to least important.
sorted_feature_importance = {
    name: feature_importance_dict[name]
    for name in sorted(feature_importance_dict, key=feature_importance_dict.get, reverse=True)
}

# Report one "feature: importance" line per selected feature.
print("Feature importances of the top 20 selected features:")
for feature, importance in sorted_feature_importance.items():
    print(f"{feature}: {importance}")

Feature importances of the top 20 selected features:
Age Group: 0.23809381096216797
BMI: 0.07039939235044029
Income: 0.0660021660036188
Pneumonia Vaccine: 0.06224467719542843
Arthritis: 0.05579822749657169
Hours of Sleeping: 0.0557480903675899
Days Drinking: 0.055424455457025755
Ethnicity_White: 0.052087797598521314
Num of Bad Mental Health Days: 0.04426062702196869
Flu Vaccine: 0.03940289439664135
Tetanus Last 10 Years: 0.038747154903264455
Age Started Smoking: 0.03227555000062279
Had COVID: 0.030167551538606424
Cigarettes per Day: 0.029871858351774166
Gender: 0.029672553288737556
Insulin: 0.022135887755072928
Metropolitan: 0.02106508629569346
Married: 0.019941536708986372
Years Since Last Checkup: 0.018496303916176914
Deaf: 0.018164378391090804


# Select top 30 features

In [4]:
# Base estimator handed to RFE; seeded for repeatable results.
model = RandomForestClassifier(random_state=42)

# Recursive feature elimination down to 30 features; X_new is the training
# data restricted to the columns that survived.
rfe = RFE(estimator=model, n_features_to_select=30)
X_new = rfe.fit_transform(X_train, y_train)

# Fit the forest on the reduced matrix so it exposes one importance per
# retained column.
model.fit(X_new, y_train)
feature_importances = model.feature_importances_

# Columns kept by RFE are the ones ranked 1; recover their names.
selected_features = X_train.columns[rfe.ranking_ == 1].tolist()

# Pair each retained column name with its importance (positional pairing;
# assumes X_new keeps the original column order -- TODO confirm).
feature_importance_dict = {
    name: score for name, score in zip(selected_features, feature_importances)
}

print(selected_features)

['Gender', 'Age Group', 'Num of Bad Mental Health Days', 'Years Since Last Checkup', 'Exercise in Past 30 Days', 'Hours of Sleeping', 'Heart Disease', 'Asthma', 'Depression', 'Kidney Disease', 'Arthritis', 'Diabetes', 'Married', 'Deaf', 'Blind', 'Age Started Smoking', 'Cigarettes per Day', 'Days Drinking', 'Flu Vaccine', 'Pneumonia Vaccine', 'Tetanus Last 10 Years', 'Had COVID', 'Metropolitan', 'Income', 'Insulin', 'BMI', 'Ethnicity_Hispanic', 'Ethnicity_White', 'Education_attended_college', 'Education_graduated_college']


In [5]:
# Sort the feature importances from highest to lowest
sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Display the feature importances. The header names the 30-feature selection
# this cell reports on; it previously said "top 10", a leftover from the cell
# it was copied from.
print("Feature importances of the top 30 selected features:")
for feature, importance in sorted_feature_importance.items():
    print(f"{feature}: {importance}")

Feature importances of the top 10 selected features:
Age Group: 0.21936064525432694
Pneumonia Vaccine: 0.06101564749077503
BMI: 0.05676233823362321
Income: 0.05335230282931468
Days Drinking: 0.050224008302660744
Ethnicity_White: 0.05013458282544247
Hours of Sleeping: 0.04676175451138773
Arthritis: 0.045201880750900446
Num of Bad Mental Health Days: 0.039216640823885214
Tetanus Last 10 Years: 0.03382889523179323
Flu Vaccine: 0.033759008969977404
Age Started Smoking: 0.02751240582882533
Cigarettes per Day: 0.026836667617601073
Gender: 0.026670950859586148
Had COVID: 0.02611082187197725
Metropolitan: 0.01794051237727514
Married: 0.01776958193865674
Years Since Last Checkup: 0.017109908759581915
Deaf: 0.016686093445087112
Depression: 0.015838106186621504
Exercise in Past 30 Days: 0.01561024936510931
Insulin: 0.015472394693307478
Asthma: 0.014174861908464753
Heart Disease: 0.013330843097078686
Kidney Disease: 0.01153221314102009
Diabetes: 0.010436619950055412
Education_graduated_college: 0.

# Select top 41 features

In [6]:
# Base estimator handed to RFE; seeded for repeatable results.
model = RandomForestClassifier(random_state=42)

# Recursive feature elimination with a target of 41 features; X_new is the
# training data restricted to the columns that survived.
rfe = RFE(estimator=model, n_features_to_select=41)
X_new = rfe.fit_transform(X_train, y_train)

# Fit the forest on the reduced matrix so it exposes one importance per
# retained column.
model.fit(X_new, y_train)
feature_importances = model.feature_importances_

# Columns kept by RFE are the ones ranked 1; recover their names.
selected_features = X_train.columns[rfe.ranking_ == 1].tolist()

# Pair each retained column name with its importance (positional pairing;
# assumes X_new keeps the original column order -- TODO confirm).
feature_importance_dict = {
    name: score for name, score in zip(selected_features, feature_importances)
}

print(selected_features)

['Gender', 'Age Group', 'Num of Bad Mental Health Days', 'Could Afford Doctor', 'Years Since Last Checkup', 'Exercise in Past 30 Days', 'Hours of Sleeping', 'Heart Attack', 'Heart Disease', 'Stroke', 'Asthma', 'Depression', 'Kidney Disease', 'Arthritis', 'Diabetes', 'Married', 'Deaf', 'Blind', 'Smoked 100', 'Smokeless Tobacco', 'Age Started Smoking', 'Cigarettes per Day', 'Days Drinking', 'Flu Vaccine', 'Pneumonia Vaccine', 'Tetanus Last 10 Years', 'Had COVID', 'Metropolitan', 'Income', 'Insulin', 'BMI', 'Ethnicity_American Indian and Alaskan Native', 'Ethnicity_Asian and Pacific Islander', 'Ethnicity_Black', 'Ethnicity_Hispanic', 'Ethnicity_Multiracial, non-Hispanic', 'Ethnicity_White', 'Education_attended_college', 'Education_did_not_graduate_high_school', 'Education_graduated_college', 'Education_graduated_high_school']


In [7]:
# Rebuild the name -> importance mapping ordered from most to least important.
sorted_feature_importance = {
    name: feature_importance_dict[name]
    for name in sorted(feature_importance_dict, key=feature_importance_dict.get, reverse=True)
}

# Report one "feature: importance" line per selected feature.
print("Feature importances of the top 41 selected features:")
for feature, importance in sorted_feature_importance.items():
    print(f"{feature}: {importance}")

Feature importances of the top 41 selected features:
Age Group: 0.19404565046422337
Pneumonia Vaccine: 0.06153400810111713
BMI: 0.05262453580382287
Income: 0.0500557602703951
Ethnicity_White: 0.046528424353637916
Days Drinking: 0.045932114425565836
Hours of Sleeping: 0.045369664577126295
Arthritis: 0.043834836224335635
Num of Bad Mental Health Days: 0.036461950724925585
Flu Vaccine: 0.0355748443157517
Tetanus Last 10 Years: 0.03230594751586536
Gender: 0.02596771598710079
Had COVID: 0.024865174694881476
Age Started Smoking: 0.024189656929890502
Cigarettes per Day: 0.023262541407478795
Years Since Last Checkup: 0.020828114860192645
Metropolitan: 0.01785376867010825
Married: 0.01728576228624887
Depression: 0.015664063496423644
Exercise in Past 30 Days: 0.015041058033033666
Deaf: 0.014839507907729511
Insulin: 0.014571864831854147
Asthma: 0.014070656239549032
Kidney Disease: 0.013256680546641735
Heart Disease: 0.010762324603889022
Diabetes: 0.009622673848484985
Education_graduated_college: 