In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [2]:
%%capture
%run  "C:\Users\Lenovo\Desktop\Heart_Disease_Project_SPRINTSXMICROSOFT\notebooks\01_data_preprocessing.ipynb"

In [3]:
# 1. Use XGBoost feature importance scores to rank variables.

xgmodel=XGBClassifier()
xgmodel.fit(X_cleaned,y_cleaned)

importances=pd.DataFrame({
    'Feature':X_cleaned.columns,
    'Importances':xgmodel.feature_importances_
})
importances=importances.sort_values(by ='Importances',ascending =False)

# keep N top features -> N is chosen to be 9 <-
selected_features_impScores = importances['Feature'].head(11).tolist()
print(f"Selected 11 Features by importance scores: {list(selected_features_impScores)}")

Selected 11 Features by importance scores: ['thal', 'cp', 'ca', 'sex', 'slope', 'oldpeak', 'fbs', 'thalach', 'exang', 'age', 'trestbps']


In [4]:
# 2. Apply Recursive Feature Elimination (RFE) to select the best predictors.
rfe = RFE(n_features_to_select=111,estimator=XGBClassifier())
rfe.fit(X_cleaned,y_cleaned)

# Get selected features
selected_features_RFE = X_cleaned.columns[rfe.support_]
print(f"Selected 13 Features by RFE: {list(selected_features_RFE)}")

Selected 13 Features by RFE: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


In [5]:
# 3. Use Chi-Square Test to check feature significance

# scale X using min-max scale
scaler_minmax = MinMaxScaler()
X_scaled_2 = scaler_minmax.fit_transform(X_cleaned)

chi2_selector = SelectKBest(score_func=chi2,k='all')
chi2_selector.fit(X_scaled_2,y_cleaned)

# Get scores and p-values
scores = chi2_selector.scores_
p_values = chi2_selector.pvalues_

# Combine with feature names
chi2_results = pd.DataFrame({
    'Feature': X.columns,
    'Chi2 Score': scores,
    'p-value': p_values
}).sort_values(by='Chi2 Score', ascending=False)

# keep N top features -> N is chosen to be 9<-
selected_features_chi2 = chi2_results['Feature'].head(11).tolist()
print(f"Selected 11 Features by importance scores: {list(selected_features_chi2)}")

Selected 11 Features by importance scores: ['thal', 'exang', 'ca', 'oldpeak', 'slope', 'cp', 'sex', 'fbs', 'restecg', 'thalach', 'age']


In [6]:
#4. Select only the most relevant features for modeling. 
# select most common 9 features out of 6 results of 11 features

joined=selected_features_impScores+selected_features_RFE.tolist()+selected_features_chi2
counter=Counter(joined)
most_common=[item[0] for item in counter.most_common(9)]
print(f"Selected 9 Features by 6 methods: {most_common}")

X_reduced = X_cleaned[most_common]

Selected 9 Features by 6 methods: ['thal', 'cp', 'ca', 'sex', 'slope', 'oldpeak', 'fbs', 'thalach', 'exang']


In [7]:
print(X_reduced)

     thal  cp   ca  sex  slope  oldpeak  fbs  thalach  exang
0     6.0   1  0.0    1      3      2.3    1      150      0
1     3.0   4  3.0    1      2      1.5    0      108      1
2     7.0   4  2.0    1      2      2.6    0      129      1
3     3.0   3  0.0    1      3      3.5    0      187      0
4     3.0   2  0.0    0      1      1.4    0      172      0
..    ...  ..  ...  ...    ...      ...  ...      ...    ...
297   7.0   4  0.0    0      2      0.2    0      123      1
298   7.0   1  0.0    1      2      1.2    0      132      0
299   7.0   4  2.0    1      2      3.4    1      141      0
300   7.0   4  1.0    1      2      1.2    0      115      1
301   3.0   2  1.0    0      2      0.0    0      174      0

[297 rows x 9 columns]
