In [8]:
import json

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler


In [9]:

# Load the cleaned dataset and split it into the feature matrix X and the label y.
# The path is relative (same "../processed" directory the save cell writes to) so the
# notebook does not depend on one user's absolute Windows path.
# NOTE(review): assumes the notebook is run from a sibling folder of "processed" — confirm.
df_cleaned = pd.read_csv("../processed/heart_cleaned.csv")
X = df_cleaned.drop(columns="target")  # every column except the label
y = df_cleaned['target']               # label column

Random Forest Classifier

In [10]:
# Rank the features by how much a random forest relies on them for prediction.
# The seed is fixed so the ranking is reproducible across runs.
model = RandomForestClassifier(random_state=42).fit(X, y)

# One row per feature, paired with the forest's feature_importances_ value.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})

# Most important features first.
importance = importance_table.sort_values(by='Importance', ascending=False)

print(importance)

     Feature  Importance
7    thalach    0.125853
11        ca    0.123930
2         cp    0.120115
12      thal    0.110071
9    oldpeak    0.094418
0        age    0.092003
4       chol    0.085047
3   trestbps    0.074026
8      exang    0.058648
10     slope    0.047428
1        sex    0.035504
6    restecg    0.021319
5        fbs    0.011637


RFE Selector

In [11]:
# Recursive feature elimination driven by a logistic-regression estimator,
# keeping 10 features.
# NOTE(review): RFE ranks by the estimator's coefficients, which are only comparable
# when features share a scale — confirm heart_cleaned.csv is already scaled.
model1 = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model1, n_features_to_select=10).fit(X, y)

# support_ is a boolean mask of the kept features; ranking_ is an integer rank
# where 1 marks a selected feature and larger numbers mean eliminated earlier.
rfe_table = pd.DataFrame({
    'Feature': X.columns,
    'Selected (RFE)': rfe.support_,
    'Ranking': rfe.ranking_,
})
rfe_results = rfe_table.sort_values(by='Ranking')

print(rfe_results)

     Feature  Selected (RFE)  Ranking
1        sex            True        1
2         cp            True        1
5        fbs            True        1
4       chol            True        1
6    restecg            True        1
10     slope            True        1
9    oldpeak            True        1
8      exang            True        1
12      thal            True        1
11        ca            True        1
7    thalach           False        2
3   trestbps           False        3
0        age           False        4


Chi-Square Test

In [12]:
# Chi-square test of each feature against the target (a further signal for prediction).
# sklearn's chi2 rejects negative inputs, so every column is min-max scaled first.
# NOTE(review): chi2 is meant for booleans / counts / frequencies; on min-max-scaled
# continuous columns the statistic is only a heuristic — confirm this is acceptable.
X_normalize = MinMaxScaler().fit_transform(X)

chi2_values, p_values = chi2(X_normalize, y)

chi2_res = pd.DataFrame({
    'Feature': X.columns,
    'Chi value': chi2_values,
    'p_value': p_values
})

# Join the three selector outputs (forest importance, RFE, chi-square) on the feature name.
combined = pd.merge(importance, rfe_results, on='Feature', how='outer')
combined_all = pd.merge(combined, chi2_res, on='Feature', how='outer')

# Missing entries are deliberately left as NaN instead of being filled with 0:
# a p_value of 0 would read as "maximally significant" and slip through a
# `p_value < 0.05` filter, whereas NaN fails every comparison and is excluded.
# This also avoids mutating the merged frame in place.

combined_all




Unnamed: 0,Feature,Importance,Selected (RFE),Ranking,Chi value,p_value
0,age,0.092003,False,4,0.984136,0.3211799
1,ca,0.12393,True,1,24.838104,6.235265e-07
2,chol,0.085047,True,1,0.253828,0.6143927
3,cp,0.120115,True,1,6.775822,0.009240091
4,exang,0.058648,True,1,35.470265,2.589739e-09
5,fbs,0.011637,True,1,0.159152,0.6899384
6,oldpeak,0.094418,True,1,14.441725,0.0001445636
7,restecg,0.021319,True,1,4.798597,0.02848292
8,sex,0.035504,True,1,8.312017,0.003938363
9,slope,0.047428,True,1,9.201527,0.002418133


In [13]:
# A feature is kept only when all three selectors agree on it.
rfe_kept = combined_all['Selected (RFE)'].eq(True)        # chosen by RFE
chi2_significant = combined_all['p_value'].lt(0.05)       # chi-square p-value below 0.05
forest_relevant = combined_all['Importance'].gt(0.01)     # forest importance above 0.01

final = combined_all.loc[rfe_kept & chi2_significant & forest_relevant]

# Plain Python list of the surviving feature names.
feature_names = final['Feature'].to_list()
print(feature_names)

['ca', 'cp', 'exang', 'oldpeak', 'restecg', 'sex', 'slope', 'thal']


Save Final Features 

In [14]:
# Persist the names of the features that passed all selection criteria.
# (json is imported in the notebook's top import cell.)
with open("../processed/feature_selection.json", "w") as f:
    json.dump(feature_names, f)

# Write the feature matrix and the target next to it, without the pandas index.
# NOTE(review): X still holds every column, not only feature_names — confirm that
# downstream code applies feature_selection.json itself.
X.to_csv('../processed/x_features.csv',index=False)
y.to_csv('../processed/target.csv',index=False)