In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Load the cleaned dataset and separate the "target" column (labels)
# from every other column (features).
df = pd.read_csv("../data/heart_clean.csv")
y = df["target"]
X = df.drop(columns="target")

# Random Forest importance
# Fit a 200-tree forest with a fixed seed, then rank the columns of X by
# the fitted model's impurity-based importances, highest first.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X, y)
importances = rf.feature_importances_

# Pair each column name with its score; an in-place stable sort keeps the
# original column order among ties.
feat_imp = list(zip(X.columns, importances))
feat_imp.sort(key=lambda pair: pair[1], reverse=True)

print("Top features RF:", feat_imp[:10])

# RFE
# Recursive feature elimination with logistic regression, keeping 8 features.
# RFE ranks features by the magnitude of the estimator's coefficients, and
# those magnitudes are only comparable when every feature is on the same
# scale. Fitting on the raw columns biases elimination against features with
# a large numeric range, so the features are rescaled to [0, 1] first.
logreg = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(logreg, n_features_to_select=8)
X_rfe_scaled = MinMaxScaler().fit_transform(X)
rfe.fit(X_rfe_scaled, y)
# support_ is a boolean mask in the column order of X, so it still indexes
# X.columns directly even though the fit used the scaled array.
print("RFE Selected:", X.columns[rfe.support_].tolist())

# Chi2
# chi2 requires non-negative inputs, so the features are min-max scaled
# (default range [0, 1]) before scoring. Keep the 8 best-scoring columns.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

chi2_selector = SelectKBest(chi2, k=8).fit(X_scaled, y)

# get_support() returns a boolean mask aligned with the columns of X.
chi2_mask = chi2_selector.get_support()
print("Chi2 Selected:", X.columns[chi2_mask].tolist())

# Write the RFE-selected feature columns, followed by the "target" column,
# to a new CSV without the index.
rfe_columns = X.columns[rfe.support_].tolist()
df_rfe = df[[*rfe_columns, "target"]]
df_rfe.to_csv("../data/heart_selected.csv", index=False)


Top features RF: [('thal', np.float64(0.1298278830332388)), ('cp', np.float64(0.12286886842300201)), ('thalach', np.float64(0.12070939234146214)), ('ca', np.float64(0.11965177005546491)), ('oldpeak', np.float64(0.10297050875586732)), ('age', np.float64(0.09310446279186849)), ('chol', np.float64(0.07867322624273217)), ('trestbps', np.float64(0.07270929669649953)), ('exang', np.float64(0.04965159872816644)), ('slope', np.float64(0.046548659266172084))]
RFE Selected: ['sex', 'cp', 'fbs', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
Chi2 Selected: ['sex', 'cp', 'restecg', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
