In [1]:
import pickle as pk
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV, chi2
from sklearn.model_selection import StratifiedKFold,train_test_split


In [2]:
data = pd.read_csv("heart_disease.csv")
X = data.iloc[:, :13]
y = data.iloc[:, 13]

feature_names = list(X.columns) if hasattr(X, 'columns') else [f'Feature_{i}' for i in range(X.shape[1])]

# Sacling,Splitting
X_scaled = StandardScaler().fit_transform(X)

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,random_state=42,test_size=0.2)

# Linea REFCV for optimal feature selection
xgb = XGBClassifier()
rfecv = RFECV(xgb, cv=StratifiedKFold(5), scoring='accuracy')
X_selected = rfecv.fit(X_train, y_train).transform(X_scaled)

# Show which features were selected
selected_feature_mask = rfecv.get_support()
selected_feature_indices = rfecv.get_support(indices=True)
selected_feature_names = [feature_names[i] for i in selected_feature_indices]
X_selected_df = pd.DataFrame(X_selected, columns=selected_feature_names)
X_selected_df.head()



Unnamed: 0,sex,cp,fbs,thalach,exang,oldpeak,ca,thal
0,0.691095,-2.240629,2.430427,0.017494,-0.696419,1.068965,-0.721976,0.655877
1,0.691095,0.87388,-0.41145,-1.816334,1.435916,0.381773,2.478425,-0.89422
2,0.691095,0.87388,-0.41145,-0.89942,1.435916,1.326662,1.411625,1.172577
3,0.691095,-0.164289,-0.41145,1.63301,-0.696419,2.099753,-0.721976,-0.89422
4,-1.44698,-1.202459,-0.41145,0.978071,-0.696419,0.295874,-0.721976,-0.89422


In [3]:
## chi sequare test applied on labels and discrete values
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_discretized = discretizer.fit_transform(X_scaled) # Use selected features to see whether their importance is by concidence

chi2_scores, p_values = chi2(X_discretized, y)

# Create results dataframe
results = pd.DataFrame({
    'Feature': feature_names,
    'Chi2_Score': chi2_scores,
    'P_Value': p_values,
    'Significant': p_values < 0.05  # if p_values < 0.05 then Significant = 1 (True) (important feature)
}).sort_values('Chi2_Score', ascending=False) 

results = results.query('Significant == True').T # Drop the coulmns with 0 Significant (Flase)
results

Unnamed: 0,12,8,11,9,10,1,2,5,6,7
Feature,thal,exang,ca,oldpeak,slope,sex,cp,fbs,restecg,thalach
Chi2_Score,192.764587,156.476728,154.058016,86.787258,54.133277,30.046584,30.024657,27.113845,26.052867,19.329907
P_Value,0.0,0.0,0.0,0.0,0.0,0.000005,0.000005,0.000019,0.000031,0.000677
Significant,True,True,True,True,True,True,True,True,True,True


Observations:
- RFE ---> only Eight features {sex,cp,fbs,thalach,exang,oldpeak,ca,thal}
- chi sequare test ---> Chol and trestbps are irrelevant ---> remaining features {age thal	exang	ca	oldpeak	slope	sex	cp	fbs	restecg	thalach}
- Better to use 11 remaining features for better model relevancy and performance 

In [4]:
data = data.drop(['chol','trestbps'],axis=1)
data['num'] = np.where(data['num']>0,1,0) # Binary Classification >>>> 1 --> Disease , 0 --> No Disease
data.to_csv("selected_features.csv")
data

Unnamed: 0,age,sex,cp,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,0,2,172,0,1.4,1,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
292,57,0,4,0,0,123,1,0.2,2,0.0,7.0,1
293,45,1,1,0,0,132,0,1.2,2,0.0,7.0,1
294,68,1,4,1,0,141,0,3.4,2,2.0,7.0,1
295,57,1,4,0,0,115,1,1.2,2,1.0,7.0,1
