In [136]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2


In [138]:
# Read the cleaned dataset from disk and preview its first five rows.
df_cleaned = pd.read_csv(filepath_or_buffer='data cleaned.csv')
df_cleaned.head(5)


Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,num,sex_Male,dataset_Hungary,dataset_VA Long Beach,...,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,0.940446,0.74976,-0.262867,0.029124,1.069475,-0.718306,-0.770537,1,0,0,...,0,1,1,0,0,0,0,0,0,0
1,1.384143,1.596354,0.747722,-1.790447,0.380309,2.487269,0.857665,1,0,0,...,0,0,0,0,0,1,1,0,1,0
2,1.384143,-0.661231,-0.339138,-0.880662,1.327912,1.418744,0.043564,1,0,0,...,0,0,0,0,0,1,1,0,0,1
3,-1.943588,-0.096835,0.061285,1.632079,2.103224,-0.718306,-0.770537,1,0,0,...,1,0,0,1,0,0,0,0,1,0
4,-1.499891,-0.096835,-0.81583,0.982232,0.294163,-0.718306,-0.770537,0,0,0,...,0,0,0,0,0,0,0,1,1,0


# Feature Selection #

In [141]:
# Create a binary 'target' column where 0 = no disease and 1 = disease.
# NOTE: 'num' in this file is standardized (the df_cleaned.head() preview shows
# values such as -0.770537, 0.043564 and 0.857665), so `num > 0` would really
# test "above the column mean" rather than "original num > 0". The lowest level
# is treated as the no-disease class instead, which holds for raw and scaled
# values alike -- assumes at least one no-disease row is present; TODO confirm.
df_cleaned['target'] = (df_cleaned['num'] > df_cleaned['num'].min()).astype(int)

# Separate the features (X) from the new binary target (y). Both 'num' and
# 'target' are removed from X so the label is not part of the features.
X = df_cleaned.drop(columns=['num', 'target'])
y = df_cleaned['target']


In [143]:
# Fit a random forest on every candidate feature.
#   n_estimators -> number of trees in the forest
#   random_state -> fixed seed so the run is reproducible
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X, y)
model

In [145]:
# Pair every feature name with the importance score the fitted forest gave it.
feature_names = X.columns
importances = model.feature_importances_

# Build the table and order it from most to least important in one chain.
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)



In [147]:
# Show the ranked feature list under its heading.
print("Feature Importance Ranking:", feature_importance_df, sep="\n")

Feature Importance Ranking:
                     feature  importance
5                         ca    0.121773
3                     thalch    0.121155
4                    oldpeak    0.090692
0                        age    0.088284
19    thal_reversable defect    0.082909
18               thal_normal    0.081677
1                   trestbps    0.080639
2                       chol    0.078263
15                exang_True    0.064546
17           slope_upsloping    0.035840
10            cp_non-anginal    0.033027
16                slope_flat    0.029943
6                   sex_Male    0.029065
13            restecg_normal    0.022402
9         cp_atypical angina    0.014768
11         cp_typical angina    0.014364
12                  fbs_True    0.009233
14  restecg_st-t abnormality    0.001157
8      dataset_VA Long Beach    0.000245
7            dataset_Hungary    0.000017


* As the importance ranking shows, the most important features include, for example: (thalch, ca, oldpeak, thal_normal, age, thal_reversable defect).
* It also shows that some features carry very little importance, such as the last 4 features in the ranking,
 so it is recommended that we drop them to avoid overfitting the model.


In [150]:
# Columns to remove: the four lowest-ranked entries of the importance table.
features_to_drop = ['fbs_True', 'dataset_Hungary', 'restecg_st-t abnormality', 'dataset_VA Long Beach']

# Remove the low-importance columns; the result goes into a separate frame
# (df_reduced) so df_cleaned itself stays untouched.
df_reduced = df_cleaned.drop(columns=features_to_drop)

# NOTE: both numbers below are column counts, so they also include the
# 'num' and 'target' columns that are still present in both frames.
n_columns_before = len(df_cleaned.columns)
n_columns_after = len(df_reduced.columns)
print("Original number of features:", n_columns_before)
print("Number of features after dropping:", n_columns_after)
print("\nFirst 5 rows of the new DataFrame with reduced features:")
print(df_reduced.head(5))

Original number of features: 22
Number of features after dropping: 18

First 5 rows of the new DataFrame with reduced features:
        age  trestbps      chol    thalch   oldpeak        ca       num  \
0  0.940446  0.749760 -0.262867  0.029124  1.069475 -0.718306 -0.770537   
1  1.384143  1.596354  0.747722 -1.790447  0.380309  2.487269  0.857665   
2  1.384143 -0.661231 -0.339138 -0.880662  1.327912  1.418744  0.043564   
3 -1.943588 -0.096835  0.061285  1.632079  2.103224 -0.718306 -0.770537   
4 -1.499891 -0.096835 -0.815830  0.982232  0.294163 -0.718306 -0.770537   

   sex_Male  cp_atypical angina  cp_non-anginal  cp_typical angina  \
0         1                   0               0                  1   
1         1                   0               0                  0   
2         1                   0               0                  0   
3         1                   0               1                  0   
4         0                   1               0                  0   



#  Recursive Feature Elimination (RFE)

In [153]:
# Rebuild the binary target and the feature matrix so this section can be read
# (and run) on its own.
# NOTE: 'num' is standardized in this file (the df_cleaned.head() preview shows
# values such as -0.770537 and 0.043564), so `num > 0` would really test "above
# the column mean". The lowest level is treated as the no-disease class instead
# -- assumes at least one no-disease row is present; TODO confirm.
df_cleaned['target'] = (df_cleaned['num'] > df_cleaned['num'].min()).astype(int)
X = df_cleaned.drop(columns=['num', 'target'])
y = df_cleaned['target']


In [155]:
# Base model that RFE will refit repeatedly: a seeded 100-tree random forest.
estimator = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [157]:
# Configure RFE to keep 10 features (an example value), removing one feature per step.
rfe_selector = RFE(
    estimator,
    step=1,
    n_features_to_select=10,
)

In [159]:
# Run the recursive elimination on the data; fit() hands back the selector itself.
rfe_selector = rfe_selector.fit(X, y)
rfe_selector

In [160]:
# Names of the columns RFE kept (those whose support flag is set).
selected_features = [
    column for column, kept in zip(X.columns, rfe_selector.support_) if kept
]

# RFE rank of every column, kept and eliminated alike.
feature_ranking = pd.DataFrame(
    dict(feature=X.columns, ranking=rfe_selector.ranking_)
)


In [161]:
# Report how many features RFE kept, which ones, and the full ranking (best rank first).
ranking_sorted = feature_ranking.sort_values('ranking')
print("Number of features selected by RFE:", rfe_selector.n_features_)
print("\nSelected Features (Rank 1):", selected_features, sep="\n")
print("\nFull Feature Ranking:", ranking_sorted, sep="\n")

Number of features selected by RFE: 10

Selected Features (Rank 1):
['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'exang_True', 'slope_upsloping', 'thal_normal', 'thal_reversable defect']

Full Feature Ranking:
                     feature  ranking
0                        age        1
17           slope_upsloping        1
15                exang_True        1
18               thal_normal        1
5                         ca        1
19    thal_reversable defect        1
3                     thalch        1
2                       chol        1
1                   trestbps        1
4                    oldpeak        1
10            cp_non-anginal        2
6                   sex_Male        3
16                slope_flat        4
13            restecg_normal        5
11         cp_typical angina        6
9         cp_atypical angina        7
12                  fbs_True        8
14  restecg_st-t abnormality        9
8      dataset_VA Long Beach       10
7            dataset

* The RFE result shows the most effective features for the model: those with a ranking of 1.
* It also shows that any feature with a rank greater than 1 is less important; such features may act as noise and cause overfitting.
* So I will use the `rfe_selector.support_` attribute to easily create a new DataFrame that contains only the features with a rank of 1.

In [163]:
# Collect the names of the columns RFE selected, as a plain Python list.
selected_features = X.columns[rfe_selector.support_].tolist()

In [164]:
# Add the target column to the list of columns to keep. The membership check
# makes this cell idempotent: re-running it no longer appends 'target' a second
# time, which would duplicate the column in any DataFrame indexed with this list.
if 'target' not in selected_features:
    selected_features.append('target')

In [165]:
# Keep only the RFE-selected columns plus the target in a new frame.
df_rfe_selected = df_cleaned.loc[:, selected_features]

In [166]:
# NOTE: both numbers are column counts; df_cleaned still contains 'num' and
# 'target', and df_rfe_selected contains 'target'.
n_columns_before = len(df_cleaned.columns)
n_columns_rfe = len(df_rfe_selected.columns)
print("Original number of features:", n_columns_before)
print("Number of features after RFE selection:", n_columns_rfe)
print("\nFirst 5 rows of the new DataFrame with RFE-selected features:")
print(df_rfe_selected.head(5))

Original number of features: 22
Number of features after RFE selection: 11

First 5 rows of the new DataFrame with RFE-selected features:
        age  trestbps      chol    thalch   oldpeak        ca  exang_True  \
0  0.940446  0.749760 -0.262867  0.029124  1.069475 -0.718306           0   
1  1.384143  1.596354  0.747722 -1.790447  0.380309  2.487269           1   
2  1.384143 -0.661231 -0.339138 -0.880662  1.327912  1.418744           1   
3 -1.943588 -0.096835  0.061285  1.632079  2.103224 -0.718306           0   
4 -1.499891 -0.096835 -0.815830  0.982232  0.294163 -0.718306           0   

   slope_upsloping  thal_normal  thal_reversable defect  target  
0                0            0                       0       0  
1                0            1                       0       1  
2                0            0                       1       1  
3                0            1                       0       0  
4                1            1                       0       0  


* Now we have a new DataFrame (df_rfe_selected) that contains only the most important features for modeling.

# Chi-Square Test

In [177]:
# Re-create the binary target column (0 = no disease, 1 = disease).
# NOTE: 'num' is standardized in this file (the df_cleaned.head() preview shows
# values such as -0.770537 and 0.043564), so `num > 0` would really test "above
# the column mean". The lowest level is treated as the no-disease class instead
# -- assumes at least one no-disease row is present; TODO confirm.
df_cleaned['target'] = (df_cleaned['num'] > df_cleaned['num'].min()).astype(int)

In [181]:
# Only the categorical columns take part in the Chi-Square test; they are
# listed here, one source variable per row, and become X for the test.
categorical_features = [
    'sex_Male',
    'dataset_Hungary', 'dataset_VA Long Beach',
    'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina',
    'fbs_True',
    'restecg_normal', 'restecg_st-t abnormality',
    'exang_True',
    'slope_flat', 'slope_upsloping',
    'thal_normal', 'thal_reversable defect',
]


# Binary target and the categorical-only feature matrix for the Chi-Square test.
y = df_cleaned['target']
X_categorical = df_cleaned.loc[:, categorical_features]

In [183]:
# Run the Chi-Square test of each categorical column against the target.
chi_scores, p_values = chi2(X_categorical, y)

# Gather the per-feature statistic and p-value into one table.
chi_results = pd.DataFrame(
    dict(
        feature=X_categorical.columns,
        chi_score=chi_scores,
        p_value=p_values,
    )
)

In [185]:
# Order by p-value, smallest first, so the most significant features come first.
chi_results = chi_results.sort_values('p_value', ascending=True)

In [189]:
# Show the Chi-Square table under its heading.
print("Chi-Square Test Results:", chi_results, sep="\n")

Chi-Square Test Results:
                     feature  chi_score       p_value
13    thal_reversable defect  43.564613  4.101897e-11
12               thal_normal  37.743269  8.069468e-10
9                 exang_True  36.486887  1.536967e-09
11           slope_upsloping  23.686096  1.133989e-06
4             cp_non-anginal  20.523113  5.891553e-06
10                slope_flat  20.127240  7.245734e-06
3         cp_atypical angina  15.576274  7.924274e-05
0                   sex_Male   7.629523  5.742048e-03
7             restecg_normal   4.060108  4.390769e-02
5          cp_typical angina   2.382734  1.226827e-01
8   restecg_st-t abnormality   1.307116  2.529176e-01
1            dataset_Hungary   1.151079  2.833234e-01
2      dataset_VA Long Beach   1.151079  2.833234e-01
6                   fbs_True   0.000009  9.975523e-01


* As we see in the chi-squared results, the most significant features are the first 9 features, which have a p-value less than 0.05 (the 10th, cp_typical angina, has p ≈ 0.12); this is enough to treat them as the most significant candidates for the FINAL features.

# Selecting The Final Features

In [193]:
# Create the binary target column from 'num' (0 = no disease, 1 = disease).
# NOTE: 'num' is standardized in this file (the df_cleaned.head() preview shows
# values such as -0.770537 and 0.043564), so `num > 0` would really test "above
# the column mean". The lowest level is treated as the no-disease class instead
# -- assumes at least one no-disease row is present; TODO confirm.
df_cleaned['target'] = (df_cleaned['num'] > df_cleaned['num'].min()).astype(int)

In [196]:
# Final feature list for modeling, chosen by combining the results of all the
# feature selection methods used above.
final_features = [
    'oldpeak', 'thalch', 'ca', 'age', 'chol', 'trestbps',
    'thal_reversable defect', 'exang_True', 'cp_atypical angina',
    'sex_Male', 'thal_normal', 'cp_non-anginal',
]


In [198]:
# Add the target column to the list of columns to keep. The membership check
# makes this cell idempotent: re-running it no longer appends 'target' a second
# time, which would duplicate the column in any DataFrame indexed with this list.
if 'target' not in final_features:
    final_features.append('target')

In [200]:
# Build the final frame from the chosen columns only.
df_final = df_cleaned.loc[:, final_features]

In [202]:
# NOTE: both numbers are column counts; df_cleaned still contains 'num' and
# 'target', and df_final contains 'target'.
n_columns_before = len(df_cleaned.columns)
n_columns_final = len(df_final.columns)
print("Original number of columns:", n_columns_before)
print("Final number of features for modeling:", n_columns_final)
print("\nFirst 5 rows of the final DataFrame:")
print(df_final.head(5))

Original number of columns: 22
Final number of features for modeling: 13

First 5 rows of the final DataFrame:
    oldpeak    thalch        ca       age      chol  trestbps  \
0  1.069475  0.029124 -0.718306  0.940446 -0.262867  0.749760   
1  0.380309 -1.790447  2.487269  1.384143  0.747722  1.596354   
2  1.327912 -0.880662  1.418744  1.384143 -0.339138 -0.661231   
3  2.103224  1.632079 -0.718306 -1.943588  0.061285 -0.096835   
4  0.294163  0.982232 -0.718306 -1.499891 -0.815830 -0.096835   

   thal_reversable defect  exang_True  cp_atypical angina  sex_Male  \
0                       0           0                   0         1   
1                       0           1                   0         1   
2                       1           1                   0         1   
3                       0           0                   0         1   
4                       0           0                   1         0   

   thal_normal  cp_non-anginal  target  
0            0               0

* Here are the most relevant features for modeling, stored as (df_final).

In [207]:
# Persist the final frame for the upcoming steps; the row index is not written.
df_final.to_csv(path_or_buf='final_dataset.csv', index=False)