# Feature Selection (Recursive Feature Selection)

## Importing Dependencies

In [18]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

## Defining Feature Selection Functions

In [38]:
## printing the shape and head
def head(df,shape_only=False):
    print(df.shape)

    if shape_only:
        return
    else:
        return df.head()
    
def save_csv(df, filename):
  """
  Saves the DataFrame to a CSV file.

  Args:
    df: A Pandas DataFrame.
    filename: The name of the CSV file to save.

  Returns:
    None.
  """

  df.to_csv(filename)

## Loading the Dataset and Declaring Variable

In [28]:
X_train_feature_engineered_df = pd.read_csv('../data/feature_engineering/X_train_feature_engineered.csv')
y_train = pd.read_csv('../data/feature_engineering/y_train.csv')

X_train_feature_engineered_df = X_train_feature_engineered_df.drop("Unnamed: 0", axis=1)
head(X_train_feature_engineered_df,shape_only=False)

(247083, 19)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,4.0,1.870076,-0.487092,0.00613,-0.953748,-0.213825,-1.454497
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,12.0,0.0,3.0,-0.910705,-1.663547,-0.204183,-0.734162,0.886072,-0.205256
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,4.0,-0.910705,1.083144,-0.471183,0.470102,-0.213825,-0.326905
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,12.0,0.0,2.0,-0.319105,1.370416,0.39376,-1.236844,0.436915,-0.452252
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,-0.319105,0.689364,0.39376,0.206115,-1.49722,0.027782


In [21]:
head(y_train,shape_only=False)

(247083, 2)


Unnamed: 0.1,Unnamed: 0,Heart_Disease
0,252191,0
1,93646,0
2,182562,1
3,288342,0
4,207357,0


## Defining the RFE Model

In [22]:
# Define the RFE model with a classifier (e.g., RandomForestClassifier)
rfe_model = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)

## Fitting the RFE Model to the Feature Engineered Training Data

In [29]:
# Fit the RFE model to your training data
X_train_feature_selected = rfe_model.fit_transform(X_train_feature_engineered_df, y_train["Heart_Disease"])

## Printing Ranking of Features Based on Importance

In [30]:
# Print the ranking of features based on importance
print("Feature Rankings:")
for feature, rank in zip(X_train_feature_engineered_df.columns, rfe_model.ranking_):
    print(f"Feature: {feature}, Rank: {rank}")

Feature Rankings:
Feature: 0, Rank: 6
Feature: 1, Rank: 8
Feature: 2, Rank: 14
Feature: 3, Rank: 5
Feature: 4, Rank: 15
Feature: 5, Rank: 7
Feature: 6, Rank: 10
Feature: 7, Rank: 13
Feature: 8, Rank: 11
Feature: 9, Rank: 12
Feature: 10, Rank: 2
Feature: 11, Rank: 9
Feature: 12, Rank: 4
Feature: 13, Rank: 3
Feature: 14, Rank: 1
Feature: 15, Rank: 1
Feature: 16, Rank: 1
Feature: 17, Rank: 1
Feature: 18, Rank: 1


## Comparing the Shape of the Dataset Before and After RFE

In [32]:
# Get the number of features in each DataFrame
num_features_selected = X_train_feature_selected.shape[1]
num_features_engineered = X_train_feature_engineered_df.shape[1]

# Compare the number of features
if num_features_selected == num_features_engineered:
  print("The number of features in X_train_feature_selected and X_train_feature_engineered_df are the same.")
else:
  print(f"The number of features in X_train_feature_selected is {num_features_selected}, while the number of features in X_train_feature_engineered_df is {num_features_engineered}.")

The number of features in X_train_feature_selected is 5, while the number of features in X_train_feature_engineered_df is 19.


## Viewing the Feature Selected Dataset After RFE 

In [37]:
# Convert the NumPy array to a pandas DataFrame
X_train_feature_selected_df = pd.DataFrame(X_train_feature_selected)

head(X_train_feature_selected_df)

(247083, 5)


Unnamed: 0,0,1,2,3,4
0,-0.487092,0.00613,-0.953748,-0.213825,-1.454497
1,-1.663547,-0.204183,-0.734162,0.886072,-0.205256
2,1.083144,-0.471183,0.470102,-0.213825,-0.326905
3,1.370416,0.39376,-1.236844,0.436915,-0.452252
4,0.689364,0.39376,0.206115,-1.49722,0.027782


## Saving the Feature Selected Dataset After RFE

In [40]:
save_csv(X_train_feature_selected_df, "../data/feature_selection_rfe/X_train_feature_selected.csv")
save_csv(y_train, "../data/feature_selection_rfe/y_train.csv")