# 03 - Feature Selection

In [None]:
from src.utils import load_heart_data, split_features_target, get_feature_groups
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import numpy as np
import scipy.sparse as sp
# Load the dataset through the project helper. The synthetic-data arguments
# are forwarded as-is (presumably a fallback sample file; confirm in src.utils).
df = load_heart_data(
    'data/heart_disease.csv',
    allow_synthetic=True,
    synthetic_path='data/sample_heart_disease.csv',
)
X, y = split_features_target(df)
num, cat = get_feature_groups(X)

# Numeric columns: median imputation, then standardisation.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]

# 'num' is listed before 'cat', so the one-hot columns come last in the output.
pre = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=num_steps), num),
    ('cat', Pipeline(steps=cat_steps), cat),
])
X_prep = pre.fit_transform(X)
# Chi-squared selection restricted to the one-hot encoded categorical columns.
# Normalise any sparse output to CSR before the column slice below.
if sp.issparse(X_prep):
    X_prep = X_prep.tocsr()

onehot = pre.named_transformers_['cat']['onehot']

# The encoder is fitted on the imputer's output rather than on X directly, so
# without explicit input names it reports placeholders such as "x0_1".
# Supplying the categorical column names yields readable labels; fall back to
# the defaults if the counts disagree (e.g. the imputer dropped a column).
cat_names = [str(col) for col in cat]
if len(cat_names) != onehot.n_features_in_:
    cat_names = None
oh_names = onehot.get_feature_names_out(cat_names)

# 'cat' is the last transformer in `pre`, so its columns are the trailing
# `oh_n` columns of the preprocessed matrix.
oh_n = oh_names.shape[0]
oh_start = X_prep.shape[1] - oh_n
X_oh = X_prep[:, oh_start:]

# Keep at most 10 one-hot columns, ranked by their chi-squared score against y.
skb = SelectKBest(score_func=chi2, k=min(10, X_oh.shape[1]))
X_oh_sel = skb.fit_transform(X_oh, y)
selected_mask = skb.get_support()
selected_oh_names = oh_names[selected_mask]
selected_oh_names

In [None]:
# Recursive feature elimination over the whole preprocessed matrix (numeric
# and one-hot columns together), using logistic regression as the ranking
# estimator and keeping at most 10 features.
base_est = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=base_est, n_features_to_select=min(10, X_prep.shape[1]))
rfe.fit(X_prep, y)
# Count the retained columns with a vectorised reduction and show a plain int
# rather than iterating the boolean mask element by element in Python.
int(rfe.get_support().sum())