In [1]:
import os
import pandas as pd

base_dir = "../../Data"
base_csv_path = os.path.join(base_dir, 'm4_imputed.csv')
output_csv_path = os.path.join(base_dir, 'm4_post_feat_sel.csv')
assert os.path.exists(base_csv_path), f"base {base_csv_path} does not exist"

df = pd.read_csv(base_csv_path)

In [2]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

mean_wl_time = df['wl_time'].mean()
print(f"Mean of wl_time: {mean_wl_time}")

X = df.drop(['wl_time'], axis=1)
y = df['wl_time'].apply(lambda x: 1 if x > mean_wl_time else 0)

Mean of wl_time: 168.5


In [3]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = [col for col in X.columns if X[col].nunique() > 2]
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

In [4]:
log = LogisticRegression(max_iter=1000)

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Fitting estimator with 125 features.
Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fi

In [5]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])

X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]
df_sel = pd.concat([X_new, y], axis=1)

log.fit(X_new, y)  # Fit to get weight coefficients

for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: wgt_kg_tcr_missing, Ranking: 1, Support: True
Feature: hemo_co_tcr_missing, Ranking: 1, Support: True
Feature: init_age_missing, Ranking: 1, Support: True
Feature: wl_time_missing, Ranking: 1, Support: True
Feature: inotropic_missing, Ranking: 1, Support: True
Feature: gender_F, Ranking: 1, Support: True
Feature: gender_M, Ranking: 1, Support: True
Feature: abo_B, Ranking: 1, Support: True
Feature: education_4.0, Ranking: 1, Support: True
Feature: education_5.0, Ranking: 1, Support: True
Feature: education_996.0, Ranking: 1, Support: True
Feature: iabp_tcr_1, Ranking: 1, Support: True
Feature: inotropes_tcr_0, Ranking: 1, Support: True
Feature: diab_1.0, Ranking: 1, Support: True
Feature: histry_cig_old_N, Ranking: 1, Support: True
Feature: histry_cig_old_missing, Ranking: 1, Su

In [6]:
#Print the weights of the model with the corresponding feature name
weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: most_rcnt_creat, Weight: -0.9197207185221191
Feature: histry_cig_old_U, Weight: -0.7724102176047659
Feature: education_6.0, Weight: -0.7368199590218572
Feature: init_stat_missing, Weight: -0.7356322024604485
Feature: gender_M, Weight: -0.6946455780445374
Feature: abo_A, Weight: -0.6548517947378141
Feature: education_5.0, Weight: -0.5775717545065048
Feature: hgt_cm_tcr_missing, Weight: -0.5687615698349054
Feature: init_hgt_cm_calc_missing, Weight: -0.5687615698349054
Feature: num_prev_tx_0, Weight: -0.5255340159324525
Feature: inotropes_tcr_1, Weight: -0.49460398211492906
Feature: func_stat_tcr, Weight: -0.001277939171602899
Feature: init_hgt_cm_calc, Weight: 0.02255510128809141
Feature: hgt_cm_tcr, Weight: 0.023407955813162202
Feature: ethcat_1, Weight: 0.5895627812835844
Feature: inotropes_tcr_0, Weight: 0.6051865828676247
Feature: gender_F, Weight: 0.8052281789560818
Feature: diab_2.0, Weight: 0.8906573431035766
Feature: education_998.0, Weight: 1.247175284559296
Feature: ab

In [7]:
df_sel.to_csv(output_csv_path, index=False)