In [20]:
import os
import pandas as pd

base_dir = "../../Data"
base_csv_path = os.path.join(base_dir, 'm1_imputed.csv')
output_csv_path = os.path.join(base_dir, 'm1_post_feat_sel.csv')
assert os.path.exists(base_csv_path), f"base {base_csv_path} does not exist"

df = pd.read_csv(base_csv_path)

In [21]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

mean_wl_time = df['wl_time'].mean()
print(f"Mean of wl_time: {mean_wl_time}")

X = df.drop(['wl_time'], axis=1)
y = df['wl_time'].apply(lambda x: 1 if x > mean_wl_time else 0)

Mean of wl_time: 192.52839426430694


In [22]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = [col for col in X.columns if X[col].nunique() > 2]
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

In [23]:
log = LogisticRegression(max_iter=1000)

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Fitting estimator with 125 features.
Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fi

In [24]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])

X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]
df_sel = pd.concat([X_new, y], axis=1)

log.fit(X_new, y)  # Fit to get weight coefficients

for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: thoracic_dgn, Ranking: 1, Support: True
Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: tot_serum_album, Ranking: 1, Support: True
Feature: hemo_co_tcr, Ranking: 1, Support: True
Feature: init_stat, Ranking: 1, Support: True
Feature: init_age, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: init_wgt_kg_calc, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: thoracic_dgn_missing, Ranking: 1, Support: True
Feature: wgt_kg_tcr_missing, Ranking: 1, Support: True
Feature: hgt_cm_tcr_missing, Ranking: 1, Support: True
Feature: func_stat_tcr_missing, Ranking: 1, Support: True
Feature: most_rcnt_creat_missing, Ranking: 1, Support: True
Feature: tot_serum_album_missing, Ranking: 1, Support: True
Feature: hemo_co_tcr_missing, Ranking: 1, Support: True
Feature: init

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
#Print the weights of the model with the corresponding feature name
weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: inotropes_tcr_1, Weight: -0.49948787484737517
Feature: abo_AB, Weight: -0.444520348664826
Feature: icu_Y, Weight: -0.4261257503828402
Feature: inotropic_missing, Weight: -0.3760069376902345
Feature: tot_serum_album_missing, Weight: -0.30690856967712005
Feature: iabp_tcr_1, Weight: -0.22740847277584214
Feature: hemo_co_tcr_missing, Weight: -0.1709182092503493
Feature: work_income_tcr_N, Weight: -0.1686376646283263
Feature: academic_level_tcr_missing, Weight: -0.1647339809659294
Feature: ventilator_tcr_1, Weight: -0.14183255596445069
Feature: abo_B, Weight: -0.14149837841027624
Feature: gender_F, Weight: -0.1322472052677724
Feature: cig_use_missing, Weight: -0.12819107830795273
Feature: education_998.0, Weight: -0.12738733266840668
Feature: num_prev_tx_0, Weight: -0.11467664885844933
Feature: education_996.0, Weight: -0.11336515105022314
Feature: prior_card_surg_tcr_U, Weight: -0.10793485597109626
Feature: histry_cig_old_N, Weight: -0.10322744997847577
Feature: rvad_at_listing_1

In [26]:
df_sel.to_csv(output_csv_path, index=False)