In [20]:
import pandas as pd

In [21]:
df = pd.read_csv("../../Data/model_0_pre_feat_selection.csv")

In [22]:

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler



#Create X without wl_time
X = df.drop(['wl_time'], axis=1)

wl_time = df['wl_time']
mean_wl_time = wl_time.mean()
print(f"Mean of wl_time: {mean_wl_time}")
wl_time = wl_time.apply(lambda x: 1 if x > mean_wl_time else 0)

y = wl_time

#import log
log = LogisticRegression(max_iter=1000)

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Mean of wl_time: 192.52839426430694
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator 

In [23]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])
for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: thoracic_dgn, Ranking: 1, Support: True
Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: tot_serum_album, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: init_wgt_kg_calc, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: num_prev_tx_0, Ranking: 1, Support: True
Feature: num_prev_tx_1, Ranking: 1, Support: True
Feature: num_prev_tx_2, Ranking: 1, Support: True
Feature: num_prev_tx_3, Ranking: 1, Support: True
Feature: num_prev_tx_4, Ranking: 1, Support: True
Feature: num_prev_tx_5, Ranking: 1, Support: True
Feature: num_prev_tx_6, Ranking: 1, Support: True
Feature: num_prev_tx_7, Ranking: 1, Support: True
Feature: num_prev_tx_10, Ranking: 1, Support: True
Feature: tah_N, Ranking: 1, Support: True
Feature: tah_Y, Ranking: 1, Support: True
Feature: tah_missi

In [None]:
# Remove all features that don't have support
X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]

#Print number of features in X_new
print(f"Num of features in X_new: {X_new.shape[1]}")

#Print if X_new contains wl_time column
if 'wl_time' in X_new.columns:
    print("X_new contains wl_time column")

# Concatenate X_new and y
df_new = pd.concat([X_new, y], axis=1)


In [29]:
log.fit(X_new, y)

# Print the weights of the model with the corresponding feature name

weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: icu_missing, Weight: -1.1887211948420369
Feature: abo_A1B, Weight: -0.9515599410318385
Feature: dial_ty_tcr_missing, Weight: -0.920243628703519
Feature: abo_A2B, Weight: -0.8803609753664234
Feature: inotropic_missing, Weight: -0.7779818500289889
Feature: vas_Y, Weight: -0.6504225980620125
Feature: abo_AB, Weight: -0.60940229837659
Feature: academic_level_tcr_3.0, Weight: -0.5433563558902137
Feature: inotropes_tcr_1, Weight: -0.5168954006369489
Feature: diab_missing, Weight: -0.48802669849386204
Feature: ecmo_tcr_1, Weight: -0.45282604959477324
Feature: num_prev_tx_6, Weight: -0.4170139354198818
Feature: iabp_tcr_1, Weight: -0.4034411025548296
Feature: num_prev_tx_10, Weight: -0.3810784307311349
Feature: vas_N, Weight: -0.3729030505807167
Feature: onvent_Y, Weight: -0.3686182387329694
Feature: tah_Y, Weight: -0.3648305803561656
Feature: num_prev_tx_3, Weight: -0.3420714777503039
Feature: histry_cig_old_N, Weight: -0.3377139288215613
Feature: education_998.0, Weight: -0.30179188

In [30]:
df_new.to_csv(f"../../Data/model_0_post_feat_selection.csv", index=False)