In [120]:
import os
import pandas as pd

base_dir = "../../Data"
base_csv_path = os.path.join(base_dir, 'm0_imputed.csv')
output_csv_path = os.path.join(base_dir, 'm0_post_feat_sel.csv')
assert os.path.exists(base_csv_path), f"base {base_csv_path} does not exist"

df = pd.read_csv(base_csv_path)

In [121]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

mean_wl_time = df['wl_time'].mean()
print(f"Mean of wl_time: {mean_wl_time}")

X = df.drop(['wl_time'], axis=1)
y = df['wl_time'].apply(lambda x: 1 if x > mean_wl_time else 0)

Mean of wl_time: 192.52839426430694


In [122]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = [col for col in X.columns if X[col].nunique() > 2]
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

In [123]:
log = LogisticRegression(max_iter=1000)

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estima

In [124]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])

X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]
df_sel = pd.concat([X_new, y], axis=1)

log.fit(X_new, y)  # Fit to get weight coefficients

for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: thoracic_dgn, Ranking: 1, Support: True
Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: tot_serum_album, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: init_wgt_kg_calc, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: num_prev_tx_0, Ranking: 1, Support: True
Feature: num_prev_tx_1, Ranking: 1, Support: True
Feature: num_prev_tx_2, Ranking: 1, Support: True
Feature: num_prev_tx_3, Ranking: 1, Support: True
Feature: num_prev_tx_4, Ranking: 1, Support: True
Feature: num_prev_tx_5, Ranking: 1, Support: True
Feature: num_prev_tx_6, Ranking: 1, Support: True
Feature: num_prev_tx_7, Ranking: 1, Support: True
Feature: num_prev_tx_10, Ranking: 1, Support: True
Feature: tah_N, Ranking: 1, Support: True
Feature: tah_Y, Ranking: 1, Support: True
Feature: tah_missi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [125]:
#Print the weights of the model with the corresponding feature name
weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: abo_AB, Weight: -0.9111677490773694
Feature: inotropic_missing, Weight: -0.7254108899861532
Feature: inotropes_tcr_1, Weight: -0.597860125398477
Feature: icu_Y, Weight: -0.5693654248621034
Feature: iabp_tcr_1, Weight: -0.48113620824120107
Feature: academic_level_tcr_missing, Weight: -0.37158156028228223
Feature: education_996.0, Weight: -0.3408913712524209
Feature: ventilator_tcr_1, Weight: -0.3271421106640328
Feature: histry_cig_old_N, Weight: -0.3152757011902236
Feature: num_prev_tx_0, Weight: -0.2636842887695939
Feature: cig_use_missing, Weight: -0.26102810764282147
Feature: rvad_at_listing_1.0, Weight: -0.24636403386490532
Feature: tah_N, Weight: -0.21311987545045197
Feature: prior_card_surg_tcr_U, Weight: -0.21065086720479706
Feature: ecmo_tcr_1, Weight: -0.20629204436636647
Feature: work_income_tcr_N, Weight: -0.19997048706717918
Feature: education_998.0, Weight: -0.19648796472012445
Feature: onvent_Y, Weight: -0.17910429750750842
Feature: malig_tcr_N, Weight: -0.1720545

In [126]:
df_sel.to_csv(output_csv_path, index=False)