In [2]:
import os
import pandas as pd

base_dir = "../Data"
base_csv_path = os.path.join(base_dir, 'm0_simpleimputer_week6.csv')
output_csv_path = os.path.join(base_dir, 'm0_post_feat_sel.csv')
assert os.path.exists(base_csv_path), f"base {base_csv_path} does not exist"

df = pd.read_csv(base_csv_path)

In [3]:
#Print number of missing values in wl_tme
print(f'Number of missing values in wl_time: {df["wl_time"].isnull().sum()}')

#Print number of values in wl_time
print(f'Number of values in wl_time: {df["wl_time"].count()}')

wl_median = df['wl_time'].median()

Number of missing values in wl_time: 0
Number of values in wl_time: 65067


In [4]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

X = df.drop(['wl_time'], axis=1)
y = df['wl_time'].apply(lambda x: 0 if x < wl_median else 1)

In [5]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = [col for col in X.columns if X[col].nunique() > 2]
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

In [6]:
log = LogisticRegression(max_iter=1000)

selector = RFECV(estimator=log, cv=5, scoring='f1_macro', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.


KeyboardInterrupt: 

In [None]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])

X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]
df_sel = pd.concat([X_new, df['wl_time']], axis=1)

log.fit(X_new, y)  # Fit to get weight coefficients

for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: onvent_N, Ranking: 1, Support: True
Feature: abo_A2, Ranking: 1, Support: True
Feature: abo_A2B, Ranking: 1, Support: True
Feature: ecmo_tcr_0, Ranking: 1, Support: True
Feature: inotropes_tcr_0, Ranking: 2, Support: False
Feature: abo_A1, Ranking: 3, Support: False
Feature: iabp_tcr_0, Ranking: 4, Support: False
Feature: num_prev_tx_2, Ranking: 5, Support: False
Feature: abo_B, Ranking: 6, Support: False
Feature: num_prev_tx_7, Ranking: 7, Support: False
Feature: icu_N, Ranking: 8, Support: False
Feature: wgt_kg_tcr, Ranking: 9, Support: False
Feature: education_6.0, Ranking: 10, Support: False
Feature: education_998.0, Ranking: 11, Support: False
Feature: dial_ty_tcr_2.0, Ranking: 12, Support: False
Feature: num_prev_tx_6, Ranking: 13, Support: False
Feature: abo_O, Ranking: 14, Support: False
Feature: abo_A1B, Ranking: 15, Support: False
Feature: ecmo_tcr_1, Ranking: 16, Support: False
Feature: hgt_cm_tcr, Ranking: 17, Support: False
Feature: init_hgt_cm_calc, Ranking: 18, 

In [None]:
#Print the weights of the model with the corresponding feature name
weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: ecmo_tcr_1, Weight: -1.4294289836615817
Feature: onvent_Y, Weight: -1.1220537811891662
Feature: abo_A2B, Weight: -1.0416320227438083
Feature: abo_AB, Weight: -0.9015052927397232


In [None]:
df_sel.to_csv(output_csv_path, index=False)