In [11]:
import pandas as pd

In [12]:
df = pd.read_csv("../../Data/model_0.csv")

In [13]:
mean_wl_time = df['wl_time'].mean()
print(f"Mean waiting list time: {mean_wl_time}")

df['wl_time'] = df['wl_time'].apply(lambda x: 0 if x < mean_wl_time else 1)
print(df['wl_time'].value_counts(normalize=True))

Mean waiting list time: 0.28319338586745896
0    0.716807
1    0.283193
Name: wl_time, dtype: float64


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

#Keep first 500 rows for faster processing
df = df[:500]

X = df.drop(['wl_time'], axis=1)
y = df['wl_time']

one_hot_cols = [col for col in df.columns if df[col].nunique() == 2]
columns_to_scale = [col for col in X.columns if col not in one_hot_cols]
# scale all columns except the one-hot encoded ones
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

log = LogisticRegression(max_iter=1000)
f1_scores = cross_val_score(log, X, y, cv=5, scoring='f1').mean()
f1_macro_scores = cross_val_score(log, X, y, cv=5, scoring='f1_macro').mean()

print(f"Initial F1 score: {f1_scores}")
print(f"Initial F1 macro score: {f1_macro_scores}")

# plot confusion matrix
log.fit(X, y)
y_pred = log.predict(X)
confusion_matrix(y, y_pred)


Initial F1 score: 0.43087336634105
Initial F1 macro score: 0.6233297016483548


array([[328,  27],
       [ 78,  67]])

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
import numpy as np
#Check if X contains wl_time column
if 'wl_time' in X.columns:
    print("X contains wl_time column")

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator 

KeyboardInterrupt: 

In [None]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])
for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: hemo_co_tcr, Ranking: 1, Support: True
Feature: init_stat, Ranking: 1, Support: True
Feature: init_age, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: num_prev_tx_0, Ranking: 1, Support: True
Feature: num_prev_tx_10, Ranking: 1, Support: True
Feature: tah_N, Ranking: 1, Support: True
Feature: tah_Y, Ranking: 1, Support: True
Feature: tah_missing, Ranking: 1, Support: True
Feature: vas_N, Ranking: 1, Support: True
Feature: vas_missing, Ranking: 1, Support: True
Feature: onvent_missing, Ranking: 1, Support: True
Feature: icu_N, Ranking: 1, Support: True
Feature: icu_missing, Ranking: 1, Support: True
Feature: inotropic_N, Ranking: 1, Support: True
Feature: inotropic_Y, Ranking: 1, Support: True
Feature: inotropic_missing, Ranking: 1, Support: True
Feature: gender_F, Ranking: 1, Support: True
Feature: 

In [None]:
# Remove all features that don't have support
X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]

#Print number of features in X_new
print(f"Num of features in X_new: {X_new.shape[1]}")

#Print if X_new contains wl_time column
if 'wl_time' in X_new.columns:
    print("X_new contains wl_time column")

# Concatenate X_new and y
df_new = pd.concat([X_new, y], axis=1)

Num of features in X_new: 56


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

F1_score after feature selection: 0.47857620495006514
f1_macro_scores after feature selection: 0.656198959347118


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Print the weights of the model with the corresponding feature name
weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: abo_AB, Weight: -0.9602343775263509
Feature: inotropes_tcr_1, Weight: -0.8409785768464778
Feature: inotropic_missing, Weight: -0.81257318118478
Feature: ethcat_4, Weight: -0.7088972631450621
Feature: histry_cig_old_U, Weight: -0.6663349512736618
Feature: num_prev_tx_0, Weight: -0.6293584691094181
Feature: most_rcnt_creat, Weight: -0.6201834633183423
Feature: academic_level_tcr_missing, Weight: -0.5664215081887961
Feature: work_income_tcr_missing, Weight: -0.5560408921218347
Feature: diab_missing, Weight: -0.5086207805574804
Feature: dial_ty_tcr_missing, Weight: -0.5086207805574804
Feature: education_5.0, Weight: -0.49073467372308816
Feature: diab_5.0, Weight: -0.48040957272191875
Feature: tah_missing, Weight: -0.4670651270692421
Feature: cereb_vasc_N, Weight: -0.4623233287328797
Feature: gender_M, Weight: -0.45947260783025196
Feature: prior_card_surg_tcr_Y, Weight: -0.4551141582777658
Feature: inotropic_Y, Weight: -0.4168961350147655
Feature: dial_ty_tcr_1.0, Weight: -0.383630

In [None]:
df_new.to_csv(f"../../Data/model_0_selected.csv", index=False)