In [44]:
import pandas as pd

In [45]:
df = pd.read_csv("../Data_120294_2023-03-29_cleaned.csv")

In [46]:
mean_wl_time = df['wl_time'].mean()
print(f"Mean waiting list time: {mean_wl_time}")

df['wl_time'] = df['wl_time'].apply(lambda x: 0 if x < mean_wl_time else 1)
print(df['wl_time'].value_counts(normalize=True))



Mean waiting list time: 192.52839426430694
0    0.716807
1    0.283193
Name: wl_time, dtype: float64


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

#Keep first 500 rows for faster processing
# df = df[:500]

X = df.drop(['wl_time'], axis=1)
y = df['wl_time']

one_hot_cols = [col for col in df.columns if df[col].nunique() == 2]
columns_to_scale = [col for col in X.columns if col not in one_hot_cols]
# scale all columns except the one-hot encoded ones
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

log = LogisticRegression(max_iter=1000)
f1_scores = cross_val_score(log, X, y, cv=5, scoring='f1').mean()
f1_macro_scores = cross_val_score(log, X, y, cv=5, scoring='f1_macro').mean()

print(f"Initial F1 score: {f1_scores}")
print(f"Initial F1 macro score: {f1_macro_scores}")

# plot confusion matrix
log.fit(X, y)
y_pred = log.predict(X)
confusion_matrix(y, y_pred)


Initial F1 score: 0.4207234479936403
Initial F1 macro score: 0.6316114297647631


array([[51243,  4245],
       [14736,  7186]])

In [48]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
import numpy as np
#Check if X contains wl_time column
if 'wl_time' in X.columns:
    print("X contains wl_time column")

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
selector.fit(X, y)

print(f"Num of features: {X.shape[1]}")
print(f"Num of features recommended after feature selection: {selector.n_features_}")

Fitting estimator with 82 features.
Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 fe

In [49]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])
for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

Feature: thoracic_dgn, Ranking: 1, Support: True
Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: tot_serum_album, Ranking: 1, Support: True
Feature: hemo_co_tcr, Ranking: 1, Support: True
Feature: init_stat, Ranking: 1, Support: True
Feature: init_age, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: init_wgt_kg_calc, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: tah_N, Ranking: 1, Support: True
Feature: tah_Y, Ranking: 1, Support: True
Feature: vas_N, Ranking: 1, Support: True
Feature: vas_Y, Ranking: 1, Support: True
Feature: onvent_N, Ranking: 1, Support: True
Feature: onvent_Y, Ranking: 1, Support: True
Feature: icu_N, Ranking: 1, Support: True
Feature: icu_Y, Ranking: 1, Support: True
Feature: inotropic_N, Ranking: 1, Support: True
Feature: inotropic_Y, Ra

In [50]:
# Remove all features that don't have support
X_new = df[[col for col in df.columns if col in X.columns[selector.support_]]]

#Print number of features in X_new
print(f"Num of features in X_new: {X_new.shape[1]}")

#Print if X_new contains wl_time column
if 'wl_time' in X_new.columns:
    print("X_new contains wl_time column")

log = LogisticRegression(max_iter=1000)
log.fit(X_new, y)

# Perform k-fold cross-validation and compute the initial F1 score
avg_score = cross_val_score(log, X_new, y, cv=5, scoring='f1').mean()
avg_score_macro = cross_val_score(log, X_new, y, cv=5, scoring='f1_macro').mean()
print(f"F1_score after feature selection: {avg_score}")
print(f'f1_macro_scores after feature selection: {avg_score_macro}')

Num of features in X_new: 79


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

F1_score after feature selection: 0.4125324240393315
f1_macro_scores after feature selection: 0.6261678303761762


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
#Print the weights of the model with the corresponding feature name
weights = list(zip(X_new.columns, log.coef_[0]))
weights_sorted = sorted(weights, key=lambda x: x[1])
for feature, weight in weights_sorted:
    print(f"Feature: {feature}, Weight: {weight}")

Feature: work_income_tcr_N, Weight: -0.9678089315509145
Feature: abo_AB, Weight: -0.684179938805338
Feature: inotropes_tcr_1, Weight: -0.5682821730319841
Feature: academic_level_tcr_3.0, Weight: -0.5487430856844796
Feature: education_996.0, Weight: -0.3393209865930409
Feature: academic_level_tcr_998.0, Weight: -0.31957487361415887
Feature: iabp_tcr_1, Weight: -0.29518645674106053
Feature: inotropic_N, Weight: -0.28241325123497435
Feature: tah_N, Weight: -0.23923932909024853
Feature: dial_ty_tcr_1.0, Weight: -0.2300457286732634
Feature: vas_N, Weight: -0.22789411492461137
Feature: icu_N, Weight: -0.22476052437973923
Feature: malig_tcr_N, Weight: -0.20660069321362803
Feature: ventilator_tcr_1, Weight: -0.1892022310203656
Feature: gender_F, Weight: -0.1883399258357557
Feature: cig_use_N, Weight: -0.17431678726257402
Feature: prior_card_surg_tcr_U, Weight: -0.16468900369117082
Feature: ethnicity_1, Weight: -0.15846805195506433
Feature: academic_level_tcr_1.0, Weight: -0.15822668093245973
F