In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("../Data_120294_2023-03-29_cleaned.csv")

In [4]:
mean_wl_time = df['wl_time'].mean()
print(f"Mean waiting list time: {mean_wl_time}")

df['wl_time'] = df['wl_time'].apply(lambda x: 0 if x < mean_wl_time else 1)
print(df['wl_time'].value_counts(normalize=True))

one_hot_cols = [col for col in df.columns if df[col].nunique() == 2]


Mean waiting list time: 192.52839426430694
0    0.716807
1    0.283193
Name: wl_time, dtype: float64


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

X = df.drop(['wl_time'], axis=1)
y = df['wl_time']

columns_to_scale = [col for col in X.columns if col not in one_hot_cols]
# scale all columns except the one-hot encoded ones
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

log = LogisticRegression(max_iter=1000)
f1_scores = cross_val_score(log, X, y, cv=5, scoring='f1').mean()
f1_macro_scores = cross_val_score(log, X, y, cv=5, scoring='f1_macro').mean()

print(f"F1 score: {f1_scores}")
print(f"F1 macro score: {f1_macro_scores}")

# plot confusion matrix
log.fit(X, y)
y_pred = log.predict(X)
confusion_matrix(y, y_pred)


F1 score: 0.4207234479936403
F1 macro score: 0.6316114297647631


array([[51243,  4245],
       [14736,  7186]])

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
import numpy as np

# Perform k-fold cross-validation and compute the initial F1 score
scores = cross_val_score(log, X, y, cv=5, scoring='f1')
avg_score = np.mean(scores)
print("Initial F1 score: {:.4f}".format(avg_score))

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
print(f"Num of features: {X.shape[1]}")

selector.fit(X, y)

print(f"Optimal number of features: {selector.n_features_}")
print(f"The mask of the selected features: {selector.support_}")
print(f"The ranking of the features: {selector.ranking_}")

Initial F1 score: 0.4207
Num of features: 82
Fitting estimator with 82 features.
Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator w

In [30]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])
for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")

df.sample(5)

Feature: thoracic_dgn, Ranking: 1, Support: True
Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: tot_serum_album, Ranking: 1, Support: True
Feature: hemo_co_tcr, Ranking: 1, Support: True
Feature: init_stat, Ranking: 1, Support: True
Feature: init_age, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: init_wgt_kg_calc, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: tah_N, Ranking: 1, Support: True
Feature: tah_Y, Ranking: 1, Support: True
Feature: vas_N, Ranking: 1, Support: True
Feature: vas_Y, Ranking: 1, Support: True
Feature: onvent_N, Ranking: 1, Support: True
Feature: onvent_Y, Ranking: 1, Support: True
Feature: icu_N, Ranking: 1, Support: True
Feature: icu_Y, Ranking: 1, Support: True
Feature: inotropic_N, Ranking: 1, Support: True
Feature: inotropic_Y, Ra

Unnamed: 0,thoracic_dgn,wgt_kg_tcr,hgt_cm_tcr,func_stat_tcr,most_rcnt_creat,tot_serum_album,hemo_co_tcr,init_stat,init_age,init_hgt_cm_calc,...,ventilator_tcr_0,ventilator_tcr_1,work_income_tcr_N,work_income_tcr_U,work_income_tcr_Y,academic_level_tcr_1.0,academic_level_tcr_2.0,academic_level_tcr_3.0,academic_level_tcr_996.0,academic_level_tcr_998.0
23527,1001.0,53.0703,167.64,2.0,0.8,3.8,6.0,2030.0,26.0,167.64,...,1,0,1,0,0,1,0,0,0,0
22286,1004.0,4.9,56.5,996.0,0.6,3.8,2.0896,2010.0,0.0,56.5,...,0,1,1,0,0,0,0,0,1,0
13931,1000.0,97.659,165.1,2070.0,0.7,4.0,4.9,2030.0,51.0,165.1,...,1,0,1,0,0,0,1,0,0,0
28481,1203.0,80.0,169.0,2.0,1.0,4.0,6.0,2030.0,31.0,169.0,...,1,0,1,0,0,0,0,0,0,1
6023,1007.0,84.3681,193.04,2070.0,1.2,3.8,4.8,2030.0,63.0,193.04,...,1,0,1,0,0,1,0,0,0,0


In [33]:


df_selected = df.drop(['dial_ty_tcr_999.0', 'prior_card_surg_tcr_U', 'education_996.0'], axis=1)

X_p = df_selected.drop(['wl_time'], axis=1)
y_p = df_selected['wl_time']

log = LogisticRegression(max_iter=1000)
log.fit(X_p, y_p)

#Print the weights of the model with the corresponding feature name
for i in range(len(log.coef_[0])):
    print(f"Feature: {df_selected.columns[i]}, Weight: {log.coef_[0][i]}")

log = LogisticRegression(max_iter=1000)
# Perform k-fold cross-validation and compute the initial F1 score
avg_score = cross_val_score(log, X_p, y_p, cv=5, scoring='f1').mean()
print(f"F1_score after feature selection: {avg_score}")

# Create a correlation matrix
corr_matrix = df_selected.corr()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature: thoracic_dgn, Weight: -0.0006423644663371674
Feature: wgt_kg_tcr, Weight: 0.005301462148329629
Feature: hgt_cm_tcr, Weight: 0.018482297766786365
Feature: func_stat_tcr, Weight: -0.0001279932479592956
Feature: most_rcnt_creat, Weight: -0.055939785431659596
Feature: tot_serum_album, Weight: 0.4676233203151908
Feature: hemo_co_tcr, Weight: 0.0426877077793388
Feature: init_stat, Weight: -0.00015004512933338535
Feature: init_age, Weight: -0.002781351697626133
Feature: init_hgt_cm_calc, Weight: -0.01910130760790335
Feature: init_wgt_kg_calc, Weight: 0.01112561788618512
Feature: wl_time, Weight: -0.21642445395531956
Feature: tah_N, Weight: -0.012547489282711186
Feature: tah_Y, Weight: -0.21141186022315991
Feature: vas_N, Weight: -0.017560083014072025
Feature: vas_Y, Weight: -0.09749314823227824
Feature: onvent_N, Weight: -0.1314787950029399
Feature: onvent_Y, Weight: -0.19874435406421928
Feature: icu_N, Weight: -0.030227589169759338
Feature: icu_Y, Weight: -0.24870439449241816
Featur

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

F1_score after feature selection: 0.41260395964449215
