In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("../Data_120294_2023-03-29_cleaned.csv")

In [6]:
mean_wl_time = df['wl_time'].mean()
print(f"Mean waiting list time: {mean_wl_time}")

df['wl_time'] = df['wl_time'].apply(lambda x: 0 if x < mean_wl_time else 1)
print(df['wl_time'].value_counts(normalize=True))

one_hot_cols = [col for col in df.columns if df[col].nunique() == 2]


Mean waiting list time: 192.52839426430694
0    0.716807
1    0.283193
Name: wl_time, dtype: float64


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

X = df.drop(['wl_time'], axis=1)
y = df['wl_time']

columns_to_scale = [col for col in X.columns if col not in one_hot_cols]
# scale all columns except the one-hot encoded ones
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

log = LogisticRegression(max_iter=1000)
f1_scores = cross_val_score(log, X, y, cv=5, scoring='f1').mean()
f1_macro_scores = cross_val_score(log, X, y, cv=5, scoring='f1_macro').mean()

print(f"F1 score: {f1_scores}")
print(f"F1 macro score: {f1_macro_scores}")

# plot confusion matrix
log.fit(X, y)
y_pred = log.predict(X)
confusion_matrix(y, y_pred)


F1 score: 0.4207234479936403
F1 macro score: 0.6316114297647631


In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
import numpy as np

# Perform k-fold cross-validation and compute the initial F1 score
scores = cross_val_score(log, X, y, cv=5, scoring='f1')
avg_score = np.mean(scores)
print("Initial F1 score: {:.4f}".format(avg_score))

selector = RFECV(estimator=log, cv=5, scoring='f1', verbose=1)
print(f"Num of features: {X.shape[1]}")

selector.fit(X, y)

print(f"Optimal number of features: {selector.n_features_}")
print(f"The mask of the selected features: {selector.support_}")
print(f"The ranking of the features: {selector.ranking_}")

Initial F1 score: 0.4207
Num of features: 82
Fitting estimator with 82 features.
Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator w

In [9]:
feature_rankings = list(zip(df.columns, selector.ranking_, selector.support_))
feature_rankings_sorted = sorted(feature_rankings, key=lambda x: x[1])
for feature, ranking, support in feature_rankings_sorted:
    print(f"Feature: {feature}, Ranking: {ranking}, Support: {support}")


Feature: thoracic_dgn, Ranking: 1, Support: True
Feature: wgt_kg_tcr, Ranking: 1, Support: True
Feature: hgt_cm_tcr, Ranking: 1, Support: True
Feature: func_stat_tcr, Ranking: 1, Support: True
Feature: most_rcnt_creat, Ranking: 1, Support: True
Feature: tot_serum_album, Ranking: 1, Support: True
Feature: hemo_co_tcr, Ranking: 1, Support: True
Feature: init_stat, Ranking: 1, Support: True
Feature: init_age, Ranking: 1, Support: True
Feature: init_hgt_cm_calc, Ranking: 1, Support: True
Feature: init_wgt_kg_calc, Ranking: 1, Support: True
Feature: wl_time, Ranking: 1, Support: True
Feature: tah_N, Ranking: 1, Support: True
Feature: tah_Y, Ranking: 1, Support: True
Feature: vas_N, Ranking: 1, Support: True
Feature: vas_Y, Ranking: 1, Support: True
Feature: onvent_N, Ranking: 1, Support: True
Feature: onvent_Y, Ranking: 1, Support: True
Feature: icu_N, Ranking: 1, Support: True
Feature: icu_Y, Ranking: 1, Support: True
Feature: inotropic_N, Ranking: 1, Support: True
Feature: inotropic_Y, Ra

In [10]:
df_selected = df.iloc[:, selector.support_]

#Print the shape of the new dataframe
print(df_selected.shape)

#Add the columns names to the new dataframe
df_selected.columns = df.columns[selector.support_]

#Print the names of the columns in the new dataframe
print(df_selected.columns)

#Do a logistic regression on the new dataframe
log = LogisticRegression(max_iter=1000)

#Fit the model
log.fit(df_selected, y)

#Print the weights of the model with the corresponding feature name
for i in range(len(log.coef_[0])):
    print(f"Feature: {df_selected.columns[i]}, Weight: {log.coef_[0][i]}")

# Perform k-fold cross-validation and compute the initial F1 score
scores = cross_val_score(log, X, y, cv=5, scoring='f1')
avg_score = np.mean(scores)
print("F1_score after feature selection: {:.4f}".format(avg_score))

# Create df from df_selected without "thoracic_dgn"
df_selected = df_selected.drop(['thoracic_dgn'], axis=1)
# Create a correlation matrix
corr_matrix = df_selected.corr()

IndexError: Boolean index has wrong length: 82 instead of 83