In [16]:
import os
import pandas as pd

# Location of the imputed data set, relative to this notebook.
base_dir = "../../Data"
base_csv_path = os.path.join(base_dir, 'm4_imputed.csv')

# Stop early with a readable message if the CSV is not where we expect it.
assert os.path.exists(base_csv_path), (
    f"{base_csv_path} does not exist"
)

# Load the full table into memory; every later cell works on `df`.
df = pd.read_csv(filepath_or_buffer=base_csv_path)

In [17]:
# Draw a box plot of wl_time to inspect its spread and extreme values
df.boxplot(column=["wl_time"])

<Axes: >

In [18]:
# Outlier fences for wl_time using the 1.5 * IQR rule.
Q1 = df['wl_time'].quantile(0.25)
Q3 = df['wl_time'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows outside either fence (strictly below the lower or strictly above the
# upper bound). Kept as a separate frame for inspection; only the high side
# is actually removed below.
outliers = df[(df['wl_time'] < lower_bound) | (df['wl_time'] > upper_bound)]

#Number of values in df before removing outliers
print("Number of values in df before removing outliers: ", len(df))

# Remove the rows whose wl_time is higher than the upper bound.
# `<=` keeps rows sitting exactly on the fence: they are not outliers under
# the strict `>` test used for `outliers` above, so they must not be dropped.
# `.copy()` makes the result an independent frame, so assigning to its
# columns later does not act on a slice of the unfiltered data.
# NOTE: this overwrites `df`; re-running the cell recomputes the fences on
# the already-filtered data and trims it further.
df = df[df['wl_time'] <= upper_bound].copy()

#Number of values in df after removing outliers
print("Number of values in df after removing outliers: ", len(df))

Number of values in df before removing outliers:  77410
Number of values in df after removing outliers:  70243


In [19]:
# Number of missing values in wl_time before recoding
print(f'Number of missing values in wl_time: {df["wl_time"].isnull().sum()}')

# Number of non-null values in wl_time
print(f'Number of values in wl_time: {df["wl_time"].count()}')

# The median is the cut-off used to turn wl_time into a binary target
wl_median = df['wl_time'].median()

# Median for wl_time
print(f'Median for wl_time: {wl_median}')

# Code wl_time as 0 if it is strictly less than the median, else 1.
# Done as one vectorised comparison instead of a per-row Python lambda:
# everything that is not strictly below the median (ties included) becomes 1.
# NOTE: this overwrites wl_time in place, so re-running the cell would
# recode the 0/1 values against their own median.
df['wl_time'] = (~(df['wl_time'] < wl_median)).astype('int64')

# Percentage of 0 and 1 in wl_time
print(f'Percentage of 0 and 1 in wl_time: \n{df["wl_time"].value_counts(normalize=True) * 100}')

# Number of missing values in wl_time after recoding
print(f'Number of missing values in wl_time: {df["wl_time"].isnull().sum()}')

Number of missing values in wl_time: 0
Number of values in wl_time: 70243
Median for wl_time: 65.0
Percentage of 0 and 1 in wl_time: 
1    50.088977
0    49.911023
Name: wl_time, dtype: float64
Number of missing values in wl_time: 0


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# Target is the binarised wl_time; every other column is a feature.
y = df['wl_time']
X = df.drop(columns=['wl_time'])

# Columns with exactly two distinct values are treated as binary/one-hot
# indicators and left unscaled; all remaining feature columns are scaled.
one_hot_cols = [name for name in df.columns if df[name].nunique() == 2]
columns_to_scale = [name for name in X.columns if name not in one_hot_cols]

# NOTE(review): the scaler is fitted on every row before cross-validation,
# so the scaling statistics include rows that later act as held-out folds.
# Consider moving the scaling into a Pipeline - TODO confirm.
scaler = StandardScaler()
scaler.fit(X[columns_to_scale])
X[columns_to_scale] = scaler.transform(X[columns_to_scale])

In [21]:
from sklearn.model_selection import cross_val_predict

# Baseline classifier: logistic regression with the iteration cap set to 1000.
log = LogisticRegression(max_iter=1000)

# Mean binary F1 across 5 cross-validation folds
f1_scores = cross_val_score(estimator=log, X=X, y=y, scoring='f1', cv=5).mean()
print(f"F1 score pre feature selection: {f1_scores}")

# Mean macro-averaged F1 across the same 5-fold setup
f1_macro_scores = cross_val_score(estimator=log, X=X, y=y, scoring='f1_macro', cv=5).mean()
print(f"F1 macro score pre feature selection: {f1_macro_scores}")

F1 score pre feature selection: 0.6469359783429764
F1 macro score pre feature selection: 0.643097831478922


In [22]:
import numpy as np

# Fit on the full data set so the coefficients can be inspected.
log.fit(X, y)

# Coefficients of the fitted model: one log-odds change per feature
# (index 0 because the target is binary, so there is a single row).
coefficients = log.coef_[0]

# Exponentiating a log-odds coefficient gives the odds ratio.
# The non-binary columns were standardised earlier, so for those columns the
# ratio is per one standard deviation, not per raw unit.
odds_ratios = np.exp(coefficients)

# Display the odds ratio for each feature
for feature, odds_ratio in zip(X.columns, odds_ratios):
    print(f"Feature: {feature}, Odds Ratio: {odds_ratio}")

# The cross-validated F1 scores were already computed in the preceding cell
# with the same estimator settings, data and folds. cross_val_score works on
# clones of the estimator, so the fit above does not change them. Report the
# stored values instead of repeating ten more model fits.
print(f"F1 score pre feature selection: {f1_scores}")
print(f"F1 macro score pre feature selection: {f1_macro_scores}")


Feature: thoracic_dgn, Odds Ratio: 0.8694288812361786
Feature: wgt_kg_tcr, Odds Ratio: 1.2939929318165633
Feature: hgt_cm_tcr, Odds Ratio: 1.5243529690934392
Feature: func_stat_tcr, Odds Ratio: 0.8745717785445133
Feature: most_rcnt_creat, Odds Ratio: 1.0003408190115952
Feature: tot_serum_album, Odds Ratio: 1.2444548788987724
Feature: hemo_co_tcr, Odds Ratio: 0.9897295152967239
Feature: init_stat, Odds Ratio: 1.0455796106305872
Feature: init_age, Odds Ratio: 1.000105096986775
Feature: init_hgt_cm_calc, Odds Ratio: 0.6566330597103299
Feature: init_wgt_kg_calc, Odds Ratio: 1.1985497198951618
Feature: num_prev_tx_0, Odds Ratio: 0.907236689671749
Feature: num_prev_tx_1, Odds Ratio: 0.9037034449540554
Feature: num_prev_tx_2, Odds Ratio: 0.734016259802478
Feature: num_prev_tx_3, Odds Ratio: 1.7852223103197078
Feature: num_prev_tx_4, Odds Ratio: 1.0561045887361604
Feature: num_prev_tx_5, Odds Ratio: 0.8634131705039959
Feature: num_prev_tx_6, Odds Ratio: 0.9352448043645389
Feature: num_prev_tx_

In [23]:
import seaborn as sns

# Out-of-fold predictions: each row is predicted by a model that did not
# see it during training (5-fold cross-validation).
y_pred = cross_val_predict(log, X, y, cv=5)

# Confusion matrix of true vs. predicted classes, drawn as an annotated heatmap
conf_matrix = confusion_matrix(y, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues')

<Axes: >