In [40]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import roc_auc_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from select_cohort import get_cohort
# Root directory of the dataset; joined with VERSION (os.path.join) to form the
# data directory that is handed to get_cohort.
DATA_ROOT_PATH = "mimiciv"
# Version sub-directory appended to DATA_ROOT_PATH.
VERSION = "2.0"
# Path of the mapping file passed as the final argument to get_cohort
# (by its name, an ICD-9 -> ICD-10 code mapping).
ICD9_MAPPING_PATH = "utils/ICD9_to_ICD10_mapping.txt"

In [None]:
# Cohort-selection options.
# NOTE: these assignments must not end with a comma. `name: T = value,` binds a
# one-element tuple (("I50",), (True,), ...) instead of the scalar, and a tuple
# is always truthy, so even a flag written as `False,` would behave as enabled.
icd_code: str = "I50"  # select cohort on this ICD10 code. I50=Heart Failure.
root: bool = True  # whether to select on only the root ICD code (first three symbols).
demographic_data: bool = True  # whether to add demographic data from the patients table.
admission_data: bool = True  # whether to add admissions data from the admissions table.
diagnoses_data: bool = True  # whether to add diagnoses data from the diagnoses_icd table.
omr_data: bool = True  # whether to add outpatient measurement data from the omr table.
medication_data: bool = True  # whether to add medication data from the prescriptions table.

# Select a cohort of hospital visits. Arguments are passed positionally, in the
# order the original call used.
visit_df = get_cohort(
    icd_code,
    root,
    demographic_data,
    admission_data,
    diagnoses_data,
    omr_data,
    medication_data,
    os.path.join(DATA_ROOT_PATH, VERSION),
    ICD9_MAPPING_PATH,
)
visit_df

In [None]:
# Columns kept for modelling. The order is significant: it determines the
# column order of the feature matrix built later. The last entry is the label.
include_features = [
    "age", "bmi",
    "admission_type", "admission_location",
    "insurance", "race", "marital_status", "gender",
    "n_diagnoses", "n_medications", "los_hours",
    "bp_systolic", "bp_diastolic",
    "hospital_expire_flag",
]

# Restrict to those columns, discard every row with a missing value and
# renumber the remaining rows from zero.
visit_df = (
    visit_df.loc[:, include_features]
    .dropna()
    .reset_index(drop=True)
)

# Target: 1 when hospital_expire_flag equals 1, else 0 (within-visit mortality).
y = visit_df["hospital_expire_flag"].eq(1).astype(int)

# The label must not remain among the predictors.
visit_df = visit_df.drop(columns="hospital_expire_flag")

visit_df

In [None]:
# Build the feature matrix X column by column:
#   * columns that can be cast to float are kept as numeric features and their
#     names recorded in numerical_cols (used later for scaling);
#   * every other column is one-hot encoded.
X = []
numerical_cols = []
for col in visit_df.columns:
    try:
        numeric = visit_df[[col]].astype(float)
    except (ValueError, TypeError):
        # Only a failed float conversion marks the column as categorical; any
        # other exception (e.g. KeyboardInterrupt) is allowed to propagate
        # instead of being swallowed by a bare `except:`.
        ohe = OneHotEncoder(sparse_output=False)
        data = ohe.fit_transform(visit_df[[col]])
        # Reuse visit_df's index so the column-wise concat below aligns rows
        # even if the index is not a default RangeIndex.
        data = pd.DataFrame(
            data,
            columns=ohe.get_feature_names_out(),
            index=visit_df.index,
        )
        X.append(data)
    else:
        X.append(numeric)
        numerical_cols.append(col)
X = pd.concat(X, axis=1)

X

In [None]:
# perform ML experiment (ROC AUC on the original, imbalanced dataset)
seed = 0
x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, train_size=0.7, random_state=seed, shuffle=True
)
# Work on explicit copies: the split returns row selections of X, and assigning
# scaled columns into such selections can trigger pandas' SettingWithCopyWarning.
x_tr, x_te = x_tr.copy(), x_te.copy()

# Standardise the numerical columns only (one-hot columns are left untouched).
# The scaler is fitted on the training split and merely applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
x_tr[numerical_cols] = scaler.fit_transform(x_tr[numerical_cols])
x_te[numerical_cols] = scaler.transform(x_te[numerical_cols])

model = LogisticRegression(random_state=seed)
model.fit(x_tr, y_tr)

# Column 1 of predict_proba holds the probability of the positive class.
pred_probs = model.predict_proba(x_te)
auc = roc_auc_score(y_te, pred_probs[:, 1])
float(auc)

In [None]:
# perform ML experiment (F1 on rebalanced dataset)
# Relies on X, y, numerical_cols and seed defined by the earlier cells.
# rename() attaches the label name on a new Series instead of mutating y in place.
df = pd.concat([X, y.rename("target")], axis=1)
case_df = df[df.target == 1]
control_df = df[df.target == 0]
# Undersample the controls to the number of cases (1:1 ratio).
control_df = control_df.sample(len(case_df), random_state=seed)
df = pd.concat([case_df, control_df])
y = df.target
X = df.drop("target", axis=1)

x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, train_size=0.7, random_state=seed
)
# Work on explicit copies: the split returns row selections of X, and assigning
# scaled columns into such selections can trigger pandas' SettingWithCopyWarning.
x_tr, x_te = x_tr.copy(), x_te.copy()

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
x_tr[numerical_cols] = scaler.fit_transform(x_tr[numerical_cols])
x_te[numerical_cols] = scaler.transform(x_te[numerical_cols])

model = LogisticRegression(random_state=seed).fit(x_tr, y_tr)
preds = model.predict(x_te)
f1 = f1_score(y_te, preds)
f1
