In [40]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import roc_auc_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from select_cohort import get_cohort
# Root folder of the dataset; joined with VERSION to form the data directory handed to get_cohort.
DATA_ROOT_PATH = "mimiciv"
# Dataset version, used as the sub-folder name under DATA_ROOT_PATH.
VERSION = "2.0"
# Location of the ICD9-to-ICD10 mapping file, passed as the last argument to get_cohort.
ICD9_MAPPING_PATH = "utils/ICD9_to_ICD10_mapping.txt"

In [41]:
# Cohort-selection settings; they are passed positionally to get_cohort in this order.
# NOTE: these are plain assignments, so they must not end with a comma.
# `name = value,` binds a one-element tuple, and a tuple such as (False,) is
# always truthy -- with trailing commas none of these flags could be turned off.
icd_code: str = "I50"  # select cohort on this ICD10 code. I50=Heart Failure.
root: bool = True  # whether to select on only the root ICD code (first three symbols).
demographic_data: bool = True  # whether to add demographic data from the patients table.
admission_data: bool = True  # whether to add admissions data from the admissions table.
diagnoses_data: bool = True  # whether to add diagnoses data from the diagnoses_icd table.
omr_data: bool = True  # whether to add outpatient measurement data from the omr table.
medication_data: bool = True  # whether to add medication data from the prescriptions table.

# Build the cohort of hospital visits. The data directory handed to
# get_cohort is <DATA_ROOT_PATH>/<VERSION>; the ICD9 mapping path goes last.
visit_df = get_cohort(
    icd_code,
    root,
    demographic_data,
    admission_data,
    diagnoses_data,
    omr_data,
    medication_data,
    os.path.join(DATA_ROOT_PATH, VERSION),
    ICD9_MAPPING_PATH,
)
visit_df

selecting cohort...
adding demographic data...
adding admissions data...
adding #diagnoses data...
adding omr data...
adding #medication data...


Unnamed: 0,subject_id,hadm_id,gender,age,anchor_year,anchor_year_group,dod,yob,min_valid_year,admittime,...,hospital_expire_flag,los,los_hours,n_diagnoses,chartdate_x,bmi,chartdate_y,bp_systolic,bp_diastolic,n_medications
0,12024697,20302177,M,83,2110,2008 - 2010,2111-05-24,2027,2119,2109-12-14 22:50:00,...,0,31 days 16:03:00,760.050000,16,NaT,,NaT,,,269.0
1,14779071,20963511,F,85,2110,2017 - 2019,,2025,2110,2110-01-12 19:02:00,...,0,9 days 16:12:00,232.200000,31,2110-01-13,20.4,2110-03-10,118.0,59.0,86.0
2,15918556,27863083,M,77,2110,2008 - 2010,2110-05-11,2033,2119,2110-01-13 20:03:00,...,0,0 days 19:01:00,19.016667,8,NaT,,NaT,,,11.0
3,12794063,22640655,F,35,2110,2017 - 2019,,2075,2110,2110-01-17 01:29:00,...,0,2 days 11:32:00,59.533333,5,2110-01-17,32.5,NaT,,,13.0
4,13201095,28453791,F,88,2110,2014 - 2016,2110-01-25,2022,2113,2110-01-18 14:46:00,...,1,6 days 18:54:00,162.900000,22,NaT,,NaT,,,118.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64298,16703369,27928947,F,69,2201,2008 - 2010,,2132,2210,2211-11-09 02:26:00,...,0,2 days 15:49:00,63.816667,15,2212-02-29,23.4,2211-05-21,121.0,57.0,32.0
64299,15273135,25809426,F,71,2200,2008 - 2010,,2129,2209,2211-12-02 23:03:00,...,0,6 days 17:26:00,161.433333,17,2211-12-03,39.7,2211-12-02,158.0,65.0,41.0
64300,11973788,27306647,F,71,2206,2011 - 2013,,2135,2212,2212-01-19 15:43:00,...,0,4 days 01:38:00,97.633333,29,2212-01-19,39.9,2212-01-27,144.0,64.0,43.0
64301,11973788,23238116,F,71,2206,2011 - 2013,,2135,2212,2212-01-28 12:08:00,...,0,4 days 05:40:00,101.666667,18,2212-01-28,29.2,2212-01-27,144.0,64.0,42.0


In [42]:
# Columns kept for modelling. The final entry, "hospital_expire_flag", is the
# outcome label and is split off into `y` below.
include_features = [
    "age", "bmi",
    "admission_type", "admission_location",
    "insurance", "race", "marital_status", "gender",
    "n_diagnoses", "n_medications", "los_hours",
    "bp_systolic", "bp_diastolic",
    "hospital_expire_flag",
]

# Keep only those columns, discard every visit with a missing value in any of
# them, and renumber the remaining rows from 0.
visit_df = (
    visit_df.loc[:, include_features]
    .dropna()
    .reset_index(drop=True)
)

# Target for within-visit mortality prediction: 1 where the flag equals 1, else 0.
y = visit_df["hospital_expire_flag"].eq(1).astype(int)

# The label must not remain among the predictors.
visit_df = visit_df.drop(columns="hospital_expire_flag")

visit_df

Unnamed: 0,age,bmi,admission_type,admission_location,insurance,race,marital_status,gender,n_diagnoses,n_medications,los_hours,bp_systolic,bp_diastolic
0,83,21.7,EW EMER.,EMERGENCY ROOM,Other,BLACK/AFRICAN,SINGLE,F,26,205.0,529.216667,150.0,72.0
1,91,19.3,URGENT,TRANSFER FROM HOSPITAL,Medicare,WHITE,MARRIED,M,28,96.0,300.250000,97.0,52.0
2,70,32.0,EW EMER.,EMERGENCY ROOM,Other,WHITE,MARRIED,M,5,30.0,61.800000,135.0,70.0
3,64,30.7,OBSERVATION ADMIT,PHYSICIAN REFERRAL,Other,WHITE,MARRIED,M,5,13.0,73.000000,143.0,81.0
4,81,30.9,URGENT,TRANSFER FROM HOSPITAL,Medicare,WHITE,WIDOWED,F,29,38.0,113.500000,154.0,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43152,69,23.4,OBSERVATION ADMIT,TRANSFER FROM HOSPITAL,Medicare,WHITE,MARRIED,F,15,32.0,63.816667,121.0,57.0
43153,71,39.7,EW EMER.,WALK-IN/SELF REFERRAL,Medicare,BLACK/AFRICAN AMERICAN,SINGLE,F,17,41.0,161.433333,158.0,65.0
43154,71,39.9,OBSERVATION ADMIT,PHYSICIAN REFERRAL,Medicare,WHITE - OTHER EUROPEAN,MARRIED,F,29,43.0,97.633333,144.0,64.0
43155,71,29.2,EW EMER.,PHYSICIAN REFERRAL,Medicare,WHITE - OTHER EUROPEAN,MARRIED,F,18,42.0,101.666667,144.0,64.0


In [43]:
# Build the model matrix X: columns that can be cast to float are passed
# through unchanged (and recorded in numerical_cols so they can be scaled
# later); every other column is one-hot encoded.
feature_frames = []
numerical_cols = []
for col in visit_df.columns:
    try:
        numeric = visit_df[[col]].astype(float)
    except (ValueError, TypeError):
        # Only a failed float conversion marks a column as categorical; a bare
        # `except:` would also swallow KeyboardInterrupt and unrelated errors.
        ohe = OneHotEncoder(sparse_output=False)
        data = ohe.fit_transform(visit_df[[col]])
        # Reuse visit_df's index so the column-wise concat below aligns row for
        # row even if visit_df is not indexed 0..n-1.
        data = pd.DataFrame(data, columns=ohe.get_feature_names_out(), index=visit_df.index)
        feature_frames.append(data)
    else:
        feature_frames.append(numeric)
        numerical_cols.append(col)
X = pd.concat(feature_frames, axis=1)

X

Unnamed: 0,age,bmi,admission_type_AMBULATORY OBSERVATION,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,...,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,gender_F,gender_M,n_diagnoses,n_medications,los_hours,bp_systolic,bp_diastolic
0,83.0,21.7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,26.0,205.0,529.216667,150.0,72.0
1,91.0,19.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,28.0,96.0,300.250000,97.0,52.0
2,70.0,32.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,5.0,30.0,61.800000,135.0,70.0
3,64.0,30.7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,5.0,13.0,73.000000,143.0,81.0
4,81.0,30.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,29.0,38.0,113.500000,154.0,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43152,69.0,23.4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,15.0,32.0,63.816667,121.0,57.0
43153,71.0,39.7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,17.0,41.0,161.433333,158.0,65.0
43154,71.0,39.9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,29.0,43.0,97.633333,144.0,64.0
43155,71.0,29.2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,18.0,42.0,101.666667,144.0,64.0


In [44]:
# ML experiment 1: ROC AUC on the original (imbalanced) class distribution.
seed = 0

# Stratified 70/30 train/test split, shuffled with a fixed seed.
x_tr, x_te, y_tr, y_te = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    stratify=y,
    random_state=seed,
)

# Standardise only the numeric columns; the scaler is fitted on the training
# part and then applied to both parts.
scaler = StandardScaler()
scaler.fit(x_tr[numerical_cols])
x_tr[numerical_cols] = scaler.transform(x_tr[numerical_cols])
x_te[numerical_cols] = scaler.transform(x_te[numerical_cols])

model = LogisticRegression(random_state=seed).fit(x_tr, y_tr)

# Score the held-out part using the second predict_proba column (label 1).
pred_probs = model.predict_proba(x_te)
auc = roc_auc_score(y_te, pred_probs[:, 1])
float(auc)

0.852472301086068

In [45]:
# perform ML experiment (F1 on rebalanced dataset)
#
# The balanced sample is built under its own names (X_bal / y_bal) and `y` is
# renamed on a copy, so the full X / y stay intact. Rebinding X / y here would
# make any earlier cell that is re-run afterwards silently use the subsample.
df = pd.concat([X, y.rename("target")], axis=1)
case_df = df[df.target == 1]
control_df = df[df.target == 0]
# Downsample the controls to the number of cases (1:1 ratio).
control_df = control_df.sample(len(case_df), random_state=seed)
df = pd.concat([case_df, control_df])
y_bal = df.target
X_bal = df.drop(columns="target")

x_tr, x_te, y_tr, y_te = train_test_split(
    X_bal, y_bal, stratify=y_bal, train_size=0.7, random_state=seed
)
# Work on explicit copies so the column assignments below never write into a
# slice of X_bal (avoids pandas' SettingWithCopyWarning).
x_tr, x_te = x_tr.copy(), x_te.copy()

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
x_tr[numerical_cols] = scaler.fit_transform(x_tr[numerical_cols])
x_te[numerical_cols] = scaler.transform(x_te[numerical_cols])

model = LogisticRegression(random_state=seed).fit(x_tr, y_tr)
preds = model.predict(x_te)
f1 = f1_score(y_te, preds)
f1


0.748730964467005