In [None]:
import copy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import make_scorer, accuracy_score
# from pmlb import pmlb   
import time

# Path of the admissions CSV handed to read_file in the data-setup cell.
dataset = 'data/mimic/mimic4_admissions.csv'
# Forwarded to read_file as `sample_size`; np.inf presumably means "use every
# row" -- TODO confirm against io_mimic.read_file.
sample_size = np.inf
# Seed reused for the train/test split and for NumPy's global RNG.
random_state = 42

# train a model to predict admissions

In [None]:
from io_mimic import read_file

##################################################
# setup data
##################################################
# Load the dataset. The result is unpacked below as (features, labels) or
# (features, labels, groups), depending on how many items come back.
tmp = read_file(
    dataset,
    one_hot_encode = 'ohc',
    label='y',
    freetext_features=[ 'chiefcomplaint'],
#     label_features=[
#         'ethnicity',
#         'insurance'
#     ],
    sample_size = sample_size
)
if len(tmp) == 2:
    features, labels = tmp
elif len(tmp) == 3:
    # `groups` does not exist yet on a fresh kernel, so look it up defensively;
    # referencing the bare name here raised NameError before the assignment.
    previous_groups = globals().get('groups')
    print('Note: changing "groups" from',previous_groups,'to',tmp[2])
    features, labels, groups = tmp
else:
    # Fail here with a clear message instead of a NameError on `features` below.
    raise ValueError(
        'read_file returned %d values; expected 2 or 3' % len(tmp)
    )
print('features:')
print(features.head())
print(features.shape)
# generate train/test split (75/25, shuffled, stratified on the labels so both
# partitions keep the same outcome rate; seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                train_size=0.75,
                                                test_size=0.25,
                                                shuffle=True,
                                                random_state=random_state,
                                                stratify=labels
                                                )

In [None]:
from io_mimic import one_hot_encode_labels
# Build the development dataset (training features + outcome) and write it to CSV.
# Work on a copy: `df` gains the outcome column below, and X_train must stay
# label-free and un-encoded because the estimator is fitted on it later. Without
# the copy, `df` aliases X_train whenever one_hot_encode_labels operates in
# place or returns its input.
df = X_train.copy()
for col in ['ethnicity','insurance']:
    df = one_hot_encode_labels(df, col)
df['binary outcome'] = y_train
df.to_csv('data/mimic/development_dataset.csv',index=False)

In [None]:
# Show the last ten column names of the development frame.
df.columns[-10:]

# define the estimator

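We define an L1-penalized (lasso) logistic regression model. Preprocessing consists of median imputation and standard scaling for the numeric (float) features and one-hot encoding for the categorical features (`insurance`, `ethnicity`); all remaining columns are passed through unchanged.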

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
import numpy as np
from tempfile import mkdtemp
# Fresh temporary directory for this session, kept under the name `cachedir`.
cachedir = mkdtemp()

# L1-penalised logistic regression fitted with the saga solver.
base_model = LogisticRegression(penalty='l1', solver='saga', n_jobs=1)

# Column groups routed through the ColumnTransformer below: two named
# categorical columns, plus every float-typed column as numeric.
categorical_features = ['insurance', 'ethnicity']
numeric_features = list(X_train.select_dtypes('float').columns)
print('categorical features:',categorical_features)
print('numeric features:',numeric_features)

# Numeric branch: fill missing values with the median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Numeric and categorical branches are combined; columns belonging to neither
# group are passed through untouched, and output names carry no branch prefix.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Full estimator: preprocessing followed by the classifier.
est = make_pipeline(preprocessor, base_model)

In [None]:
# train model
########################################
# configure estimators
########################################
# Attributes to push onto the estimator (or any of its nested steps) before fitting.
setatts = {
    'random_state':random_state, 
}
np.random.seed(random_state)
# `est` is a Pipeline, which has no top-level `random_state` attribute, so a
# plain hasattr/setattr check silently skips the seed and the stochastic saga
# solver stays unseeded. Match each attribute against the nested parameter
# names instead (e.g. `logisticregression__random_state`) and apply every hit
# through set_params; a bare estimator is still covered by the exact-name match.
param_names = est.get_params(deep=True)
matched_params = {
    name: value
    for name in param_names
    for key, value in setatts.items()
    if name == key or name.endswith('__' + key)
}
if matched_params:
    est.set_params(**matched_params)
# print(40*'=','Evaluating ',est,' on ',dataset,40*'=',sep='\n')


print('X_train:',X_train.shape)
print('y_train:',y_train.shape)


################################################## 
# Fit models
################################################## 
print('training',est)
# Wall-clock timing of the fit only.
t0 = time.time()
# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
est.fit(X_train, y_train)
train_time = time.time() - t0
print('Training finished in',train_time,'seconds')

# measure disparities
- make a dataframe of model predictions and demographics
- run `measure_disparity.py` on the dataframe

In [None]:
# make dataframe
import json

# Demographic columns: every test-set column whose name mentions one of these keys.
demographic_keys = ['ethnicity', 'gender', 'insurance']
demographics = [
    name
    for name in X_test.columns
    if any(key in name for key in demographic_keys)
]

# Label encodings stored next to the dataset, used to decode integer codes.
with open('data/mimic/mimic4_admissions.csv.label_encodings.json','r') as fh:
    enc = json.load(fh)

# Decode on a copy so X_test keeps the encoded values the model expects.
X_nice = X_test.copy()
for demo_col in demographics:
    print(demo_col)
    if demo_col not in enc.keys():
        continue
    class_names = enc[demo_col]['classes_']
    print(class_names)
    print(X_nice[demo_col].unique())
    # Each stored code is used as a position into the class-name list.
    X_nice[demo_col] = X_nice[demo_col].apply(lambda code: class_names[code])

# One row per test sample: demographics, model outputs, unit weights, true outcome.
positive_scores = est.predict_proba(X_test)[:,1]
predicted_labels = est.predict(X_test)
df = pd.DataFrame(X_nice[demographics])
df['model prediction'] = positive_scores
df['model label'] = predicted_labels
df['sample weights'] = np.ones(len(X_test))
df['binary outcome'] = y_test

df.to_csv('lr_model_mimic4_admission.csv', index=False)

In [None]:
# Display the prediction frame that was just written to lr_model_mimic4_admission.csv.
df

# run measure_disparity.py on model output

In [None]:
# Run measure_disparity.py as a script on the saved model-output CSV.
# NOTE(review): the `-- --interactive` sequence looks unusual -- confirm how the
# script's argument parser is expected to receive it.
%run measure_disparity.py --dataset lr_model_mimic4_admission.csv -- --interactive

In [None]:
from measure_disparity import measure_disparity
# Same analysis through the Python API, keeping the returned result for inspection.
# Presumably a DataFrame of fairness metrics -- TODO confirm the return type.
df_fairness = measure_disparity('lr_model_mimic4_admission.csv')

In [None]:
# Display the result returned by measure_disparity.
df_fairness