In [None]:
import pandas as pd
# import modin.pandas as pd
import numpy as np
import pickle as pkl
from os import listdir, walk
from os.path import isfile, join
from dask.distributed import Client
import dask.dataframe as dd

import datetime

import json
import time

import matplotlib.pyplot as plt
import scienceplots
plt.style.use(['science', 'nature', 'no-latex'])
pd.options.mode.chained_assignment = None

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression


from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, recall_score, precision_score, average_precision_score, auc, brier_score_loss

import lightgbm as lgb


%matplotlib inline

# Load Data

In [None]:
# load main dataset
aam_df = pd.read_hdf('/data/adarsh/fda_project_data/aam_model_dataset.h5')

In [None]:
# Define a trauma level from the hospital indicator columns:
# jhh -> 1, sh -> 2, bmc -> 2. Assuming the indicators are 0/1, rows from
# hospitals not listed here get 0 (presumably meaning no trauma-center
# designation -- TODO confirm the intended meaning of level 0).
aam_df['trauma_level'] = aam_df.hospital_jhh * 1 + aam_df.hospital_sh * 2 + aam_df.hospital_bmc * 2 + 0

In [None]:
commorbidity_df = pd.read_csv("/data/adarsh/fda_project_data/study_cohort_commorbidity_features.csv")

In [None]:
# > 500 beds
aam_df['hospital_size_large'] = aam_df.hospital_jhh * 1

In [None]:
# split datasets based on hospital
hospital_ids = ['hospital_bmc', "hospital_hcgh", "hospital_jhh", "hospital_smh", "hospital_sh"]

# Make train/test splits by hospital

In [None]:
# Collapse the row-level data to one row per encounter: encounters that ever
# carry label == 1 keep their first row, all other encounters keep their last.
pos_ids = aam_df[aam_df.label == 1].enc_id.unique()
all_ids = aam_df.enc_id.unique()
neg_ids = np.array(list(set(all_ids) - set(pos_ids)))

event_rows = (
    aam_df[aam_df.enc_id.isin(pos_ids)]
    .groupby('enc_id', as_index=False)
    .nth(0)
)
non_event_rows = (
    aam_df[aam_df.enc_id.isin(neg_ids)]
    .groupby('enc_id', as_index=False)
    .nth(-1)
)
aam_df = pd.concat([event_rows, non_event_rows])

In [None]:
aam_df.sex.mean()

In [None]:
# Split every hospital's encounters into train and test partitions.
# Results are stored under '<hospital_id>_train' / '<hospital_id>_test'.
hospital_df_dict = {}
test_frac = 0.2

for hospital_id in hospital_ids:
    # rows whose indicator column for this hospital equals 1
    hospital_df = aam_df[aam_df[hospital_id] == 1]

    # split on encounter ids (fixed seed) rather than on rows
    enc_ids = hospital_df.enc_id.unique()
    train_ids, test_ids = train_test_split(
        enc_ids,
        test_size=test_frac,
        shuffle=True,
        random_state=1234,
    )

    hospital_train = hospital_df[hospital_df.enc_id.isin(train_ids)]
    hospital_test = hospital_df[hospital_df.enc_id.isin(test_ids)]

    hospital_df_dict[hospital_id + '_train'] = hospital_train
    hospital_df_dict[hospital_id + '_test'] = hospital_test

    print("Processed {}".format(hospital_id))

# Train models for each hospital

In [None]:
# scale continuous features so logistic regression and regularization behave well

continuous_features = ['anion_gap', 'bicarbonate_sq', 'glucose',
       'hematocrit_cu', 'lactate', 'bun_log', 'creatinine_log_sq', 'sodium',
       'troponin', 'wbc', 'nbp_dias_latest_sq','nbp_sys_instability', 'nbp_sys_latest_cu', 'heart_rate_latest_cu',
       'heart_rate_instability_log_sq', 'spo2_instability_log',
       'spo2_latest_logit_cu', 'spo2_worst_logit', 'resp_rate_instability_log',
       'temperature_instability_log_sq', 'temperature_latest_sq',
       'resp_rate_latest_cu', 'resp_rate_worst', 'gcs_latest',
       'anion_gap_bicarbonate_ratio', 'shock_index', 'los_log', 'age_log']
discrete_features = ['troponin_missing', 'sex', 'season_1', 'season_2', 'season_3', 'time_of_day_1',
       'time_of_day_2', 'time_of_day_3', 'ed_admit']# , 
                     # 'hospital_bmc','hospital_hcgh', 'hospital_jhh', 'hospital_sh', 'hospital_smh']

In [None]:
baseline_features = ['nbp_sys_instability', 'nbp_sys_latest_cu', 'heart_rate_latest_cu',
       'heart_rate_instability_log_sq', 'spo2_instability_log',
       'spo2_latest_logit_cu', 'spo2_worst_logit', 'resp_rate_instability_log',
       'temperature_instability_log_sq', 'temperature_latest_sq',
       'resp_rate_latest_cu', 'resp_rate_worst', 'gcs_latest', 'age_log', 'sex']

# news_proxy_features = ['resp_rate_worst', 'spo2_worst_logit', 'temperature_latest_sq', 'nbp_sys_latest_cu', 
#                        'heart_rate_latest_cu', 'gcs_latest']

# baseline_features = news_proxy_features + ['age_log', 'sex']


In [None]:
len(baseline_features), len(continuous_features + discrete_features)

In [None]:
from tqdm import tqdm

In [None]:
# train models for each hospital
import flaml.default

# gbm = flaml.default.LGBMClassifier()
# Per-hospital artifacts, all keyed by hospital id:
#   hospital_models          -- main model (logistic regression on all features)
#   hospital_scalers         -- StandardScaler fitted on that hospital's training split
#   hospital_baseline_models -- LightGBM baseline on `baseline_features`
hospital_models = {}
hospital_scalers = {}

hospital_baseline_models = {}

for hospital in tqdm(hospital_ids):

    train_df = hospital_df_dict[hospital + '_train'].copy()

    # Standardize the continuous features with statistics from this hospital's
    # training split only; the fitted scaler is stored so the exact same
    # transform can be applied to every test set later.
    scaler = StandardScaler()
    scaler.fit(train_df[continuous_features])
    train_df[continuous_features] = scaler.transform(train_df[continuous_features])
    train_X = train_df[continuous_features + discrete_features].values
    train_y = train_df.label.values

    # Main model. NOTE: the variable is still called `lgbm` for historical
    # reasons (an earlier revision used LightGBM here) but it holds a weakly
    # regularized (C=1e3) logistic regression.
    lgbm = LogisticRegression(random_state=1234, n_jobs=-1, verbose=10, C=1e3)
    lgbm.fit(train_X, train_y)

    # Baseline model, trained on the already-scaled `baseline_features` columns.
    # Verbosity is configured on the estimator only: `fit()` no longer accepts a
    # `verbose` keyword in LightGBM >= 4.0 (passing it raises TypeError), and
    # without an eval_set it had no effect in older versions either.
    baseline = lgb.LGBMClassifier(n_estimators=200, num_leaves=139, min_child_samples=8, learning_rate=0.05, verbose=10)
    baseline.fit(train_df[baseline_features].values, train_y)

    hospital_models[hospital] = lgbm
    hospital_scalers[hospital] = scaler
    hospital_baseline_models[hospital] = baseline
    print("Training {} done".format(hospital))
    

# For each model, make predictions on all test sets

In [None]:
# Ground-truth label vector of every hospital's held-out test split,
# keyed by hospital id.
hospital_test_y = {
    hospital: hospital_df_dict[hospital + '_test'].label.values
    for hospital in hospital_ids
}

In [None]:
# Cross-hospital evaluation: score every hospital's models on every hospital's
# test split. All result dicts are keyed by (model_hospital, test_hospital).
hospital_test_preds, hospital_baseline_test_preds = {}, {}
auroc_dict, auprc_dict = {}, {}
auroc_baseline_dict, auprc_baseline_dict = {}, {}

for model_hospital in hospital_ids:
    model = hospital_models[model_hospital]
    baseline_model = hospital_baseline_models[model_hospital]
    # test data must be scaled with the scaler fitted alongside the model
    scaler = hospital_scalers[model_hospital]

    for test_hospital in hospital_ids:
        key = (model_hospital, test_hospital)

        test_df = hospital_df_dict[test_hospital + '_test'].copy()
        test_df[continuous_features] = scaler.transform(test_df[continuous_features])
        test_X = test_df[continuous_features + discrete_features].values
        test_y = test_df.label.values

        # probability of the last class, for the main and the baseline model
        test_preds = model.predict_proba(test_X)[:, -1]
        baseline_test_preds = baseline_model.predict_proba(
            test_df[baseline_features].values
        )[:, -1]

        hospital_test_preds[key] = test_preds
        hospital_baseline_test_preds[key] = baseline_test_preds

        # discrimination metrics for the main model ...
        auroc_dict[key] = roc_auc_score(test_y, test_preds)
        auprc_dict[key] = average_precision_score(test_y, test_preds)

        # ... and for the baseline
        auroc_baseline_dict[key] = roc_auc_score(test_y, baseline_test_preds)
        auprc_baseline_dict[key] = average_precision_score(test_y, baseline_test_preds)

    print("Done processing {}".format(model_hospital))

In [None]:
auroc_baseline_dict

In [None]:
auroc_dict

# Pick a source hospital and collect predictions on pooled test data

In [None]:
hosp = 'hospital_hcgh' # worst transfer
scaler = hospital_scalers[hosp]

# One frame per test hospital: features scaled with the source hospital's
# scaler, plus the source hospital's stored main and baseline predictions.
hcgh_test_dfs = []
for test_hospital in hospital_ids:
    pair = (hosp, test_hospital)

    test_df = hospital_df_dict[test_hospital + '_test'].copy()
    test_df[continuous_features] = scaler.transform(test_df[continuous_features])
    test_df = test_df.assign(
        aam_prediction=hospital_test_preds[pair],
        baseline_prediction=hospital_baseline_test_preds[pair],
    )

    hcgh_test_dfs.append(test_df)

In [None]:
hcgh_combined_test_df = pd.concat(hcgh_test_dfs)

In [None]:
# hcgh_combined_test_df.to_hdf('/data/adarsh/fda_project_data/hcgh_combined_test_df_7_11_2022.h5', key='s', mode='w')

In [None]:
hcgh_combined_test_df

# AFISP step 3: Retraining Diagnostic [Run after afisp notebook]

## Training Size diagnostic

In [None]:
train_n = hospital_df_dict['hospital_hcgh_train'].shape[0]

In [None]:
def get_diagnostic1_results(rule, num_points=5, n_trials=10, random_state=None):
    """Training-size diagnostic for the subgroup selected by ``rule``.

    The hcgh training split, with every row matching ``rule`` removed, is
    augmented with ``amt`` rows sampled from the pool of rule-matching training
    rows of all hospitals. A logistic regression is refit ``n_trials`` times
    for each ``amt`` on an evenly spaced grid and evaluated on the pooled test
    data, both on the full population and on the rule-matching subgroup.

    Relies on notebook-level state: ``hospital_df_dict``, ``hospital_ids``,
    ``hospital_scalers``, ``commorbidity_df``, ``hcgh_combined_test_df``,
    ``continuous_features`` and ``discrete_features``.

    Parameters
    ----------
    rule : str
        pandas query/eval expression over the training columns joined with
        ``commorbidity_df`` that defines the subgroup.
    num_points : int
        Number of training-size grid points, spaced linearly from 50 to
        (pool size - 1). The grid is only sensible when the pool holds more
        than 50 rows.
    n_trials : int
        Number of resampled refits per grid point.
    random_state : int or None
        Seed for the subgroup sampling. ``None`` (default) keeps the sampling
        unseeded, as before; pass an int for reproducible results.

    Returns
    -------
    tuple
        ``(diag_test_df, data_amounts, init_n, result_aucs, result_sub_aucs,
        result_sub_aucs_lower, result_sub_aucs_upper, result_full_aucs_lower,
        result_full_aucs_upper)``. ``diag_test_df`` is the pooled test frame
        joined with ``commorbidity_df``; ``init_n`` is the number of
        rule-matching rows in the hcgh training split; the ``result_*`` dicts
        map each training amount to the mean AUROC over trials or to the
        2.5th / 97.5th percentile of the per-trial AUROCs.

    Raises
    ------
    ValueError
        If joining ``commorbidity_df`` changes the number of pooled test rows
        (e.g. duplicated ``enc_id``), since predictions could then no longer
        be aligned with labels.
    """
    model_hospital = 'hospital_hcgh'
    feature_cols = continuous_features + discrete_features
    scaler = hospital_scalers[model_hospital]
    # One generator shared by all draws so a single seed fixes the whole run.
    rng = np.random.RandomState(random_state)

    # Pool to sample from: rule-matching training rows from every hospital.
    con_df = pd.concat([hospital_df_dict[x + '_train'] for x in hospital_ids])
    merged_df = con_df.merge(commorbidity_df, on='enc_id', how='left')

    init_n = merged_df.query('hospital_hcgh == 1').query(rule).shape[0]
    diag_df = merged_df.query(rule)
    max_n = diag_df.shape[0] - 1

    # training size grid
    data_amounts = np.round(np.linspace(50, max_n, num=num_points)).astype(int)
    print(data_amounts, init_n, max_n)

    # Pooled test frame with comorbidity columns, used for labels and for
    # locating the subgroup rows.
    diag_test_df = hcgh_combined_test_df.merge(commorbidity_df, on='enc_id', how='left')
    if diag_test_df.shape[0] != hcgh_combined_test_df.shape[0]:
        raise ValueError(
            "Merging commorbidity_df changed the number of test rows "
            "({} -> {}); check for duplicated enc_id values.".format(
                hcgh_combined_test_df.shape[0], diag_test_df.shape[0]))
    test_labels = diag_test_df.label.values
    # Boolean mask instead of index labels used as positions.
    sub_mask = np.asarray(diag_test_df.eval(rule), dtype=bool)

    # Loop-invariant: hcgh training rows outside the subgroup, already scaled.
    base_train_df = hospital_df_dict[model_hospital + '_train'].copy()
    # ignore rows corresponding to the rule
    in_rule = base_train_df.merge(commorbidity_df, on='enc_id', how='left').eval(rule).values
    base_train_df = base_train_df[~in_rule]
    base_train_df[continuous_features] = scaler.transform(base_train_df[continuous_features])

    # Loop-invariant: scaled test matrices, in the same hospital order that was
    # used to build hcgh_combined_test_df so predictions line up with its rows.
    test_matrices = []
    for test_hospital in hospital_ids:
        test_df = hospital_df_dict[test_hospital + '_test'].copy()
        test_df[continuous_features] = scaler.transform(test_df[continuous_features])
        test_matrices.append(test_df[feature_cols].values)

    result_aucs = {}
    result_sub_aucs = {}
    result_sub_aucs_lower = {}
    result_sub_aucs_upper = {}
    result_full_aucs_lower = {}
    result_full_aucs_upper = {}
    for amt in data_amounts:

        preds_by_train_amount = np.zeros((n_trials, hcgh_combined_test_df.shape[0]))

        for trial in range(n_trials):
            train_df = base_train_df
            if amt > 0:
                # sample rows from the subgroup
                sample = diag_df.sample(n=amt, random_state=rng)
                sample[continuous_features] = scaler.transform(sample[continuous_features])
                train_df = pd.concat([base_train_df, sample])
            train_X = train_df[feature_cols].values
            train_y = train_df.label.values

            model = LogisticRegression(random_state=1234, n_jobs=-1, C=1e3)
            model.fit(train_X, train_y)

            # predict hospital by hospital and stitch together in pooled order
            preds_by_train_amount[trial] = np.concatenate(
                [model.predict_proba(test_X)[:, -1] for test_X in test_matrices]
            )
        # finished training n_trials times

        full_aucs = np.array([roc_auc_score(test_labels, preds_by_train_amount[i]) for i in range(n_trials)])
        sub_aucs = np.array([roc_auc_score(test_labels[sub_mask], preds_by_train_amount[i][sub_mask]) for i in range(n_trials)])

        result_aucs[amt] = np.mean(full_aucs)
        result_sub_aucs[amt] = np.mean(sub_aucs)
        result_sub_aucs_lower[amt] = np.percentile(sub_aucs, 2.5)
        result_sub_aucs_upper[amt] = np.percentile(sub_aucs, 97.5)
        result_full_aucs_lower[amt] = np.percentile(full_aucs, 2.5)
        result_full_aucs_upper[amt] = np.percentile(full_aucs, 97.5)
        print("N = {}, Full AUC = {:.3f}, Sub AUC = {:.3f}".format(amt, result_aucs[amt], result_sub_aucs[amt]))

    return diag_test_df, data_amounts, init_n, result_aucs, result_sub_aucs, result_sub_aucs_lower, result_sub_aucs_upper, result_full_aucs_lower, result_full_aucs_upper



In [None]:
rule = "anemia == 1 and nonspecific_lung_disease >= 1"

In [None]:
diag_test_df, data_amounts, init_n, full_aucs, sub_aucs, sub_lower, sub_upper, full_lower, full_upper = get_diagnostic1_results(rule, n_trials=10, num_points=7)

In [None]:
# NOTE(review): this call repeats the one in the previous cell verbatim. It
# re-runs the whole diagnostic and overwrites the earlier results -- confirm
# whether the second run is intentional or a leftover duplicate cell.
diag_test_df, data_amounts, init_n, full_aucs, sub_aucs, sub_lower, sub_upper, full_lower, full_upper = get_diagnostic1_results(rule, n_trials=10, num_points=7)

In [None]:
# Full-population AUROC versus number of subgroup training rows: the line is
# the mean over trials, the shaded band spans the 2.5th-97.5th percentiles of
# the per-trial AUROCs returned by get_diagnostic1_results.
plt.plot(data_amounts, list(full_aucs.values()), '--.')
plt.fill_between(data_amounts, list(full_lower.values()), list(full_upper.values()), alpha=0.25)

plt.ylabel('Full Population AUROC')
plt.xlabel('Subgroup # of Training Examples')
# vertical marker: number of rule-matching rows in the original hcgh training split
plt.axvline(x=init_n, ls='--', color='#fb8072', label='Original # of Training Samples')
# NOTE(review): 0.986 is hard-coded; presumably the original hcgh model's AUROC
# on the pooled test data -- confirm, or compute it instead of pasting the value.
plt.axhline(y=0.986, ls='--', color='#b3de69', label='Original Full Population AUROC')
# NOTE(review): fixed x-range; grid points above 2500 would be cut off.
plt.xlim(0, 2500)
plt.legend()
# writes into 'figs/' relative to the working directory; the directory is
# expected to exist already
plt.savefig('figs/pub_fullgroup_perf_anemia_lung_disease.pdf', dpi=360)

In [None]:
# Subgroup AUROC versus number of subgroup training rows: the line is the mean
# over trials, the shaded band spans the 2.5th-97.5th percentiles of the
# per-trial subgroup AUROCs returned by get_diagnostic1_results.
plt.plot(data_amounts, list(sub_aucs.values()), '--.')
plt.fill_between(data_amounts, list(sub_lower.values()), list(sub_upper.values()), alpha=0.25)
plt.ylabel('Subgroup AUROC')
plt.xlabel('Subgroup # of Training Examples')
# vertical marker: number of rule-matching rows in the original hcgh training split
plt.axvline(x=init_n, ls='--', color='#fb8072', label='Original # of Training Samples')
# NOTE(review): fixed x-range; grid points above 2500 would be cut off.
plt.xlim(0, 2500)
plt.legend(loc='lower right')
# writes into 'figs/' relative to the working directory; the directory is
# expected to exist already
plt.savefig('figs/pub_subgroup_perf_anemia_lung_disease.pdf', dpi=360)