In [None]:
""" 
MIMIC experiment based on the dataset prepared by prepare_admid_diagnosis.ipynb

The features are the ICD codes; the labels indicate whether the patient's diagnoses contain the target diagnosis (i.e., one ICD code).

In this notebook, we want to run over all responses
"""

In [None]:
from IPython.display import Image
# Show the pipeline figure (EHR_MIMIC_pipeline.png). The Image object is the
# cell's last expression, so the notebook renders it as the cell output.
Image(filename='../../outputs/pipeline_figs/EHR_MIMIC_pipeline.png')

In [None]:
import os
import sys

# Add the project root to the import path (presumably where the `common` and
# `mimic_common` modules imported in this cell live -- confirm).
# The root is derived from the current user's home directory rather than a
# hard-coded /home/<user> path, the same way output_dir is built in this notebook.
_project_root = os.path.join(os.path.expanduser("~"), "deep_patient", "")
if _project_root not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(_project_root)

from ast import literal_eval
from common import *
from mimic_common import *
from multiprocess import Pool
import os
import random
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import time


In [None]:
# Folder under the user's home directory that this notebook's outputs go to.
home_dir = os.path.expanduser("~")
output_dir = os.path.join(home_dir, "deep_patient/outputs/mimic")
print(f"Will save outputs to {output_dir}")

In [None]:
""" 
Read in the original dataframe
"""
admid_diagnosis_df = pd.read_csv(
    "../../outputs/mimic/ADMID_DIAGNOSIS.csv",
    index_col=0,
    header=0,
    # the 'ICD codes' cells are parsed from their text form with literal_eval
    converters={'ICD codes': literal_eval},
)
print(admid_diagnosis_df)

""" 
Print the number of rows for each (gender, label) combination
"""
# Same order as before: female 0, female 1, male 0, male 1.
for group_name, gender_value in (("female", 'F'), ("male", 'M')):
    for label_value in (0, 1):
        in_group = (admid_diagnosis_df['label'] == label_value) & (admid_diagnosis_df['gender'] == gender_value)
        # counting the True entries of the mask equals the row count of the filtered frame
        print(f"{group_name} label {label_value}", int(in_group.sum()))


In [None]:
"""
Fit PCA models and generate representations for targets and sources
"""

def custom_train_reps(target_features, source_features, n_components, pca_explain=False):
    """ 
    Generate target and source representations, each with its own independently fitted PCA

    :param target_features: features of the target domain, passed to PCA.fit_transform
    :param source_features: features of the source domain, passed to PCA.fit_transform
    :param int n_components: number of components requested from each PCA
    :param bool pca_explain: when True, print the cumulative explained variance ratio of both PCAs

    :returns: target representations, source representations
    """
    # The target PCA is fitted first, then the source PCA.
    target_pca = PCA(n_components=n_components)
    target_reps = target_pca.fit_transform(target_features)

    source_pca = PCA(n_components=n_components)
    source_reps = source_pca.fit_transform(source_features)

    if not pca_explain:
        return target_reps, source_reps

    # Source is reported before target.
    reports = (
        ("Cummulative variance explained by the source PCA is:", source_pca),
        ("Cummulative variance explained by the target PCA is:", target_pca),
    )
    for message, fitted_pca in reports:
        print(message, np.cumsum(fitted_pca.explained_variance_ratio_))

    return target_reps, source_reps

In [21]:
def multi_proc_parallel(score_path, n_components, label_code, custom_train_reps,
        male_count, female_count, iteration=20, n_workers=10):
    """ 
    Run entire_proc_binary `iteration` times in a pool of worker processes and
    write the collected scores to score_path as a csv file

    The dataframe is read from the notebook-level variable admid_diagnosis_df instead of
    being passed in: the code cannot be parallelized when the dataframe (full_df) is
    passed as a parameter. Hence, this function cannot be put into mimic_common.py

    :param str score_path: path of the csv file the scores are written to
    :param n_components: forwarded unchanged to entire_proc_binary
    :param label_code: forwarded unchanged to entire_proc_binary
    :param custom_train_reps: forwarded unchanged to entire_proc_binary
    :param male_count: forwarded unchanged to entire_proc_binary
    :param female_count: forwarded unchanged to entire_proc_binary
    :param int iteration: number of times entire_proc_binary is run
    :param int n_workers: number of worker processes in the pool

    :returns: list of the per-iteration results, in iteration order
    """

    # note: reading the csv here cannot be used for parallelization either
    # admid_diagnosis_df = pd.read_csv("../../outputs/mimic/ADMID_DIAGNOSIS.csv", index_col=0, header=0, converters={'ICD codes': literal_eval})

    score_columns = ['target_accuracy', 'target_precision', 'target_recall', 'target_f1',
                     'source_accuracy', 'source_precision', 'source_recall', 'source_f1',
                     'trans_source_accuracy', 'trans_source_precision', 'trans_source_recall', 'trans_source_f1']

    def iteration_wrapper(iter_index):
        """ 
        Wrapper function for one iteration, returns result statistics, for parallel computing

        :param int iter_index: the current iteration (not used by the computation itself)
        """
        # NOTE(review): if the workers are created by fork they may inherit the same
        # global random state; confirm that entire_proc_binary's sampling does not
        # repeat across workers.
        return entire_proc_binary(n_components, label_code, admid_diagnosis_df, custom_train_reps, male_count, female_count)

    # Use the pool as a context manager so the worker processes are released when
    # the map is done (or fails); previously every call left its workers alive.
    with Pool(n_workers) as pool:
        res = pool.map(iteration_wrapper, range(iteration))

    res_df = pd.DataFrame(res, columns=score_columns)
    res_df.to_csv(score_path, index=False, header=True)
    return res



In [22]:
""" 
Run the entire proc for all response (i.e., label_code) 
Responses are selected by select_codes.ipynb and saved in ../../outputs/mimic/selected_summary_mimic.csv
"""

n_components = 50
male_count = 120
female_count = 100
max_label_codes = 50  # only the first 50 selected label codes are processed
label_code_path = os.path.join(output_dir, "selected_summary_mimic.csv")
label_code_df = pd.read_csv(label_code_path, header=0, index_col=None)
# NOTE(review): read_csv infers the dtype of 'ICD code'; if every code is purely
# numeric the column is parsed as integers and leading zeros (e.g. "00845") would
# be dropped -- confirm against the csv and against what entire_proc_binary expects.
label_codes = list(label_code_df['ICD code'])[:max_label_codes]
for label_code in label_codes:
    # perf_counter is a monotonic clock meant for measuring elapsed time;
    # time.time() can jump if the system clock is adjusted during a long run
    start_time = time.perf_counter()
    print(f"label code {label_code} started")
    # one score file per label code
    score_path = os.path.join(output_dir, f"exp3_{label_code}_score.csv")
    multi_proc_parallel(score_path, n_components, label_code, custom_train_reps,
            male_count, female_count, iteration=100)
    end_time = time.perf_counter()
    print(f"runtime for {label_code} is: {end_time-start_time}")

label code 1983 started




runtime for 1983 is: 408.32729148864746
label code 1985 started




runtime for 1985 is: 241.23037886619568
label code 29680 started




runtime for 29680 is: 246.5484299659729
label code 30000 started




runtime for 30000 is: 246.5361843109131
label code 3004 started




runtime for 3004 is: 247.4682068824768
label code 30390 started




runtime for 30390 is: 244.3283565044403
label code 30391 started




runtime for 30391 is: 241.5486135482788
label code 2720 started




runtime for 2720 is: 244.70529961585999
label code 2724 started




runtime for 2724 is: 245.8808662891388
label code 2749 started




runtime for 2749 is: 246.87364530563354
label code 2760 started




runtime for 2760 is: 243.5849404335022
label code 2761 started




runtime for 2761 is: 250.28563237190247
label code 2762 started




runtime for 2762 is: 242.04327607154846
label code 2763 started




runtime for 2763 is: 247.7518389225006
label code 27652 started




runtime for 27652 is: 240.27137899398804
label code 2767 started




runtime for 2767 is: 247.7740261554718
label code 2768 started




runtime for 2768 is: 250.94560718536377
label code 27800 started




runtime for 27800 is: 248.02069449424744
label code 27801 started




runtime for 27801 is: 245.51006197929382
label code 2800 started




runtime for 2800 is: 249.53920793533325
label code 2809 started




runtime for 2809 is: 239.54829931259155
label code 25050 started




runtime for 25050 is: 248.2491328716278
label code 25060 started




runtime for 25060 is: 247.16688346862793
label code 25080 started




runtime for 25080 is: 246.4805109500885
label code 2536 started




runtime for 2536 is: 247.0260317325592
label code 3481 started




runtime for 3481 is: 355.99823451042175
label code 34830 started




runtime for 34830 is: 501.8996698856354
label code 3484 started




runtime for 3484 is: 470.29673624038696
label code 3485 started




In [None]:
# label_code = "00845"
# n_components = 50
# male_count = 120
# female_count = 100
# score_path = os.path.join(output_dir, f"exp3_{label_code}_score.csv")
# multi_proc(score_path, n_components, label_code, admid_diagnosis_df, custom_train_reps, \
#                  male_count, female_count, iteration=100)

In [None]:
# box_plot(score_path, filter=False)

In [None]:
# hist_plot(score_path, filter=False)