In [None]:
""" 
MIMIC experiment based on the dataset prepared by prepare_admid_diagnosis.ipynb

The features are the ICD codes; the label indicates whether the patient's diagnoses contain the target diagnosis (i.e., one ICD code)
"""

In [None]:
# Display the saved pipeline figure (EHR_MIMIC_pipeline.png) inline as this cell's output
from IPython.display import Image
Image(filename='../../outputs/pipeline_figs/EHR_MIMIC_pipeline.png')

In [None]:
import sys
sys.path.append("/home/wanxinli/EHR-OT/")

from common import *
from mimic_common import *
from multiprocess import Pool
from ast import literal_eval
import random
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score


In [None]:
def select_df(df, label_code, male_count, female_count):
    """ 
    Select rows of the dataframe df with a balanced number of labels for males and females.
    For each gender, exactly the requested number of label-0 rows and the same number of
    label-1 rows are kept; the surplus rows are dropped at random (using the `random` module).

    The 'label' column is (re)generated: 1 if label_code is among the row's 'ICD codes', else 0.
    After selection, label_code is removed from the 'ICD codes' of the returned rows so that
    the label cannot be read off the features.

    The input dataframe and the code lists stored in it are never modified, so the same
    dataframe can safely be passed to this function again.

    Rows whose gender is neither 'F' nor 'M' are never dropped.

    :param Dataframe df: the dataframe to select samples from; needs a 'gender' column and \
        an 'ICD codes' column holding a collection of codes per row
    :param str label_code: the ICD code for determining labels. This code is removed from the ICD codes.
    :param int male_count: the number of label-1 rows and the number of label-0 rows to keep for males
    :param int female_count: the number of label-1 rows and the number of label-0 rows to keep for females

    :returns: a new dataframe with the selected rows, a fresh 'label' column and \
        label_code removed from 'ICD codes'
    :raises ValueError: if a count is negative or a gender/label group has fewer rows than requested
    """

    # drop() returns a new frame (also when there is no 'label' column), so the
    # caller's dataframe never receives the regenerated label column
    df = df.drop(columns=['label'], errors='ignore')
    df['label'] = [1 if label_code in codes else 0 for codes in df['ICD codes']]

    # Groups are sampled in the order female-0, male-0, female-1, male-1 so that the
    # sequence of random draws is fixed for a given random seed
    delete_indices = []
    for label in (0, 1):
        for gender, keep_count in (('F', female_count), ('M', male_count)):
            in_group = (df['label'] == label) & (df['gender'] == gender)
            group_indices = df[in_group].index.tolist()
            if not 0 <= keep_count <= len(group_indices):
                raise ValueError(
                    f"cannot keep {keep_count} rows with gender {gender!r} and label {label}: "
                    f"{len(group_indices)} rows available"
                )
            delete_indices.extend(random.sample(group_indices, len(group_indices) - keep_count))

    df = df.drop(delete_indices, axis=0)

    # Build new code lists instead of calling list.remove(): the list objects are shared
    # with the caller's dataframe, and an in-place removal would silently turn the
    # selected label-1 rows into label-0 rows on the next call
    cleaned_codes = df['ICD codes'].map(lambda codes: [code for code in codes if code != label_code])
    df = df.assign(**{'ICD codes': cleaned_codes})

    return df


In [None]:
# Read in the original dataframe
admid_diagnosis_df = pd.read_csv(
    "../../outputs/mimic/ADMID_DIAGNOSIS.csv",
    index_col=0,
    header=0,
    converters={'ICD codes': literal_eval},
)
print(admid_diagnosis_df)

# Print number of patients for each category
print("female label 0",
      ((admid_diagnosis_df['label'] == 0) & (admid_diagnosis_df['gender'] == 'F')).sum())
print("female label 1",
      ((admid_diagnosis_df['label'] == 1) & (admid_diagnosis_df['gender'] == 'F')).sum())
print("male label 0",
      ((admid_diagnosis_df['label'] == 0) & (admid_diagnosis_df['gender'] == 'M')).sum())
print("male label 1",
      ((admid_diagnosis_df['label'] == 1) & (admid_diagnosis_df['gender'] == 'M')).sum())

# Select subset of the original dataframe
male_count = 120
female_count = 50
label_code = "45620"
admid_diagnosis_select_df = select_df(
    admid_diagnosis_df,
    label_code,
    male_count=male_count,
    female_count=female_count,
)

# Print number of patients for each category after selection
print("female label 0",
      ((admid_diagnosis_select_df['label'] == 0) & (admid_diagnosis_select_df['gender'] == 'F')).sum())
print("female label 1",
      ((admid_diagnosis_select_df['label'] == 1) & (admid_diagnosis_select_df['gender'] == 'F')).sum())
print("male label 0",
      ((admid_diagnosis_select_df['label'] == 0) & (admid_diagnosis_select_df['gender'] == 'M')).sum())
print("male label 1",
      ((admid_diagnosis_select_df['label'] == 1) & (admid_diagnosis_select_df['gender'] == 'M')).sum())

# Dataframe after selection
print(admid_diagnosis_select_df)

# Plot code distribution
plot_code_distn(admid_diagnosis_select_df)

In [None]:
"""
Fit separate PCA models on the targets and the sources and generate their representations
"""

def custom_train_reps(target_features, source_features, n_components, pca_explain=False):
    """ 
    Generate representations by fitting one PCA on the target features and an
    independent PCA on the source features

    :param target_features: features of the targets, handed to PCA.fit_transform
    :param source_features: features of the sources, handed to PCA.fit_transform
    :param int n_components: number of components requested from each PCA
    :param bool pca_explain: print the cumulative explained variance ratio of both PCAs

    :returns: target representations, source representations
    """

    def fit_pca(features):
        # fit a fresh PCA and return both the fitted model and the projected features
        model = PCA(n_components=n_components)
        return model, model.fit_transform(features)

    # the target PCA is fitted before the source PCA
    target_model, target_reps = fit_pca(target_features)
    source_model, source_reps = fit_pca(source_features)

    if not pca_explain:
        return target_reps, source_reps

    print("Cummulative variance explained by the source PCA is:",
          np.cumsum(source_model.explained_variance_ratio_))
    print("Cummulative variance explained by the target PCA is:",
          np.cumsum(target_model.explained_variance_ratio_))
    return target_reps, source_reps

In [None]:
# One iteration of the whole procedure, printing the variance explained by the PCAs
n_components = 50
entire_proc(
    n_components,
    label_code,
    admid_diagnosis_df,
    custom_train_reps,
    male_count,
    female_count,
    pca_explain=True,
)

In [None]:
def multi_proc_parallel(score_path, n_components, label_code, custom_train_reps, \
        male_count, female_count, iteration=20, processes=32):
    """ 
    Run entire_proc `iteration` times in a pool of worker processes and write the
    collected scores to score_path as csv.

    Code cannot be parallelized when passing the dataframe (full_df) as a parameter,
    so the dataframe is taken from the notebook-level variable admid_diagnosis_df.
    Hence, this function cannot be put into mimic_common.py

    :param str score_path: path of the csv file the scores are written to
    :param int n_components: forwarded to entire_proc
    :param str label_code: forwarded to entire_proc
    :param function custom_train_reps: forwarded to entire_proc
    :param int male_count: forwarded to entire_proc
    :param int female_count: forwarded to entire_proc
    :param int iteration: the number of times entire_proc is run
    :param int processes: the number of worker processes in the pool

    :returns: list holding the result of entire_proc for every iteration
    """

    # note: the following line cannot be used for parallelization either
    # admid_diagnosis_df = pd.read_csv("../../outputs/mimic/ADMID_DIAGNOSIS.csv", index_col=0, header=0, converters={'ICD codes': literal_eval})

    def iteration_wrapper(cur_iter):
        """ 
        Wrapper function for one iteration, returns result statistics, for parallel computing

        :param int cur_iter: the current iteration
        """
        print(f"iteration: {cur_iter}\n")
        return entire_proc(n_components, label_code, admid_diagnosis_df, custom_train_reps, male_count, female_count)

    # NOTE(review): if the workers are forked, they presumably all start from the parent's
    # numpy global random state - confirm entire_proc does not rely on it for run-to-run variation
    # The context manager shuts the pool down even if an iteration raises,
    # so no worker processes are left behind
    with Pool(processes) as p:
        res = p.map(iteration_wrapper, range(iteration))

    res_df = pd.DataFrame(res, columns = ['target_accuracy', 'target_f1', 'source_accuracy', 'source_f1', 'trans_source_accuracy', 'trans_source_f1'])
    res_df.to_csv(score_path, index=False, header=True)
    return res



In [None]:
# Run the experiment 100 times in parallel; the scores are saved to score_path
score_path = "../../outputs/mimic/exp_2_score.csv"

multi_proc_parallel(
    score_path,
    n_components,
    label_code,
    custom_train_reps,
    male_count,
    female_count,
    iteration=100,
)

In [None]:
""" 
Not in use, since we want to run jobs in parallel
"""

""" 
Run the entire procedure multiple times
"""

# score_path = "../../outputs/mimic/exp_2_score.csv"
# res = multi_proc(score_path, n_components, label_code, admid_diagnosis_df, custom_train_reps, \
#                  male_count, female_count, iteration=100)

In [None]:
# Presumably draws box plots of the scores saved at score_path (box_plot is not defined in this notebook) - TODO confirm what filter=False disables
box_plot(score_path, filter=False)

In [None]:
# Presumably draws histograms of the scores saved at score_path (hist_plot is not defined in this notebook) - TODO confirm what filter=False disables
hist_plot(score_path, filter=False)