In [1]:
""" 
Run deep patient on MIMIC dataset
"""
# Python version 3.8.10 and numpy version 1.22.3 works
import sys 
print(sys.version)  # record the interpreter version in the cell output
import json
import numpy as np
print(np.__version__)  # record the numpy version in the cell output

import matplotlib.pyplot as plt
import pandas as pd

# Put the synthetic-experiment directory on the import path before the star-import below.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a machine where it exists.
sys.path.insert(0, '/home/wanxinli/deep_patient/synthetic_exp')
# NOTE(review): presumably the source of trans_female2male and cal_stats_binary, which later
# cells call without defining -- confirm; the star-import hides where those names come from.
from common import *

# Redundant re-import: sys is already imported at the top of this cell.
import sys  
# Put the repository root on the import path, presumably so the deep_patient package resolves.
sys.path.insert(0, '/home/wanxinli/deep_patient')
from deep_patient.sda import SDA

# Path settings; neither name is referenced by the cells visible in this part of the notebook.
base_dir = "/home/wanxinli/deep_patient"
data_dir = "outputs/mimic"


3.8.10 (default, Jun 22 2022, 20:18:18) 
[GCC 9.4.0]
1.22.3




In [2]:
# Read the aggregated patient table: the first CSV column becomes the index and
# the JSON-encoded "ICD codes" column is decoded into Python objects.
patient_agg_df = pd.read_csv(
    "../mimiciii/PATIENTS_50_AGG.csv",
    index_col=0,
    header=0,
    converters={"ICD codes": json.loads},
)

# Split the rows by the 'gender' column.
male_df = patient_agg_df[patient_agg_df['gender'] == 'M']
female_df = patient_agg_df[patient_agg_df['gender'] == 'F']

# Keep only the per-patient code sequences of each group.
male_seqs, female_seqs = male_df['ICD codes'], female_df['ICD codes']

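Use 100 as the default feature dimension (number of ICD codes kept per patient) for both males and females, as decided from the previous plots. Note that the runs below override this default and use 20 codes per patient.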

In [3]:
def get_mimic_data(data_path = "../mimiciii/PATIENTS_AGG.csv", n_feaures = 100):

    """ 
    Get male sequences, male labels, female sequences and female labels for MIMIC dataset

    The CSV at data_path is read with its first column as the index. Rows are split on
    the 'gender' column ('M' / 'F'), labels come from the 'expire' column, and the
    'ICD codes' column is decoded with json.loads into one code list per patient.
    Each code list is truncated to its first n_feaures entries, or right-padded with 0
    up to n_feaures entries, so every row has the same width.

    :param data_path: the path to the file storing processed patient information with the codes and labels
    :param n_feaures: number of codes (features) per patient, 100 is the default number.
        The misspelled name is kept on purpose: existing callers pass it by keyword.
    :returns: male_seqs, male_labels, female_seqs, female_labels as numpy arrays.
        Sequence arrays have one row per patient and n_feaures columns, including the
        (0, n_feaures) case when a gender has no patients.
    :raises ValueError: if n_feaures is negative
    """

    if n_feaures < 0:
        raise ValueError("n_feaures must be non-negative, got {}".format(n_feaures))

    patient_agg_df = pd.read_csv(
        data_path, index_col=0, header=0, converters={"ICD codes": json.loads})
    male_df = patient_agg_df.loc[patient_agg_df['gender'] == 'M']
    female_df = patient_agg_df.loc[patient_agg_df['gender'] == 'F']

    # get male labels and female labels
    male_labels = np.array(male_df['expire'])
    female_labels = np.array(female_df['expire'])

    def pad_seqs(seqs):
        """ 
        Truncate or zero-pad every code list to exactly n_feaures entries
        (n_feaures comes from the enclosing function).

        New lists are built, so the lists held by the DataFrame are not modified.

        :returns: array with one row per patient and n_feaures codes per row
        """
        padded = [
            # the slice keeps at most n_feaures codes; the multiplier is <= 0 (no padding)
            # whenever the patient already has n_feaures codes or more
            list(seq[:n_feaures]) + [0] * (n_feaures - len(seq))
            for seq in seqs
        ]
        if not padded:
            # np.array([]) would have shape (0,); keep the feature axis so that
            # downstream code can still read shape[1]
            return np.zeros((0, n_feaures))
        return np.array(padded)

    # get male sequences and female sequences
    male_seqs = pad_seqs(male_df['ICD codes'])
    female_seqs = pad_seqs(female_df['ICD codes'])

    return male_seqs, male_labels, female_seqs, female_labels


In [4]:
""" 
Get the proportion of expired patients for males and females
"""

# Load the 20-codes-per-patient view of the data used by the rest of the notebook.
male_seqs, male_labels, female_seqs, female_labels = get_mimic_data(
    data_path="../mimiciii/PATIENTS_50_AGG.csv",
    n_feaures=20,
)

# Fraction of patients whose 'expire' label equals 1, males first and then females.
print("print the percent of expired patient for males and females")
male_expired_count = np.count_nonzero(male_labels == 1)
print(male_expired_count/len(male_labels))
female_expired_count = np.count_nonzero(female_labels == 1)
print(female_expired_count/len(female_labels))


print the percent of expired patient for males and females
0.4743041702289351
0.5108340394625348


In [5]:
# Sanity check: sequences should be (n_patients, n_codes) and labels (n_patients,).
print("print shapes")
print("male_seqs shape is:", np.shape(male_seqs))
print("male_labels shape is:", np.shape(male_labels))
print("female_seqs shape is:", np.shape(female_seqs))
print("female_labels shape is:", np.shape(female_labels))

print shapes
male_seqs shape is: (10527, 20)
male_labels shape is: (10527,)
female_seqs shape is: (8261, 20)
female_labels shape is: (8261,)


In [6]:
"""
Train deep patient model and generate representations for males and females
"""

def custom_train_reps(male_seqs, female_seqs):
    """ 
    Train one SDA model per gender and encode each group with its own model

    The male model is built, trained and applied first, then the female model.
    Both models use the same hyper-parameters.

    :param male_seqs: male code sequences; shape[1] is used as the model input size
    :param female_seqs: female code sequences; shape[1] is used as the model input size
    :returns: male representations, female representations
    """

    # hyper-parameters shared by the two models
    hidden_units, n_layers = 5, 2
    sda_params = {'epochs': 50, 'batch_size': 16, 'corrupt_lvl': 0.05}

    def fit_and_encode(seqs):
        """Build an SDA sized to seqs, train it on seqs and return its output for seqs."""
        model = SDA(seqs.shape[1], nhidden=hidden_units, nlayer=n_layers, param=sda_params)
        model.train(seqs)
        return model.apply(seqs)

    male_reps = fit_and_encode(male_seqs)
    female_reps = fit_and_encode(female_seqs)
    return male_reps, female_reps


In [7]:
""" 
Wrap up everything
"""

def entire_proc_mimic(custom_train_reps, data_path = "../mimiciii/PATIENTS_AGG.csv", n_features = 100,
                      max_iter = 10000000):
    """ 
    Executes the entire procedure for MIMIC dataset including
        - preprocess to obtain male sequences, male labels, female sequences and female labels
        - generate male and female representations with custom_train_reps
        - transport female representations to male representations (trans_female2male)
        - calculate accuracy statistics for males, females and transported females (cal_stats_binary)

    NOTE(review): trans_female2male and cal_stats_binary are not defined in this notebook
    (presumably they come from `common`); per the original notes, the statistics come from a
    logistic regression model trained on male representations and male expires -- confirm.

    :param function custom_train_reps: customized deep patient function for training representations,
        called as custom_train_reps(male_seqs, female_seqs) and returning (male_reps, female_reps)
    :param data_path: path to the processed patient CSV, forwarded to get_mimic_data
    :param n_features: number of codes (features) per patient, forwarded to get_mimic_data
    :param max_iter: iteration cap forwarded to both trans_female2male and cal_stats_binary;
        the default keeps the value that used to be hard-coded
    :returns: the accuracy scores as a 9-tuple: accuracy, precision and recall for males,
        then for females, then for transported females
    """
    # get_mimic_data spells its keyword "n_feaures"; translate the name here
    male_seqs, male_labels, female_seqs, female_labels = \
        get_mimic_data(data_path=data_path, n_feaures=n_features)
    male_reps, female_reps = custom_train_reps(male_seqs, female_seqs)
    trans_female_reps = trans_female2male(male_reps, female_reps, max_iter = max_iter)

    # unpack explicitly so a result with the wrong number of statistics fails here
    (male_accuracy, male_precision, male_recall,
     female_accuracy, female_precision, female_recall,
     trans_female_accuracy, trans_female_precision, trans_female_recall) = \
        cal_stats_binary(male_reps, male_labels, female_reps, female_labels, trans_female_reps,
                         max_iter = max_iter)
    return male_accuracy, male_precision, male_recall, \
        female_accuracy, female_precision, female_recall, \
        trans_female_accuracy, trans_female_precision, trans_female_recall
 

In [8]:
""" 
Run the entire procedure once (a single end-to-end run on the real data)
"""

# One end-to-end run, keeping 20 codes per patient; the nine statistics are
# stored as notebook-level variables for the next cell.
(
    male_accuracy, male_precision, male_recall,
    female_accuracy, female_precision, female_recall,
    trans_female_accuracy, trans_female_precision, trans_female_recall,
) = entire_proc_mimic(
    custom_train_reps,
    data_path="../mimiciii/PATIENTS_50_AGG.csv",
    n_features=20,
)

initializing: 2-layer SDAs

initialized: DA [layer: 1]
(*) no. of visible units: 20
(*) no. of hidden units: 5
(*) data corruption level: 0.05
(*) learning rate: 0.10
(*) batch size: 16
(*) no. of epochs: 50

initialized: DA [layer: 2]
(*) no. of visible units: 5
(*) no. of hidden units: 5
(*) data corruption level: 0.05
(*) learning rate: 0.10
(*) batch size: 16
(*) no. of epochs: 50

training: 2-layer SDAs

training: DA [layer: 1]
(*) preprocessing: normalize features
(*) epoch 1, cost 3.867
(*) epoch 2, cost 3.419
(*) training time: 0.13 sec.
applying: DA [layer: 1]

training: DA [layer: 2]
(*) preprocessing: normalize features
(*) epoch 1, cost 3.121
(*) epoch 2, cost 2.976
(*) training time: 0.11 sec.

training time: 1.85 sec.

applying: 2-layer SDA
(*) applying: DA [layer: 1]
(*) applying: DA [layer: 2]
initializing: 2-layer SDAs

initialized: DA [layer: 1]
(*) no. of visible units: 20
(*) no. of hidden units: 5
(*) data corruption level: 0.05
(*) learning rate: 0.10
(*) batch si

In [9]:
""" 
No repeated simulation is run here, so we can only display the statistics from the single run above

"""

# Display order: accuracy, precision, recall for males, then females, then transported females.
(
    male_accuracy, male_precision, male_recall,
    female_accuracy, female_precision, female_recall,
    trans_female_accuracy, trans_female_precision, trans_female_recall,
)


(0.5788923719958202,
 0.5920447074293228,
 0.3607049869817745,
 0.5069604164144776,
 0.5091794679655302,
 0.9661137440758294,
 0.4931606343057741,
 0.5068836045056321,
 0.28791469194312796)