In [None]:
""" 
Run deep patient on MIMIC dataset
"""
# Python version 3.8.10 and numpy version 1.22.3 works
import getpass
user_id = getpass.getuser()  # login name, used below to build per-user paths

import sys 
print(sys.version)  # log the interpreter version next to the results
import json
import numpy as np
print(np.__version__)  # log the numpy version next to the results

import matplotlib.pyplot as plt
import pandas as pd

# Make the project-local modules importable. The path separator after the
# user name was missing here, which produced '/home/<user>synthetic_exp'.
# NOTE(review): '/home/<user>/synthetic_exp' mirrors the OTTEHR path below;
# confirm this is where the checkout actually lives.
sys.path.insert(0, f'/home/{user_id}/synthetic_exp')
# Star import kept as-is: the names it provides are not visible from this
# notebook (presumably trans_female2male and cal_stats_cts, which are used
# further down, come from here - TODO confirm and import them explicitly).
from common import *

sys.path.insert(0, f'/home/{user_id}/OTTEHR')
from OTTEHR.sda import SDA

base_dir = f"/home/{user_id}/OTTEHR"
data_dir = "outputs/mimic"


Use 100 as the feature dimension (number of codes per patient) for both males and females; this value was decided from the previous plots.

In [None]:
def get_mimic_data(n_feaures = 100, data_path = "../mimiciii/ADMISSIONS_AGG.csv"):

    """ 
    Get male sequences, male labels, female sequences and female labels for MIMIC dataset

    The aggregated admissions table is read from data_path. The columns used are
    'gender' (rows equal to 'M' or 'F' are kept, anything else is ignored),
    'duration' (returned as the label) and 'ICD codes' (a JSON-encoded list of
    codes per patient).

    :param int n_feaures: number of codes (features) per patient, 100 is the default number.
        The parameter name is misspelled; it is kept unchanged so that existing
        keyword callers keep working.
    :param str data_path: path of the aggregated admissions CSV file, defaults to the
        location that was previously hard-coded
    :returns: male sequences, male labels, female sequences, female labels.
        The sequences are arrays with one row per patient and n_feaures columns
        (also when a group has no patients); the labels are 1-D arrays.
    :raises ValueError: if n_feaures is negative
    """

    if n_feaures < 0:
        # a negative value would silently slice codes off the end and give ragged rows
        raise ValueError(f"n_feaures must be non-negative, got {n_feaures}")

    patient_agg_df = pd.read_csv(data_path, \
        index_col=None, header=0, converters={"ICD codes": json.loads})
    male_df = patient_agg_df.loc[patient_agg_df['gender'] == 'M']
    female_df = patient_agg_df.loc[patient_agg_df['gender'] == 'F']

    # get male labels and female labels
    male_labels = male_df['duration'].to_numpy()
    female_labels = female_df['duration'].to_numpy()

    def pad_seqs(seqs):
        """ 
        Pad or truncate the codes of every patient to n_feaures (i.e., passed from the wrapper function)

        New lists are built for every patient, so the lists held by the data frame
        are never modified.

        :param seqs: iterable with one list of codes per patient
        :returns: padded sequences as an array, each row has n_feaures codes
        """
        padded = [
            # keep at most the first n_feaures codes, then fill up with 0
            list(seq[:n_feaures]) + [0] * max(0, n_feaures - len(seq))
            for seq in seqs
        ]
        # the explicit shape keeps the result 2-D even when there are no patients
        return np.array(padded).reshape(len(padded), n_feaures)

    # get male sequences and female sequences
    male_seqs = pad_seqs(male_df['ICD codes'])
    female_seqs = pad_seqs(female_df['ICD codes'])

    return male_seqs, male_labels, female_seqs, female_labels


In [None]:
"""
Train deep patient model and generate representations for males and females
"""

def custom_train_reps(male_seqs, female_seqs):
    """ 
    Fit one SDA model per gender and encode each group with its own model

    Both models are created with the same settings; the input size of each model
    is taken from the second dimension (shape[1]) of the sequences it is fitted on.
    The male model is built, trained and applied first, then the female model.

    :param male_seqs: male sequences, must expose .shape with at least two dimensions
    :param female_seqs: female sequences, must expose .shape with at least two dimensions
    :returns: male representations, female representations
    """

    # settings shared by the male and the female model
    hidden_units = 20
    num_layers = 3
    train_opts = {
        'epochs': 50,
        'batch_size': 32,
        'corrupt_lvl': 0.05}

    def _fit_and_encode(seqs):
        # build a fresh model for this group, train it, then apply it to the same data
        model = SDA(seqs.shape[1],
                    nhidden=hidden_units,
                    nlayer=num_layers,
                    param=train_opts)
        model.train(seqs)
        return model.apply(seqs)

    male_reps = _fit_and_encode(male_seqs)
    female_reps = _fit_and_encode(female_seqs)
    return male_reps, female_reps


In [None]:
""" 
Wrap up everything
"""

def entire_proc_mimic(custom_train_reps):
    """ 
    Run the entire MIMIC procedure once and return the resulting scores

    Steps, in order:
        - load male / female sequences and labels with get_mimic_data() (default arguments)
        - turn the sequences into representations with custom_train_reps
        - print the shape of the male representations
        - pass male and female representations to trans_female2male to obtain the
          transported female representations
        - pass representations, labels and transported representations to cal_stats_cts

    trans_female2male and cal_stats_cts are not defined in this notebook
    (presumably they come from the star import of `common` - TODO confirm).

    :param function custom_train_reps: called as custom_train_reps(male_seqs, female_seqs),
        must return (male representations, female representations)
    :returns: male score, female score, transported female score, as produced by cal_stats_cts
    """
    seqs_m, labels_m, seqs_f, labels_f = get_mimic_data()
    reps_m, reps_f = custom_train_reps(seqs_m, seqs_f)
    print("male shape is:", reps_m.shape)

    reps_f_trans = trans_female2male(reps_m, reps_f)
    scores = cal_stats_cts(reps_m, labels_m, reps_f, labels_f, reps_f_trans)

    # unpack so that exactly three scores are handed back as a tuple
    male_score, female_score, trans_female_score = scores
    return male_score, female_score, trans_female_score
 

In [None]:
""" 
Run the entire procedure once
"""

# Execute the full MIMIC pipeline with the custom_train_reps function defined
# above and keep the three returned scores as notebook-level variables.
(male_score,
 female_score,
 trans_female_score) = entire_proc_mimic(custom_train_reps)

In [None]:
""" 
No simulation, can only plot the coefficient of determination of the predictions

Note that the best possible score is 1.0 and it can be negative \
    (because the model can be arbitrarily worse). \
    A constant model that always predicts the expected value of y, \
    disregarding the input features, would get a score of 0.0.
"""

# Display the three scores (male, female, transported female) as the cell output.
(male_score, female_score, trans_female_score)


In [None]:
""" 
TODO: consider
- do synthetic experiments on continuous labels
- MIMIC: more complicated regression models 
- MIMIC: other padding methods
"""