In [28]:
import sys
sys.path.append("/home/wanxinli/deep_patient")

from deep_patient.sda import SDA
import numpy as np
import ot
import pandas as pd
from scipy import sparse
from sklearn import linear_model


In [29]:
"""
Generate synthetic dataset for males in the following way:
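(integer) medical code range: 0 to 19 inclusive (np.random.randint is called with high=20, which is exclusive)
explanatory data dimension: [100, 10] for males
label: expire, = 1 if a row contains both code 1 and code 2, = 0 otherwise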
"""

# male_seqs = np.random.randint(low=0, high=20, size=[100, 10])
# print("male_seqs shape is:", male_seqs.shape)

def gen_label(seqs, code_1, code_2):
    """
    Build the binary expire label for every sequence in seqs

    A sequence gets label 1 when it contains both code_1 and code_2,
    and label 0 otherwise.

    :param seqs: iterable of code sequences, one per row
    :param code_1: first code that has to be present
    :param code_2: second code that has to be present
    :returns: numpy array with one 0/1 label per sequence
    """
    # our mechanism for determining the expired label: both codes must occur
    return np.array([
        int(code_1 in seq and code_2 in seq)
        for seq in seqs
    ])

def gen_male_seqs_labels(n_patients=100, seq_len=10, n_codes=20):
    """
    Generate male seqs and labels (expires)

    The defaults reproduce the original hard-coded setup, so calling the
    function without arguments behaves as before.

    :param n_patients: number of rows (sequences) to generate
    :param seq_len: number of codes in each row
    :param n_codes: exclusive upper bound for the codes, i.e. codes are
        integers from 0 to n_codes - 1
    :returns: male sequences of shape [n_patients, seq_len], male expire labels
    """
    male_seqs = np.random.randint(low=0, high=n_codes, size=[n_patients, seq_len])
    # a male row is labelled as expired when it contains both code 1 and code 2
    male_expires = gen_label(male_seqs, 1, 2)
    return male_seqs, male_expires


In [30]:
"""
Generate synthetic dataset for females in the following way:
swap 1 with 3, 2 with 4 in the male dataset
"""

def gen_female_seqs_labels(male_seqs):
    """
    Derive the female seqs and labels (expires) from the male seqs

    Codes 1 and 3 are exchanged and codes 2 and 4 are exchanged; every other
    code is copied unchanged. A female row is labelled 1 when it contains
    both code 3 and code 4.

    :param male_seqs: male sequences, one row per patient
    :returns: female sequences, female expire labels
    """
    # lookup table for the swapped codes; codes not listed map to themselves
    swapped_codes = {1: 3, 3: 1, 2: 4, 4: 2}
    female_seqs = np.array([
        [swapped_codes.get(code, code) for code in male_seq]
        for male_seq in male_seqs
    ])
    return female_seqs, gen_label(female_seqs, 3, 4)



In [31]:
"""
Train deep patient model and generate representations for males and females
"""

def _train_and_apply_sda(seqs, nhidden, nlayer):
    """
    Initiate one SDA model, train it on seqs and apply it to the same seqs

    :param seqs: 2-D array of sequences; the number of columns is passed to
        SDA as its first argument
    :param nhidden: value forwarded to SDA as nhidden
    :param nlayer: value forwarded to SDA as nlayer
    :returns: the result of applying the trained model to seqs
    """
    # initiate the model
    sda = SDA(seqs.shape[1],
              nhidden=nhidden,
              nlayer=nlayer,
              param={
                  'epochs': 10,
                  'batch_size': 5,
                  'corrupt_lvl': 0.05
              })

    # train the model
    sda.train(seqs)

    # apply the model
    return sda.apply(seqs)


def custom_train_reps(male_seqs, female_seqs):
    """
    Customized training algorithm for generating male representations and female representations

    A separate SDA model is trained for each group with identical settings;
    the male model is built, trained and applied before the female one.

    :param male_seqs: male sequences, one row per patient
    :param female_seqs: female sequences, one row per patient
    :returns: male representations, female representations
    """

    # customized parameters
    nhidden = 5
    nlayer = 1

    male_reps = _train_and_apply_sda(male_seqs, nhidden, nlayer)
    female_reps = _train_and_apply_sda(female_seqs, nhidden, nlayer)
    return male_reps, female_reps


In [32]:
""" 
Transport female representations to male representations
We would expect a one to one mapping by row ID
"""

def trans_female2male(male_reps, female_reps):
    """
    Move the female representations onto the male representations with
    optimal transport (EMD, without entropy regularization)

    :param male_reps: male representations, used as the target (Xt)
    :param female_reps: female representations, used as the source (Xs)
    :returns: transported female representations
    """
    transporter = ot.da.EMDTransport()
    # females are the source domain, males are the target domain
    transporter.fit(Xs=female_reps, Xt=male_reps)
    return transporter.transform(Xs=female_reps)


In [33]:
""" 
Calculate statistics
"""

def cal_stats(male_reps, male_expires, female_reps, female_expires, trans_female_reps):
    """
    Fit a logistic regression on the male representations and male expire
    labels, then score that single model on three inputs

    :param male_reps: male representations (also the training data)
    :param male_expires: male expire labels
    :param female_reps: female representations
    :param female_expires: female expire labels, used for both female scores
    :param trans_female_reps: transported female representations
    :returns: accuracy for males using male model, accuracy for females using male model,
        accuracy for transported females using male model
    """
    male_clf = linear_model.LogisticRegression()
    male_clf.fit(male_reps, male_expires)
    return (
        male_clf.score(male_reps, male_expires),
        male_clf.score(female_reps, female_expires),
        male_clf.score(trans_female_reps, female_expires),
    )


In [34]:
""" 
Wrap up everything
"""

def entire_proc():
    """
    Run one full simulation, step by step:
        1. generate male and female sequences and expires (labels)
        2. generate male and female representations
        3. transport female representations to male representations
        4. train logistic regression model using male representations and male expires
        5. calculate accuracy statistics for males, females and transported females

    :returns: the accuracy scores (males, females, transported females)
    """
    m_seqs, m_expires = gen_male_seqs_labels()
    f_seqs, f_expires = gen_female_seqs_labels(m_seqs)
    m_reps, f_reps = custom_train_reps(m_seqs, f_seqs)
    f_reps_transported = trans_female2male(m_reps, f_reps)
    male_acc, female_acc, trans_female_acc = cal_stats(
        m_reps, m_expires, f_reps, f_expires, f_reps_transported
    )
    return male_acc, female_acc, trans_female_acc
    

In [35]:
""" 
Run entire procedure on multiple simulations and print accuracy statistics
"""
# number of independent simulations to run
n_sim = 100

# one accuracy list per evaluated population, filled in simulation order
male_logit_scores, female_logit_scores, trans_female_logit_scores = [], [], []

for _ in range(n_sim):
    male_acc, female_acc, trans_female_acc = entire_proc()
    male_logit_scores.append(male_acc)
    female_logit_scores.append(female_acc)
    trans_female_logit_scores.append(trans_female_acc)



initializing: 1-layer SDAs

initialized: DA [layer: 1]
(*) no. of visible units: 10
(*) no. of hidden units: 5
(*) data corruption level: 0.05
(*) learning rate: 0.10
(*) batch size: 5
(*) no. of epochs: 10

training: 1-layer SDAs

training: DA [layer: 1]
(*) preprocessing: normalize features
(*) epoch 1, cost 6.915
(*) epoch 2, cost 6.563
(*) training time: 0.00 sec.

training time: 0.29 sec.

applying: 1-layer SDA
(*) applying: DA [layer: 1]
initializing: 1-layer SDAs

initialized: DA [layer: 1]
(*) no. of visible units: 10
(*) no. of hidden units: 5
(*) data corruption level: 0.05
(*) learning rate: 0.10
(*) batch size: 5
(*) no. of epochs: 10

training: 1-layer SDAs

training: DA [layer: 1]
(*) preprocessing: normalize features
(*) epoch 1, cost 8.474
(*) epoch 2, cost 6.630
(*) epoch 3, cost 6.521
(*) training time: 0.00 sec.

training time: 0.16 sec.

applying: 1-layer SDA
(*) applying: DA [layer: 1]
initializing: 1-layer SDAs

initialized: DA [layer: 1]
(*) no. of visible units:

In [36]:
""" 
Constructs a dataframe to demonstrate the accuracy statistics
"""
# Build the frame in one step instead of mutating an empty frame column by
# column; the dict keeps the columns in the same order as before.
score_df = pd.DataFrame({
    'male_logit_score': male_logit_scores,
    'female_logit_score': female_logit_scores,
    'trans_female_logit_score': trans_female_logit_scores,
})
print(score_df)
# index=False is the documented way to leave the row index out of the csv
# (index=None only worked because None is falsy).
score_df.to_csv("../outputs/scores.csv", index=False)

    male_logit_score  female_logit_score  trans_female_logit_score
0               0.86                0.86                      0.86
1               0.84                0.84                      0.84
2               0.86                0.86                      0.86
3               0.85                0.85                      0.85
4               0.85                0.85                      0.85
..               ...                 ...                       ...
95              0.80                0.78                      0.80
96              0.88                0.88                      0.88
97              0.89                0.89                      0.89
98              0.89                0.89                      0.89
99              0.83                0.83                      0.83

[100 rows x 3 columns]


In [None]:
""" 
Box plot of the scores
"""