In [1]:
import csv
import pickle
import warnings

import numpy as np
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
warnings.filterwarnings('ignore')

In [2]:
# Directory layout of the project (all paths end with a trailing slash because
# later cells build file names by plain string concatenation).
input_path = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/DataProcessed/'
data_path = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC data/'
out_path = 'E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/Output/'

# Every modeling cell shuffles patients with np.random.permutation. Seeding the
# global NumPy RNG once here makes a "Restart Kernel -> Run All" produce the
# same splits (and therefore the same AUROC numbers) every time.
SEED = 0
np.random.seed(SEED)

In [3]:
# The vocabulary sizes were all updated per preprocessed results
vocabsize_icd = 1071 #all 6985
vocabsize_meds = 4525
vocabsize_labs = 302 #all 710
vocabsize = vocabsize_icd+vocabsize_meds+vocabsize_labs


def _load_pickle(filename):
    """Unpickle ``input_path + filename`` and close the file afterwards.

    NOTE(review): pickle.load can execute arbitrary code, so only point this
    at files produced by the project's own preprocessing step.
    """
    with open(input_path + filename, 'rb') as handle:
        return pickle.load(handle)


def _as_object_array(items):
    """Wrap ``items`` in a 1-D object array, one element per entry of ``items``.

    Calling ``np.array`` directly on nested lists of unequal length raises
    ValueError on NumPy >= 1.24 (and only warned before that, which the
    notebook's blanket warning filter hides). Filling a pre-allocated object
    array keeps each entry as-is and still supports ``arr[perm]`` indexing.
    """
    boxed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        boxed[position] = item
    return boxed


# Per-patient code sequences, one array element per patient.
# presumably each element is a list of visits, each a list of integer codes — verify
input_seqs_icd = _as_object_array(_load_pickle('MIMICIIIPROCESSED.3digitICD9.seqs'))
input_seqs_meds = _as_object_array(_load_pickle('MIMICIIIPROCESSED.meds.seqs'))
input_seqs_labs = _as_object_array(_load_pickle('MIMICIIIPROCESSED.abnlabs.seqs'))
input_seqs_fullicd = _as_object_array(_load_pickle('MIMICIIIPROCESSED.seqs'))

# One label per patient, used as the classification target by the modeling cells.
labels = np.array(_load_pickle('MIMICIIIPROCESSED.morts'))

In [21]:
# fout = open("logistic_regression_interpretations.txt", 'w')

def combine_encounter(seqs, length):
    """Collapse a patient's encounters into a single multi-hot vector.

    Parameters
    ----------
    seqs : iterable of iterables
        One inner iterable of codes per encounter; each code is used as an
        index into the result.
    length : int
        Size of the returned vector.

    Returns
    -------
    numpy.ndarray
        Array of ``length`` zeros with a 1 at every code that appears in any
        encounter.
    """
    multi_hot = np.zeros(length)
    # Walk every code of every encounter in order; repeats just rewrite the 1.
    every_code = (code for encounter in seqs for code in encounter)
    for code in every_code:
        multi_hot[code] = 1
    return multi_hot

# Each matrix below has one multi-hot row per patient (see combine_encounter).

# for modeling using only diagnoses icd9 feature
input_icds = np.array([combine_encounter(patient_seqs, vocabsize_icd) for patient_seqs in input_seqs_icd])

# for modeling using only med feature
input_meds = np.array([combine_encounter(patient_seqs, vocabsize_meds) for patient_seqs in input_seqs_meds])

# for modeling using only abnormal lab feature
input_labs = np.array([combine_encounter(patient_seqs, vocabsize_labs) for patient_seqs in input_seqs_labs])

# for modeling with concatenated features
# Reuse the three matrices instead of re-encoding every patient a second time.
# Column order (ICD, then meds, then labs) is what get_factor relies on to map
# a column index back to a feature family. Unlike the per-row rebuild this
# raises if the three inputs do not have the same number of patients.
input_seqs = np.concatenate((input_icds, input_meds, input_labs), axis=1)

In [26]:
# Fractions of the (shuffled) patients used for training / validation / test.
trainratio = 0.7
validratio = 0.1
testratio = 0.2  # informational: the test split is whatever remains after validlindex


def _split_index(n_samples, fraction):
    """Return floor(n_samples * fraction), tolerant of float round-off.

    ``0.7 + 0.1`` evaluates to ``0.7999999999999999``, so a plain
    ``int(n * (0.7 + 0.1))`` lands one short whenever the exact product is a
    whole number (e.g. n=10 gives 7 instead of 8). The tiny offset absorbs
    that error without affecting products that are genuinely fractional.
    """
    return int(n_samples * fraction + 1e-9)


n_patients = len(input_seqs_icd)
# Rows [0, trainlindex) are training, [trainlindex, validlindex) validation,
# and [validlindex, end) test.
trainlindex = _split_index(n_patients, trainratio)
validlindex = _split_index(n_patients, trainratio + validratio)

def _read_name_lookup(csv_path):
    """Build a {column 1: column 2} dict from a CSV file with a header row.

    Uses the csv module so that quoted fields containing commas stay in one
    piece; splitting each line on ',' cut such names at the first comma and
    left a dangling quote. As a consequence the stored names no longer carry
    the literal surrounding double quotes that naive splitting kept.
    Rows with fewer than three columns (e.g. blank lines) are skipped.
    """
    lookup = {}
    with open(csv_path, 'r', newline='') as csv_file:
        rows = csv.reader(csv_file)
        next(rows, None)  # skip the header row
        for row in rows:
            if len(row) < 3:
                continue
            lookup[row[1]] = row[2]
    return lookup


def _load_types(filename):
    """Unpickle ``input_path + filename`` and close the file afterwards.

    NOTE(review): pickle.load can execute arbitrary code; only use it on files
    produced by the project's own preprocessing.
    """
    with open(input_path + filename, 'rb') as handle:
        return pickle.load(handle)


# Second column -> third column of each MIMIC dictionary table.
labnames = _read_name_lookup(data_path + 'D_LABITEMS.csv')
icdnames = _read_name_lookup(data_path + 'D_ICD_DIAGNOSES.csv')

# Vocabulary ("types") mappings written by preprocessing; consumed by
# get_ICD / get_med / get_lab.
icditems = _load_types('MIMICIIIPROCESSED.types')
meditems = _load_types('MIMICIIIPROCESSED.meds.types')
labitems = _load_types('MIMICIIIPROCESSED.abnlabs.types')

In [27]:
def get_ICD(icd):
    """Return a readable name for position ``icd`` of the ``icditems`` mapping.

    The ``icd``-th value of ``icditems`` (in iteration order) is used as an
    index into the list of its keys to pick a code string.
    NOTE(review): presumably the values are consecutive ids in insertion
    order, which would make this a reverse lookup — confirm.

    The code string, with dots removed and its first two characters dropped,
    is looked up in ``icdnames``; when it is not found the untouched code
    string is returned instead.
    """
    codes = list(icditems.keys())
    position = list(icditems.values())[icd]
    code = codes[position]
    lookup_key = code.replace(".", "")[2:]
    return icdnames.get(lookup_key, code)

def get_med(med):
    """Return the ``meditems`` key selected by position ``med``.

    The ``med``-th value of ``meditems`` (in iteration order) is used as an
    index into the list of its keys.
    NOTE(review): presumably the values are consecutive ids in insertion
    order, which would make this a reverse lookup — confirm.
    """
    names = list(meditems.keys())
    position = list(meditems.values())[med]
    return names[position]

def get_lab(lab):
    """Return the ``labnames`` entry for position ``lab`` of ``labitems``.

    The ``lab``-th value of ``labitems`` (in iteration order) is used as an
    index into the list of its keys; that key is then looked up in
    ``labnames``. Raises KeyError when the key is missing from ``labnames``.
    NOTE(review): presumably the values are consecutive ids in insertion
    order, which would make the first step a reverse lookup — confirm.
    """
    item_ids = list(labitems.keys())
    position = list(labitems.values())[lab]
    item_id = item_ids[position]
    return labnames[item_id]

def get_factor(i, patid):
    """Translate column ``i`` of the concatenated feature vector into a name.

    The concatenated vector is laid out as ICD columns, then medication
    columns, then lab columns. The boundaries are taken from ``vocabsize_icd``
    and ``vocabsize_meds`` rather than repeating their literal values, so this
    function stays correct if the vocabulary sizes are updated.

    Parameters
    ----------
    i : int
        Column index into the concatenated feature vector.
    patid : int
        Key into ``pattofullicd_dict``; only used for ICD columns, where the
        patient's entry maps the column index to the index passed to get_ICD.

    Returns
    -------
    tuple
        ``(name, family)`` where family is 0 for ICD, 1 for medication and
        2 for lab.
    """
    med_offset = vocabsize_icd
    lab_offset = vocabsize_icd + vocabsize_meds
    if i < med_offset:
        # pattofullicd_dict is created by the interpretation cell before
        # this function is called.
        return get_ICD(pattofullicd_dict[patid][i]), 0
    if i < lab_offset:
        return get_med(i - med_offset), 1
    return get_lab(i - lab_offset), 2

### Modeling with only diagnoses ICD codes

In [28]:
# Ten repetitions of: shuffle patients, split at trainlindex / validlindex,
# fit a logistic regression on the diagnosis (ICD) multi-hot features, and
# score AUROC on the validation and test parts.
best_aucrocs = []  # test AUROC of each run
for run in range(10):
    print('Run', run)

    # The same permutation is applied to features and labels so rows stay aligned.
    perm = np.random.permutation(input_icds.shape[0])
    rinput_seqs = input_icds[perm]
    rlabels = labels[perm]

    train_input_seqs = rinput_seqs[:trainlindex]
    train_labels = rlabels[:trainlindex]

    valid_input_seqs = rinput_seqs[trainlindex:validlindex]
    valid_labels = rlabels[trainlindex:validlindex]

    test_input_seqs = rinput_seqs[validlindex:]
    test_labels = rlabels[validlindex:]
    # Not read in this cell; kept so the set of names it defines is unchanged.
    test_input_seqs_interpretations = rinput_seqs[validlindex:]

    model = linear_model.LogisticRegression(solver='liblinear')
    model.fit(train_input_seqs, train_labels)

    # Column 1 of predict_proba holds the probability of the second class;
    # slicing replaces the former row-by-row Python loop.
    vpredict_probabilities = model.predict_proba(valid_input_seqs)[:, 1]
    print("Validation AUC_ROC: ", roc_auc_score(valid_labels, vpredict_probabilities))

    predict_probabilities = model.predict_proba(test_input_seqs)[:, 1]
    # Compute the test AUROC once and reuse it for both the log and the summary.
    test_aucroc = roc_auc_score(test_labels, predict_probabilities)
    print("Test AUC_ROC: ", test_aucroc)

    best_aucrocs.append(test_aucroc)

print("Average AUCROC:", np.mean(best_aucrocs), "+/-", np.std(best_aucrocs))

Run 0
Validation AUC_ROC:  0.8625110248598417
Test AUC_ROC:  0.8745527599170151
Run 1
Validation AUC_ROC:  0.872348434149606
Test AUC_ROC:  0.8705578450164293
Run 2
Validation AUC_ROC:  0.8580634580665418
Test AUC_ROC:  0.8685471317086528
Run 3
Validation AUC_ROC:  0.8700937540855376
Test AUC_ROC:  0.8712719688703326
Run 4
Validation AUC_ROC:  0.8725046550163139
Test AUC_ROC:  0.8714452298717611
Run 5
Validation AUC_ROC:  0.8664110867339795
Test AUC_ROC:  0.8692525675805549
Run 6
Validation AUC_ROC:  0.868168445374463
Test AUC_ROC:  0.8669152057997642
Run 7
Validation AUC_ROC:  0.8642507380481348
Test AUC_ROC:  0.8743292370796094
Run 8
Validation AUC_ROC:  0.8719781562946063
Test AUC_ROC:  0.8782778668075079
Run 9
Validation AUC_ROC:  0.8665964687124305
Test AUC_ROC:  0.8665874702201887
Average AUCROC: 0.8711737282871816 +/- 0.0034937829929205665


### Modeling with only medication feature

In [30]:
# Ten repetitions of: shuffle patients, split at trainlindex / validlindex,
# fit a logistic regression on the medication multi-hot features, and score
# AUROC on the validation and test parts.
best_aucrocs = []  # test AUROC of each run
for run in range(10):
    print('Run', run)

    # The same permutation is applied to features and labels so rows stay aligned.
    perm = np.random.permutation(input_meds.shape[0])
    rinput_seqs = input_meds[perm]
    rlabels = labels[perm]

    train_input_seqs = rinput_seqs[:trainlindex]
    train_labels = rlabels[:trainlindex]

    valid_input_seqs = rinput_seqs[trainlindex:validlindex]
    valid_labels = rlabels[trainlindex:validlindex]

    test_input_seqs = rinput_seqs[validlindex:]
    test_labels = rlabels[validlindex:]
    # Not read in this cell; kept so the set of names it defines is unchanged.
    test_input_seqs_interpretations = rinput_seqs[validlindex:]

    model = linear_model.LogisticRegression(solver='liblinear')
    model.fit(train_input_seqs, train_labels)

    # Column 1 of predict_proba holds the probability of the second class;
    # slicing replaces the former row-by-row Python loop.
    vpredict_probabilities = model.predict_proba(valid_input_seqs)[:, 1]
    print("Validation AUC_ROC: ", roc_auc_score(valid_labels, vpredict_probabilities))

    predict_probabilities = model.predict_proba(test_input_seqs)[:, 1]
    # Compute the test AUROC once and reuse it for both the log and the summary.
    test_aucroc = roc_auc_score(test_labels, predict_probabilities)
    print("Test AUC_ROC: ", test_aucroc)

    best_aucrocs.append(test_aucroc)

print("Average AUCROC:", np.mean(best_aucrocs), "+/-", np.std(best_aucrocs))

Run 0
Validation AUC_ROC:  0.8289482845161186
Test AUC_ROC:  0.8381359144510755
Run 1
Validation AUC_ROC:  0.8174112732933784
Test AUC_ROC:  0.8436212129308622
Run 2
Validation AUC_ROC:  0.8437161567436261
Test AUC_ROC:  0.83297411034039
Run 3
Validation AUC_ROC:  0.8349390228117031
Test AUC_ROC:  0.8380151715646871
Run 4
Validation AUC_ROC:  0.8390939828241775
Test AUC_ROC:  0.8374440605970046
Run 5
Validation AUC_ROC:  0.8504438369814038
Test AUC_ROC:  0.838266647272158
Run 6
Validation AUC_ROC:  0.829046602096121
Test AUC_ROC:  0.843218401505365
Run 7
Validation AUC_ROC:  0.8444933583791762
Test AUC_ROC:  0.8398961912905492
Run 8
Validation AUC_ROC:  0.8382423123799126
Test AUC_ROC:  0.8382886592013117
Run 9
Validation AUC_ROC:  0.8327802764749581
Test AUC_ROC:  0.839106497030648
Average AUCROC: 0.8388966866184051 +/- 0.0028522082771989476


### Modeling with only lab feature

In [31]:
# Ten repetitions of: shuffle patients, split at trainlindex / validlindex,
# fit a logistic regression on the abnormal-lab multi-hot features, and score
# AUROC on the validation and test parts.
best_aucrocs = []  # test AUROC of each run
for run in range(10):
    print('Run', run)

    # The same permutation is applied to features and labels so rows stay aligned.
    perm = np.random.permutation(input_labs.shape[0])
    rinput_seqs = input_labs[perm]
    rlabels = labels[perm]

    train_input_seqs = rinput_seqs[:trainlindex]
    train_labels = rlabels[:trainlindex]

    valid_input_seqs = rinput_seqs[trainlindex:validlindex]
    valid_labels = rlabels[trainlindex:validlindex]

    test_input_seqs = rinput_seqs[validlindex:]
    test_labels = rlabels[validlindex:]
    # Not read in this cell; kept so the set of names it defines is unchanged.
    test_input_seqs_interpretations = rinput_seqs[validlindex:]

    model = linear_model.LogisticRegression(solver='liblinear')
    model.fit(train_input_seqs, train_labels)

    # Column 1 of predict_proba holds the probability of the second class;
    # slicing replaces the former row-by-row Python loop.
    vpredict_probabilities = model.predict_proba(valid_input_seqs)[:, 1]
    print("Validation AUC_ROC: ", roc_auc_score(valid_labels, vpredict_probabilities))

    predict_probabilities = model.predict_proba(test_input_seqs)[:, 1]
    # Compute the test AUROC once and reuse it for both the log and the summary.
    test_aucroc = roc_auc_score(test_labels, predict_probabilities)
    print("Test AUC_ROC: ", test_aucroc)

    best_aucrocs.append(test_aucroc)

print("Average AUCROC:", np.mean(best_aucrocs), "+/-", np.std(best_aucrocs))

Run 0
Validation AUC_ROC:  0.816334671410292
Test AUC_ROC:  0.8180436563438287
Run 1
Validation AUC_ROC:  0.8148430081547692
Test AUC_ROC:  0.8134909943369953
Run 2
Validation AUC_ROC:  0.8101621466234638
Test AUC_ROC:  0.815339305792318
Run 3
Validation AUC_ROC:  0.8219084873679714
Test AUC_ROC:  0.815064318232513
Run 4
Validation AUC_ROC:  0.8275171521127935
Test AUC_ROC:  0.8237439294925875
Run 5
Validation AUC_ROC:  0.8297562726216542
Test AUC_ROC:  0.821424266118638
Run 6
Validation AUC_ROC:  0.8235573693961252
Test AUC_ROC:  0.8193252633704455
Run 7
Validation AUC_ROC:  0.8121092845888931
Test AUC_ROC:  0.8100109444926219
Run 8
Validation AUC_ROC:  0.8018404259342211
Test AUC_ROC:  0.8211440359050048
Run 9
Validation AUC_ROC:  0.8192805150949648
Test AUC_ROC:  0.8111088879996443
Average AUCROC: 0.8168695602084597 +/- 0.004369481199989567


### Modeling with concatenated features (diagnosis ICD codes + medications + abnormal lab components)

In [32]:
# Ten repetitions of: shuffle patients, split at trainlindex / validlindex,
# fit a logistic regression on the concatenated ICD + medication + lab
# features, and score AUROC on the validation and test parts.
# The names left behind by the final run (model, test_input_seqs, test_labels,
# predict_probabilities, test_input_seqs_fullicd,
# test_input_seqs_interpretations) are what the interpretation cell reads.
best_aucrocs = []  # test AUROC of each run
for run in range(10):
    print('Run', run)

    # One permutation is applied to every per-patient array so rows stay aligned.
    perm = np.random.permutation(input_seqs.shape[0])
    rinput_seqs = input_seqs[perm]
    rinput_seqs_fullicd = input_seqs_fullicd[perm]
    rlabels = labels[perm]
    r_input_icd = input_seqs_icd[perm]

    train_input_seqs = rinput_seqs[:trainlindex]
    train_input_seqs_fullicd = rinput_seqs_fullicd[:trainlindex]
    train_labels = rlabels[:trainlindex]

    valid_input_seqs = rinput_seqs[trainlindex:validlindex]
    valid_input_seqs_fullicd = rinput_seqs_fullicd[trainlindex: validlindex]
    valid_labels = rlabels[trainlindex:validlindex]

    test_input_seqs = rinput_seqs[validlindex:]
    test_input_seqs_fullicd = rinput_seqs_fullicd[validlindex:]
    test_labels = rlabels[validlindex:]
    # Raw (un-encoded) 3-digit ICD sequences of the test patients.
    test_input_seqs_interpretations = r_input_icd[validlindex:]

    model = linear_model.LogisticRegression(solver='liblinear')
    model.fit(train_input_seqs, train_labels)

    # Column 1 of predict_proba holds the probability of the second class;
    # slicing replaces the former row-by-row Python loop.
    vpredict_probabilities = model.predict_proba(valid_input_seqs)[:, 1]
    print("Validation AUC_ROC: ", roc_auc_score(valid_labels, vpredict_probabilities))

    predict_probabilities = model.predict_proba(test_input_seqs)[:, 1]
    # Compute the test AUROC once and reuse it for both the log and the summary.
    test_aucroc = roc_auc_score(test_labels, predict_probabilities)
    print("Test AUC_ROC: ", test_aucroc)

    best_aucrocs.append(test_aucroc)

print("Average AUCROC:", np.mean(best_aucrocs), "+/-", np.std(best_aucrocs))

Run 0
Validation AUC_ROC:  0.9035571922286929
Test AUC_ROC:  0.9081647220437435
Run 1
Validation AUC_ROC:  0.9018085605521937
Test AUC_ROC:  0.9022555420627703
Run 2
Validation AUC_ROC:  0.9062489733377497
Test AUC_ROC:  0.901601217462055
Run 3
Validation AUC_ROC:  0.8966805367619923
Test AUC_ROC:  0.901580479230438
Run 4
Validation AUC_ROC:  0.8997072145482338
Test AUC_ROC:  0.8935310175911682
Run 5
Validation AUC_ROC:  0.9003184303072719
Test AUC_ROC:  0.8976782830671866
Run 6
Validation AUC_ROC:  0.9082330140405183
Test AUC_ROC:  0.9000080175174793
Run 7
Validation AUC_ROC:  0.9020573177255856
Test AUC_ROC:  0.9011704346355244
Run 8
Validation AUC_ROC:  0.900692498566354
Test AUC_ROC:  0.9027827575514267
Run 9
Validation AUC_ROC:  0.8963414499828181
Test AUC_ROC:  0.9008529323044622
Average AUCROC: 0.9009625403466254 +/- 0.0035302471069723084


### Find the top 10 risk factors (ICD codes, medications, abnormal labs) for each patient predicted to die

In [33]:
# Relies on the globals left by the most recently executed modeling cell
# (model, test_input_seqs, test_labels, predict_probabilities) and on
# test_input_seqs_interpretations / test_input_seqs_fullicd, which hold raw
# code sequences only after the concatenated-features cell has run.

# For every test patient, map each code of the 3-digit ICD sequences to the
# code at the same visit/position of the full ICD sequences. When a 3-digit
# code occurs more than once, the last occurrence wins. Indexing (rather than
# zip) is deliberate: misaligned sequences raise instead of being truncated.
pattofullicd_dict = {}
for i, icd_visits in enumerate(test_input_seqs_interpretations):
    fullicd_visits = test_input_seqs_fullicd[i]
    icdtofullicd_dict = {}
    for j, icd_visit in enumerate(icd_visits):
        fullicd_visit = fullicd_visits[j]
        for k, icd_code in enumerate(icd_visit):
            icdtofullicd_dict[icd_code] = fullicd_visit[k]
    pattofullicd_dict[i] = icdtofullicd_dict

coeffs = np.array(model.coef_[0])

# The context manager closes the report even if a name lookup raises midway.
with open(out_path + "Log_Reg_Interpretations.txt", 'w') as interpretation_file:
    for patid in range(len(test_input_seqs)):
        # Only patients whose predicted probability exceeds 0.5 are reported,
        # so skip the rest before doing any per-feature work.
        if not (predict_probabilities[patid] > 0.5):
            continue

        test_input = test_input_seqs[patid]

        # Per-feature contribution: feature value times its coefficient.
        scores = (test_input*coeffs)

        # Visit only the active features, in ascending index order.
        active_features = np.flatnonzero(test_input > 0).tolist()
        risk_scores = [(get_factor(i, patid)[0], scores[i]) for i in active_features]
        risk_scores.sort(key=lambda tup: tup[1], reverse=True)

        top_risk_factors = risk_scores[:10]

        interpretation_file.write("ID: " + str(patid) + " True label: "+str(test_labels[patid])+"\n")
        for rf in top_risk_factors:
            interpretation_file.write(str(rf)+"\n")
        interpretation_file.write("\n")


# fpr, tpr, _ = roc_curve(test_labels, predict_probabilities)
# pickle.dump({"FPR":fpr, "TPR":tpr}, open('roc_lr.p', 'wb'))
# actual_predictions = (predict_probabilities>0.5)*1

# print classification_report(test_labels, actual_predictions)
# coeffs = np.array(model.coef_[0])

# icd_scores = {}
# icd_totals = {}
# med_scores = {}
# med_totals = {}
# lab_scores = {}
# lab_totals = {}

# for patid in range(len(test_input_seqs)):
# 	test_input = test_input_seqs[patid]
# 	scores = (test_input*coeffs)
# 	# scores = coeffs
# 	for i in range(len(scores)):
# 		if test_input[i]>0:
# 			factors = get_factor(i, patid)
# 			if factors[1] == 0:
# 				if factors[0] in icd_scores:
# 					icd_scores[factors[0]] += scores[i]
# 					icd_totals[factors[0]] += 1
# 				else:
# 					icd_scores[factors[0]] = scores[i]
# 					icd_totals[factors[0]] = 1
# 			elif factors[1] == 1:
# 				if factors[0] in med_scores:
# 					med_scores[factors[0]] += scores[i]
# 					med_totals[factors[0]] += 1
# 				else:
# 					med_scores[factors[0]] = scores[i]
# 					med_totals[factors[0]] = 1
# 			else:
# 				if factors[0] in lab_scores:
# 					lab_scores[factors[0]] += scores[i]
# 					lab_totals[factors[0]] += 1
# 				else:
# 					lab_scores[factors[0]] = scores[i]
# 					lab_totals[factors[0]] = 1


# icd_averages = []
# med_averages = []
# lab_averages = []

# for factor in icd_scores:
# 	icd_averages.append((factor, icd_scores[factor]/icd_totals[factor]))
# icd_averages.sort(key=lambda tup: tup[1], reverse=True)
# fout.write("ICD codes:\n")
# for item in icd_averages:
# 	fout.write(item[0]+"-"+str(item[1])+"\n")
# fout.write("\n")

# for factor in med_scores:
# 	med_averages.append((factor, med_scores[factor]/med_totals[factor]))
# med_averages.sort(key=lambda tup: tup[1], reverse=True)
# fout.write("Medications:\n")
# for item in med_averages:
# 	fout.write(item[0]+"-"+str(item[1])+"\n")
# fout.write("\n")

# for factor in lab_scores:
# 	lab_averages.append((factor, lab_scores[factor]/lab_totals[factor]))
# lab_averages.sort(key=lambda tup: tup[1], reverse=True)
# fout.write("Lab components:\n")
# for item in lab_averages:
# 	fout.write(item[0]+"-"+str(item[1])+"\n")
# fout.write("\n")

# scores = [(scores[i], get_factor(i)) for i in range(len(scores)) if test_input[i]>0]
# scores.sort(key=lambda tup: tup[0], reverse=True)

# for factor in scores:
# 	if factor[1] in ["\"Encephalopathy NOS\"", "\"Bleed esoph var oth dis\"", "\"Lactulose Enema\"", "\"Cirrhosis of liver NOS\"", "\"Urin tract infection NOS\"", "\"Phytonadione\"", "\"Hy kid NOS w cr kid I-IV\"", "\"Mal neo liver", "\"Red blood cells\"", "\"RDW\"", "\"Hemoglobin\"", "\"0.9% Sodium Chloride\""]:
# 		print factor

#for factor in scores: