In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix
from mimicloader import *

In [3]:
class confusion_matrix_metrics(object):
    def __init__(self,M):
        self.M = M
        self.TP = M[0,0]
        self.FN = M[0,1]
        self.FP = M[1,0]
        self.TN = M[1,1]
    
    def sensitivity(self):
        return self.TP/(self.TP + self.FN)
    def specificity(self):
        return self.TN/(self.TN + self.FP)
    def precision(self):
        return self.TP/(self.TP + self.FP)
    def negative_predictive_value(self):
        return self.TN/(self.TN + self.FN)
    def miss_rate(self):
        return 1 - self.sensitivity()
    def fall_out(self):
        return 1 - self.specificity()
    def false_discovery(self):
        return 1 - self.precision()
    def false_omission(self):
        return 1 - self.negative_predictive_value()
    def threat_score(self):
        return self.TP/(self.TP + self.FN + self.FP)
    def accuracy(self):
        return (self.TP + self.TN)/(self.TP + self. TN + self.FP + self.FN)
    def balanced_accuracy(self):
        return self.sensitivity()/2 + self.specificity()/2
    
filepath = 'mimic_dataset.csv'
loader = MIMICLoader()
df = loader.load(filepath)
df1 = df[df.hospital_expire_flag == 1]
df2 = df[df.hospital_expire_flag == 0]
df2 = df2.sample(n=len(df1))
df3 = df1.append(df2)
y = df3.hospital_expire_flag
X = df3.drop(columns=['hospital_expire_flag'])

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42)

clf = AdaBoostClassifier(n_estimators=1000,learning_rate=0.5)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)



0.8044485634847081

In [4]:
clf = RandomForestClassifier(n_estimators=1000,criterion='gini')
clf.fit(X_train,y_train)
yhat = clf.predict(X_test)
clf.score(X_test,y_test)

0.8396663577386468

In [22]:

df = pd.read_csv('mimic_dataset.csv')
df.drop(columns=['Unnamed: 0'],inplace=True)
df = df[df['patientweight'].notna()]
fillcols = {'hospital_expire_flag':0,'age':df.age.mean(),'NumDrugs':0,'num_procedures':0,'curr_service':0,'num_serv':0,'num_transfers':0,'curr_careunit':0,\
            'avg_los':df.avg_los.mean(),'tot_los':df.tot_los.mean(),'num_unique_reads':df.num_unique_reads.mean(),\
           'total_reads':df.total_reads.mean(),'uinique_caregivers':df.uinique_caregivers.mean(),'total_icd9':df.total_icd9.mean(),'total_icu_hours':0,\
           'avg_icu_hours':0,'total_icu_stays':0,'avg_num_drug_administered':0,'max_drug_administered':0,'total_input_drugs':0,'tot_routes':0,\
           'tot_org':0,'org_name':0,'org_itemid':0}
df.fillna(value=fillcols,inplace=True)
serv = df.curr_service.unique()
care = df.curr_careunit.unique()
org = df.org_name.unique()

def replace_stuff(s):
    new_dic = {}
    i = 1
    for j in s:
        if j != 0:
            new_dic[j] = i
            i +=1
    return new_dic
df.replace({'curr_service':replace_stuff(serv),'curr_careunit':replace_stuff(care),'org_name':replace_stuff(org)},inplace=True)
df = df.apply(np.int64)
df = df.drop(columns=['subject_id','hadm_id','total_input_drugs','total_icu_hours','max_drug_administered','org_name'])
df1 = df[df.hospital_expire_flag == 1]
df2 = df[df.hospital_expire_flag == 0]
df2 = df2.sample(n=len(df1))
df3 = df1.append(df2)
X = df3.drop(columns=['hospital_expire_flag'])
y = df3.hospital_expire_flag
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
df = pd.read_csv('mimic_dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,hospital_expire_flag,age,NumDrugs,num_procedures,curr_service,num_serv,num_transfers,...,avg_icu_hours,total_icu_stays,avg_num_drug_administered,max_drug_administered,total_input_drugs,tot_routes,patientweight,tot_org,org_name,org_itemid
0,0,17575,187131,0,18,,9.0,TRAUM,1.0,1.0,...,3.0,1.0,25.461538,93.0,331.0,2.0,,1.0,"STAPHYLOCOCCUS, COAGULASE NEGATIVE",80155.0
1,1,51820,148131,0,89,20.0,2.0,MED,1.0,1.0,...,2.0,1.0,,,,,82.0,2.0,GRAM NEGATIVE ROD(S),80058.0
2,2,25835,162690,0,305,53.0,,MED,1.0,2.0,...,1.0,1.0,,,,,42.7,4.0,POSITIVE FOR METHICILLIN RESISTANT STAPH AUREUS,80293.0
3,3,71915,198577,0,89,22.0,,MED,1.0,3.0,...,1.0,1.0,,,,,50.4,0.0,,
4,4,368,138061,0,302,79.0,1.0,MED,1.0,3.0,...,2.0,1.0,34.600000,116.0,173.0,3.0,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61200,61200,5298,183445,0,88,,5.0,CSURG,2.0,4.0,...,1.0,1.0,10.333333,32.0,124.0,3.0,,,,
61201,61201,13964,175807,0,88,,,NSURG,1.0,2.0,...,1.0,1.0,6.500000,11.0,13.0,2.0,,0.0,,
61202,61202,11789,145800,0,88,,6.0,CSURG,2.0,3.0,...,2.0,1.0,14.647059,78.0,249.0,3.0,,2.0,ESCHERICHIA COLI,80002.0
61203,61203,17022,116381,0,88,,1.0,NSURG,1.0,2.0,...,3.0,1.0,34.857143,118.0,244.0,3.0,,,,


In [21]:
df.patientweight.mean()

81.37921022067364

In [22]:
model = AdaBoostClassifier()
model.fit(X_train,y_train)
yhat = model.predict(X_test)
M = confusion_matrix(y_true=y_test,y_pred=yhat)
metrics = confusion_matrix_metrics(M)
metrics.sensitivity()
metrics.specificity()

0.28709055876685935

In [23]:
metrics.balanced_accuracy()

0.6318092812425977

### Class Balancing

In [None]:
df1 = df[df.hospital_expire_flag == 1]
df2 = df[df.hospital_expire_flag == 0]
df2 = df2.sample(n=len(df1))
df3 = df1.append(df2)
X = df3.drop(columns=['hospital_expire_flag'])
y = df3.hospital_expire_flag
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = AdaBoostClassifier(n_estimators=1000,learning_rate=0.5)
model.fit(X_train1,y_train1)
yhat1 = model.predict(X_test1)
yhat = model.predict(X_test)
M1 = confusion_matrix(y_true=y_test1,y_pred=yhat1)
M = confusion_matrix(y_test,yhat)
metrics = confusion_matrix_metrics(M)
metrics1 = confusion_matrix_metrics(M1)
metrics.balanced_accuracy()

In [None]:
model1 = LogisticRegression()
model1.fit(X_train1,y_train1)
yhat1 = model1.predict(X_test1)
yhat = model1.predict(X_test)
M1 = confusion_matrix(y_true=y_test1,y_pred=yhat1)
M = confusion_matrix(y_test,yhat)
metrics = confusion_matrix_metrics(M)
metrics1 = confusion_matrix_metrics(M1)
metrics.balanced_accuracy()

In [None]:
plt.figure(figsize=(18,18))
sns.heatmap(df.corr(),annot=True)

In [None]:
model1 = RandomForestClassifier(n_estimators=1000,criterion='gini')
model1.fit(X_train1,y_train1)
yhat1 = model1.predict(X_test1)
yhat = model1.predict(X_test)
M1 = confusion_matrix(y_true=y_test1,y_pred=yhat1)
M = confusion_matrix(y_test,yhat)
metrics = confusion_matrix_metrics(M)
metrics1 = confusion_matrix_metrics(M1)

In [None]:
M

In [None]:
metrics.precision()

In [None]:
metrics.sensitivity()

In [None]:
metrics.specificity()

In [None]:
metrics.negative_predictive_value()

In [None]:
metrics.accuracy()

In [None]:
metrics.balanced_accuracy()

In [None]:
import numba
class StumpLearner(object):
    def __init__(self):
        self.direction = 1
        self.node_feat_index = None
        self.thresh = None
        self.weight = None

class AdaBoost(object):

    def __init__(self, T):
        self.T = T

    def fit(self, X, y,random_selection=5):
        n_observations, n_feats = X.shape
        D = 1/n_observations*np.ones(n_observations)
        self.random_selection = random_selection
        self.classifier = []

        for _ in range(self.T):
            classifier = StumpLearner()
            min_err = float('inf')

            # pick random subset
            feats = np.random.randint(0,n_feats,self.random_selection)

            for feat_i in feats:
                thresholds = np.unique(X[:,feat_i])

                # bin thresholds, make threshold matrix from beginning?
                for threshold in thresholds:
                    direction = 1
                    prediction = np.ones(len(y))
                    prediction[X[:,feat_i] < threshold] = -1
                    #error = sum(D[y != prediction])
                    error = speedy_sum(D,y,prediction)

                    if error > 1/2:
                        error = 1-error
                        direction = -1
                    if error<min_err:
                        classifier.direction = direction
                        classifier.thresh = threshold
                        classifier.node_feat_index = feat_i
                        min_err = error

            classifier.weight = np.float(1/2)*np.log(1/error - 1)
            prediction = np.ones(len(y))
            prediction[(classifier.direction * X[:, classifier.node_feat_index] < classifier.direction * classifier.thresh)] = -1
            D = D*np.exp(-classifier.weight * y * prediction)
            D = D/np.sum(D)
            self.classifier.append(classifier)

    def predict(self,X):
        n_observations = X.shape[0]
        y_hat = np.zeros((n_observations,1))
        for WL in self.classifier:
            prediction = np.ones(np.shape(y_hat))
            prediction[(WL.direction * X[:, WL.node_feat_index] < WL.direction * WL.thresh)] = -1
            y_hat += WL.weight*prediction

        return np.sign(y_hat).flatten()

@jit
def speedy_sum(D,p,y):
    return np.sum(D[y != p])

In [None]:
confusion_matrix(yte,yhat1)

In [None]:
import random

import pandas as pd
import numpy as np


class MIMICLoader:
    def load(self, path=''):
        df = pd.read_csv(path)
        df = df[df['patientweight'].notna()]
        fillcols = {'hospital_expire_flag': 0, 'age': df.age.mean(), 'NumDrugs': 0, 'num_procedures': 0,
                    'curr_service': 0, 'num_serv': 0, 'num_transfers': 0, 'curr_careunit': 0, \
                    'avg_los': df.avg_los.mean(), 'tot_los': df.tot_los.mean(),
                    'num_unique_reads': df.num_unique_reads.mean(), \
                    'total_reads': df.total_reads.mean(), 'uinique_caregivers': df.uinique_caregivers.mean(),
                    'total_icd9': df.total_icd9.mean(), 'total_icu_hours': 0, \
                    'avg_icu_hours': 0, 'total_icu_stays': 0, 'avg_num_drug_administered': 0,
                    'max_drug_administered': 0, 'total_input_drugs': 0, 'tot_routes': 0, \
                    'tot_org': 0, 'org_name': 0, 'org_itemid': 0}
        df.fillna(value=fillcols,inplace=True)
        return df

    def getDeceased(self, data=pd.DataFrame, number=None):
        dead = data.loc[data['hospital_expire_flag'] == 1]
        dead = dead.drop(columns='hospital_expire_flag')

        if(number):
            indicies = np.arange(0, number)
            random.shuffle(indicies)
            dead = dead.iloc[indicies]

        return dead

    def getLiving(self, data=pd.DataFrame, number=None):
        living = data.loc[data['hospital_expire_flag'] == 0]
        living = living.drop(columns='hospital_expire_flag')

        if(number):
            indicies = np.arange(0, number)
            random.shuffle(indicies)
            living = living.iloc[indicies]

        return living

    def train_test_split(self, data=pd.DataFrame, train_size=.8, kfolds=5, rand_seed=42, reduced=False):
        '''
        Loads the MIMIC csv at the path. Returns a tuple of:
            array of indicies for k-fold training data location
            array of indicies for k-fold testing data location
            array of indicies for validation data location
        Note, the validation data is indexed BEFORE doing K-Folds, so there is
        no overlap of the validation indexes with k-folds.
        :param data: Pandas dataframe of the MIMIC data
        :param train_size: percent of data to put in the training / validation
        :param kfolds: Number of folds
        :param rand_seed: Shuffle seed
        :param reduced: If true, uses only 10% of the data, useful for testing
        :return:
        '''
        rows = len(data.index)

        if (reduced):
            rows = int(rows * .1)
            data = data[:rows]

        indicies = np.arange(start=0, stop=len(data.index), step=1)
        random.seed(rand_seed)
        random.shuffle(indicies)

        train_length = int(len(indicies) * train_size)
        train_index = indicies[:train_length]
        val_index = indicies[train_length:]

        train_set = []
        test_set = []

        fold_length = int(train_length / kfolds)
        for i in range(kfolds):
            train_index_np = np.array(train_index)
            test = train_index_np[i * fold_length:(i + 1) * fold_length]
            train = np.delete(train_index_np, train_index_np[i * fold_length:(i + 1) * fold_length])
            train_set.append(train)
            test_set.append(test)

        return train_set, test_set, val_index

In [None]:

ml = MIMICLoader()
data = ml.load('mimic_dataset.csv')
kfolds_train, kfolds_test, validation = \
    ml.train_test_split(data, train_size=.8, kfolds=5, rand_seed=42, reduced=True)

living = ml.getLiving(data, number=60)
dead = ml.getDeceased(data, number=60)

train, test, val = ml.train_test_split(living)

x = 0