In [1]:
# Common imports
import numpy as np
import numpy.random as rnd
import os
import pandas as pd

# To make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Show all columns when displaying dataframes
pd.set_option('display.max_columns', None)

In [2]:
csv_path = 'MIMIC II/ADMISSIONS.csv'
admissions = pd.read_csv(csv_path)

csv_path = 'MIMIC II/PATIENTS.csv'
patients = pd.read_csv(csv_path)

In [3]:
# Clean up dataframes
admissions = admissions.drop(['ROW_ID', 'DEATHTIME', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'EDREGTIME', 'EDOUTTIME', 'EDOUTTIME', 'HAS_CHARTEVENTS_DATA'], axis=1)
patients = patients.drop(['ROW_ID', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG'], axis = 1)

In [4]:
import functools
dfs = [admissions, patients]
hospital = functools.reduce(lambda left,right: pd.merge(left,right,on='SUBJECT_ID'), dfs)

In [5]:
hospital = hospital.drop(['SUBJECT_ID', 'HADM_ID', 'DOB'], axis = 1)
hospital.head()

Unnamed: 0,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,ETHNICITY,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,GENDER
0,2196-04-09 12:26:00,2196-04-10 15:54:00,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,WHITE,BENZODIAZEPINE OVERDOSE,0,F
1,2153-09-03 07:15:00,2153-09-08 19:10:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,M
2,2157-10-18 19:34:00,2157-10-25 14:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,WHITE,BRAIN MASS,0,M
3,2139-06-06 16:14:00,2139-06-09 12:48:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,WHITE,INTERIOR MYOCARDIAL INFARCTION,0,M
4,2160-11-02 02:06:00,2160-11-05 14:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,WHITE,ACUTE CORONARY SYNDROME,0,M


In [6]:
# Split data
from sklearn.model_selection import train_test_split

#train_set (80%)
#test_set (20%)
train_set, test_set = train_test_split(hospital, test_size=0.2, random_state=42)


In [7]:
# Separate predictors and labels
hospital = train_set.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
    #create a copy of the data while not affecting train_set
hospital_labels = train_set['HOSPITAL_EXPIRE_FLAG'].copy()
    #clean training set

In [8]:
# Convert dates into day of week
import datetime

admTime = hospital['ADMITTIME']
dischTime = hospital['DISCHTIME']

def get_day_of_week (value):
    date = []
    int_date = []
    day_of_week = []
    
    #replace dashes and colons with spaces to make breaking the string up easier
    value = value.replace('-', ' ')
    value = value.replace(':', ' ')
    
    #split into a string outputting [year, month, day, hour, minutes, seconds]
    date = value.split(' ')
    
    #delete hour/min/sec, convert to integers
    del date[3:6]
    int_date = [int(i) for i in date]
    
    #get day of week and return it
    day_of_week = datetime.date(int_date[0], int_date[1], int_date[2]).weekday()
    
    return day_of_week


admTime_weekday = [get_day_of_week(value) for value in admTime]
dischTime_weekday = [get_day_of_week(value) for value in dischTime]

weekday_data = pd.DataFrame({'admTime_weekday' : admTime_weekday, 'dischTime_weekday' : dischTime_weekday})

# Delete dates
hospital = hospital.drop(['ADMITTIME', 'DISCHTIME'], axis = 1)
hospital = hospital.join(weekday_data)

In [9]:
hospital = hospital.fillna('No_Data')
hospital.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47180 entries, 20267 to 56422
Data columns (total 9 columns):
ADMISSION_TYPE        47180 non-null object
ADMISSION_LOCATION    47180 non-null object
DISCHARGE_LOCATION    47180 non-null object
INSURANCE             47180 non-null object
ETHNICITY             47180 non-null object
DIAGNOSIS             47180 non-null object
GENDER                47180 non-null object
admTime_weekday       47180 non-null object
dischTime_weekday     47180 non-null object
dtypes: object(9)
memory usage: 4.8+ MB


In [10]:
#make a copy of hospital to 1hot encode
hospital_enc = hospital.copy()

In [11]:
#have to separate discharge times because the columns overlap with admission time
hospital_dischTime = hospital_enc['dischTime_weekday'].copy
hospital_enc = hospital_enc.drop('dischTime_weekday', axis = 1)
hospital_enc.head()

Unnamed: 0,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,ETHNICITY,DIAGNOSIS,GENDER,admTime_weekday
20267,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Private,OTHER,CORONARY ARTERY DISEASE\CATH,M,0
11968,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Medicaid,BLACK/AFRICAN AMERICAN,SEIZURE,M,1
13090,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,BLACK/AFRICAN AMERICAN,ASTHMA EXACERBATION,F,0
57540,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Private,WHITE,PULMONARY EMBOLUS,M,No_Data
28262,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,WHITE,CHRONIC RENAL FAILURE,F,1


In [12]:
# Encode categories into 1hot
hospital_columns = hospital.columns.values.tolist()
running_total_columns = []
hospital_enc = hospital.copy()
    #get column names, create a list to use in nested loops, and create a clean copy of hospital to encode

for column in hospital_columns:
    one_hot = pd.get_dummies(hospital_enc[column])
        #convert column to 1hot
    one_hot_columns = one_hot.columns.values.tolist()
        #creates a list of column names for 1hot
    if 'No_Data' in one_hot_columns:
        one_hot = one_hot.drop('No_Data', axis = 1)
        one_hot_columns.remove('No_Data')
            #gets rid of all the No_Data columns entirely
    for i in one_hot_columns:
        if i in running_total_columns:
            one_hot = one_hot.drop(i, axis = 1)
    for i in one_hot_columns:
        running_total_columns.append(i)
            #attaches new 1hot column
    hospital_enc = hospital_enc.drop(column, axis = 1)
    hospital_enc = hospital_enc.join(one_hot)

In [13]:
# use discharge times later

In [14]:
hospital_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47180 entries, 20267 to 56422
Columns: 13123 entries, ELECTIVE to 6.0
dtypes: uint8(13123)
memory usage: 592.1 MB


In [15]:
x = pd.get_dummies(hospital['admTime_weekday'])

In [16]:
# Pull out all the cardiac related diagnoses
cardiac_columns = []
x = ['MYOCARDIAL', 'CORONARY', 'INFARCTION', 'BYPASS', 'GRAFT/SDA', 'DISEASE\CORONARY', 'ARTERY']
string = 'ACUTE MYOCARDIAL INFARCTION'
hospital_enc_columns = hospital_enc.columns.values.tolist()

for column in hospital_enc_columns:
    if any(word in str(column) for word in x):
        if column not in cardiac_columns: 
            cardiac_columns.append(column)          
# Create one massive dataframe of all cardiac diagnoses         
cardiac_all = hospital_enc[cardiac_columns].copy()

# Combine them all down into a one column dataframe
cardiac = cardiac_all.max(axis = 1).to_frame(name = 'CardiacDiagnoses')
cardiac.head()

Unnamed: 0,CardiacDiagnoses
20267,1
11968,0
13090,0
57540,0
28262,0


In [17]:
# Now organize the information about days of the week
day_of_week = hospital_enc[[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]].copy()
day_of_week.columns = ['Monday', 'Tuesday', 'Wednesday', ' Thursday', 'Friday', 'Saturday', 'Sunday']
day_of_week.head()

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
20267,1,0,0,0,0,0,0
11968,0,1,0,0,0,0,0
13090,1,0,0,0,0,0,0
57540,0,0,0,0,0,0,0
28262,0,1,0,0,0,0,0


In [26]:
# Combine dataframes
my_data = cardiac.join(day_of_week)
    
    #get rid of all rows with all zeroes
my_data = my_data[(my_data.T != 0).any()]
    #get rid of all non-cardiac diagnoses
my_data = my_data[my_data.CardiacDiagnoses != 0]
my_data = my_data.drop('CardiacDiagnoses', axis = 1)

my_data.head()

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
20267,1,0,0,0,0,0,0
33502,0,0,0,0,0,0,1
29864,0,0,0,1,0,0,0
29167,0,1,0,0,0,0,0
35370,1,0,0,0,0,0,0


In [39]:
hospital_labels.to_frame()
cardiac_data = my_data.join(hospital_labels)
cardiac_labels = cardiac_data['HOSPITAL_EXPIRE_FLAG'].copy()
cardiac_values = cardiac_data.drop('HOSPITAL_EXPIRE_FLAG', axis = 1)

In [41]:
cardiac_values.head()

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
20267,1,0,0,0,0,0,0
33502,0,0,0,0,0,0,1
29864,0,0,0,1,0,0,0
29167,0,1,0,0,0,0,0
35370,1,0,0,0,0,0,0


In [40]:
cardiac_labels.head()

20267    0
33502    0
29864    0
29167    0
35370    0
Name: HOSPITAL_EXPIRE_FLAG, dtype: int64

In [42]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

lin_reg.fit(cardiac_values, cardiac_labels)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [46]:
print('Mortality intercept: ' , lin_reg.intercept_)
print('Coefficients are: ' , lin_reg.coef_)
pd.DataFrame(list(zip(cardiac_values.columns, lin_reg.coef_)), columns = ['Features', 'Estimated Coefficients'])

Mortality intercept:  0.028636884307
Coefficients are:  [ 0.02085117  0.02503038  0.0262716   0.01632714  0.00701374  0.03098371
  0.02042854]


Unnamed: 0,Features,Estimated Coefficients
0,Monday,0.020851
1,Tuesday,0.02503
2,Wednesday,0.026272
3,Thursday,0.016327
4,Friday,0.007014
5,Saturday,0.030984
6,Sunday,0.020429


In [54]:
#target predictions
print('estimated values: ', lin_reg.predict(cardiac_values)[0:10])
#actual targets
print(cardiac_labels[0:10])

estimated values:  [ 0.04948805  0.04906542  0.04496403  0.05366726  0.04948805  0.02863688
  0.03565062  0.02863688  0.05490849  0.04496403]
20267    0
33502    0
29864    0
29167    0
35370    0
52275    0
14521    0
53126    0
1952     0
30032    1
Name: HOSPITAL_EXPIRE_FLAG, dtype: int64


In [61]:
from sklearn.metrics import mean_squared_error

cardiac_predictions = lin_reg.predict(cardiac_values)
lin_mse = mean_squared_error(cardiac_labels, cardiac_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.2075392293275693

In [62]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(cardiac_labels, cardiac_predictions)
lin_mae

0.086145063419762788

In [63]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(cardiac_values, cardiac_labels)
    #model is trained

cardiac_predictions = tree_reg.predict(cardiac_values)
tree_mse = mean_squared_error(cardiac_labels, cardiac_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
    #evaluate on training set

0.2075392293275693

In [64]:
from sklearn.ensemble import RandomForestRegressor
    #ensemble learning: building a model on top of other models
    #RandomForestRegressor class creates several decision trees on random subsets of
        #the features and then averagin their predictions
forest_reg = RandomForestRegressor()
forest_reg.fit(cardiac_values, cardiac_labels)
cardiac_predictions = forest_reg.predict(cardiac_values)
forest_mse = mean_squared_error(cardiac_labels, cardiac_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.20755058696786466

In [65]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(cardiac_values, cardiac_labels)
cardiac_predictions = svm_reg.predict(cardiac_values)
svm_mse = mean_squared_error(cardiac_labels, cardiac_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

0.21489330100876236

In [67]:
from sklearn.model_selection import GridSearchCV
    #Searches through hyperparameter combinations for the best ones through cross validation
param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6]},
        {'bootstrap': [False], 'n_estimators': [3, 6], 'max_features': [2, 3, 4]},
    ]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(cardiac_values, cardiac_labels)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6]}, {'bootstrap': [False], 'n_estimators': [3, 6], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [68]:
grid_search.best_params_

{'max_features': 2, 'n_estimators': 3}

In [69]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [70]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.207855744639 {'max_features': 2, 'n_estimators': 3}
0.207989323033 {'max_features': 2, 'n_estimators': 10}
0.207875327368 {'max_features': 2, 'n_estimators': 30}
0.207931795885 {'max_features': 4, 'n_estimators': 3}
0.207889786267 {'max_features': 4, 'n_estimators': 10}
0.207907453811 {'max_features': 4, 'n_estimators': 30}
0.207896865839 {'max_features': 6, 'n_estimators': 3}
0.2078649022 {'max_features': 6, 'n_estimators': 10}
0.207938564114 {'max_features': 6, 'n_estimators': 30}
0.207898373017 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.207898373017 {'bootstrap': False, 'max_features': 2, 'n_estimators': 6}
0.207898373017 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.207898373017 {'bootstrap': False, 'max_features': 3, 'n_estimators': 6}
0.207898373017 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
0.207898373017 {'bootstrap': False, 'max_features': 4, 'n_estimators': 6}


In [71]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([ 0.12419621,  0.21926819,  0.27321965,  0.14504293,  0.07934975,
        0.14061636,  0.01830691])

In [73]:
pd.DataFrame(list(zip(cardiac_values.columns, feature_importances)), columns = ['Features', 'Feature Importances'])

Unnamed: 0,Features,Feature Importances
0,Monday,0.124196
1,Tuesday,0.219268
2,Wednesday,0.27322
3,Thursday,0.145043
4,Friday,0.07935
5,Saturday,0.140616
6,Sunday,0.018307


In [74]:
#Takeaways
    #Wednesday is the best day for prediction
    #Sunday is the worst