Notebook to generate prediction model using all possible features
--------------------------------------------------------------------
<br>
193 Item IDs used for prediction

In [1]:
import sklearn 
import numpy as np 
import pandas as pd 
from datetime import datetime

In [2]:
def load_data(path1,path2,path3,path4):
    df_icu_stays = pd.read_csv(path1)
    df_sepsis_occurance = pd.read_csv(path2)
    df_events = pd.read_csv(path3)
    df_chart_events = pd.read_csv(path4)
    return df_icu_stays, df_sepsis_occurance, df_events, df_chart_events

path1 = 'm100s2/ICUSTAYS.csv'
path2 = 'm100s2/sample_ids_2.csv'
path3 = 'm100s2/sample_inputevents_mv_2.csv'
path4 = "m100s2/sample_chartevents_2.csv"

df_icu_stays, df_sepsis_occurance, df_events, df_chart_events = load_data(path1,path2,path3,path4)

  if self.run_code(code, result):


In [None]:
print(df_events.shape, df_sepsis_occurance.shape, df_icu_stays.shape, df_chart_events.shape)

(110089, 31) (100, 4) (61532, 12) (3168992, 15)


In [None]:
def join_data(df_events,df_sepsis_occurance, df_chart_events): 
    df = pd.merge(df_events, df_sepsis_occurance, on='hadm_id', how='inner')
    df1 = pd.merge(df, df_chart_events, on='hadm_id', how='inner')
    return df1

df_joined = join_data(df_events, df_sepsis_occurance, df_chart_events)

In [None]:
all_vals = df_joined['itemid_y'].values
ids = [220052, 22045, 224690, 223761, 220277,  227013]
for i in ids:
    print(len(all_vals[all_vals == i]))

In [6]:
#Convert columns to datetime objects 
df_joined['sepsist0'] = df_joined['sepsist0'].apply(lambda i: datetime.strptime(i,'%Y-%m-%d %H:%M:%S'))
df_joined['starttime'] = df_joined['starttime'].apply(lambda i: datetime.strptime(i,'%Y-%m-%d %H:%M:%S'))

In [7]:
#Create time difference in hours column between sepsis t0 and starttime
df_joined['time_diff'] = (df_joined['sepsist0'] - df_joined['starttime'])
df_joined['time_diff'] = df_joined['time_diff'].apply(lambda i: i.seconds/3600)

In [8]:
print(df_joined.iloc[12])

row_id                                               6866
subject_id_x                                        29983
hadm_id                                            135689
icustay_id_x                                       224079
starttime                             2177-01-06 10:31:00
endtime                               2177-01-06 13:03:00
itemid                                             222168
amount                                            924.574
amountuom                                              mg
rate                                              50.0636
rateuom                                        mcg/kg/min
storetime                             2177-01-06 13:29:00
cgid                                                17525
orderid                                            155962
linkorderid                                       5895253
ordercategoryname                                01-Drips
secondaryordercategoryname       02-Fluids (Crystalloids)
ordercomponent

# Set Window Size

In [9]:
hours = 5
#1 if developed sepsis with 8 hours else 0
df_joined['curr_sepsis'] = df_joined['time_diff'].apply(lambda i: 1 if i <= hours else 0)

In [10]:
#Number of events with sepsis and without sepsis
print(len(df_joined['curr_sepsis'][df_joined['curr_sepsis'] == 1].values))
print(len(df_joined['curr_sepsis'][df_joined['curr_sepsis'] == 0].values))

26420
93667


# Feature Selection

In [13]:
#Here we select our features and drop NaNs
def filter_features(df,features,pred_label):
    all_feats = features+pred_label
    new_df = df[all_feats]
    new_df = new_df.dropna(axis=0)
    new_x = new_df[features]
    new_y = new_df[pred_label]
    return new_x, new_y 

features = ['amount','rate','patientweight','totalamount','originalamount','originalrate']
pred_label = ['curr_sepsis']
filtered_x, filtered_y = filter_features(df_joined,features,pred_label)

<center><h4>Rate Feature</h4></center>

In [11]:
#Creating the sparse rate feature vectors
all_items = df_joined['itemid'].values
all_rates = df_joined['rate'].values
all_y = df_joined['curr_sepsis']
item_ids = list(set(df_joined['itemid'].values)) 
remap_items = range(len(item_ids))
dict_items = dict(zip(item_ids,remap_items))
item_ids = list(set(df_joined['itemid'].values)) 
item_id_feature = np.zeros((df_joined.shape[0],len(item_ids)))
x = []
y = []
for i in range(all_items.shape[0]): 
    val = all_items[i]
    rate = all_rates[i]
    remap_val = dict_items[val]
    z = np.zeros((1,len(item_ids)))[0]
    z[remap_val] = rate
    x.append(z)
    y.append(all_y[i])
rate_feat_x = np.array(x)
y = np.array(y)

#Removing rows with NaNs
nan_index = np.argwhere(np.isnan(rate_feat_x))
nan_rows = []
for i in nan_index: 
    nan_rows.append(i[0])
rate_feat_x = np.delete(rate_feat_x,nan_rows,0)
rate_feat_y = np.delete(y,nan_rows,0)

In [14]:
#Create train/test split
from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(filtered_x, filtered_y, test_size=0.20)

#Train/Test split for rate sparse feature vector
Rxtrain, Rxtest, Rytrain, Rytest = train_test_split(rate_feat_x, rate_feat_y, test_size=0.20)

<center><h4>Building A Random Forest Model</h4></center>

In [12]:
#Create Random Forest Model 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 25, max_depth= None,random_state= 11 )
rf.fit(xtrain, ytrain)
prediction = rf.predict(xtest)



In [13]:
#Mean Squared Error of Random Forest
from sklearn.metrics import mean_squared_error
print(mean_squared_error(ytest,prediction))

0.177361608372


In [14]:
#Accuracy of Random Forest
from sklearn.metrics import accuracy_score 
print(accuracy_score(ytest,prediction))

0.822638391628


In [15]:
from sklearn.externals import joblib
# now you can save it to a file
joblib.dump(rf, 'decision_tree.pkl') 
# and later you can load it
rf = joblib.load('decision_tree.pkl')

<h4>Rate Sparse Feature:</h4>

In [124]:
rf = RandomForestClassifier(n_estimators= 25, max_depth= None,random_state= 11 )
rf.fit(Rxtrain, Rytrain)
Rprediction = rf.predict(Rxtest)
print(accuracy_score(Rytest,Rprediction))

0.767419443679


<center><h4>Building A Logistic Regression Model</h4></center>

In [17]:
from sklearn.linear_model import LogisticRegression 
lr = LogisticRegression(C=10) 
lr.fit(xtrain, ytrain)
predictions = lr.predict(xtest)

  y = column_or_1d(y, warn=True)


In [18]:
print(mean_squared_error(ytest,predictions))
print(accuracy_score(ytest,predictions))

0.216813549986
0.783186450014


In [19]:
from sklearn.externals import joblib
# now you can save it to a file
joblib.dump(rf, 'logistic_regression.pkl') 
# and later you can load it
lr = joblib.load('logistic_regression.pkl')