# Import Data and Clean/Impute

In [67]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
pd.set_option('display.max_columns', 500)

Read in the record files for the training (set-a), test (set-c), and validation (set-b) sets.

In [68]:
# import glob
# import errno
# path = 'ICU_Mini/*.txt'
# mini_files = glob.glob(path)

In [69]:
# Paths of every .txt record file under data/set-a.
# NOTE(review): errno is imported here but never used in the visible notebook.
import glob
import errno
path = 'data/set-a/*.txt'
files = glob.glob(path)

In [70]:
# Paths of every .txt record file under data/set-c, kept as the test split.
path = 'data/set-c/*.txt'
test_files = glob.glob(path)

In [71]:
# Paths of every .txt record file under data/set-b, kept as the validation split.
path = 'data/set-b/*.txt'
val_files = glob.glob(path)

In [72]:
# Build the combined list as a NEW list. The previous form (all_files = files
# followed by extend) only aliased `files`, so `files` silently grew to hold
# all three sets and every re-run of this cell appended the paths again.
all_files = files + test_files + val_files

In [73]:
# Read in a mountain of text data!
def read_mountain_data(files):
    counts = pd.DataFrame()
    vital_means = pd.DataFrame()
    vital_std = pd.DataFrame()
    with tqdm(total=100) as pbar:
        for file in sorted(files):
            with open(file) as f:
                content = f.readlines()
            contents = [i.rstrip().split(',') for i in content]
            data = pd.DataFrame(contents[1:])
            data[2] = data[2].astype(float)
            counts = counts.append(data.drop(columns = 2).groupby(1).count().transpose(), sort=True)
            vital_means = vital_means.append(data.groupby(1).mean().transpose(), sort=True)
            pbar.update(.025)
        return [vital_means, counts]

In [74]:
# Read in standard deviations, and clean data in process
def read_standard_deviations(files):
    """Per-file standard deviation of every parameter, ignoring implausible readings.

    Each file is read as comma-separated text. The first line is skipped as a
    header; on every other line field 1 names the parameter and field 2 holds
    its numeric value. Readings of the parameters listed in ``minimum_valid``
    that fall strictly below the listed floor are discarded before the
    standard deviation is computed.

    Parameters
    ----------
    files : iterable of str
        Paths of the record files; processed in sorted order.

    Returns
    -------
    pandas.DataFrame
        One row per file and one column per parameter name (columns sorted),
        holding the sample standard deviation of that parameter within the
        file. Parameters with a single reading, or absent from the file, are
        NaN. An empty DataFrame is returned when ``files`` is empty.
    """
    # Lowest value accepted for each screened parameter.
    minimum_valid = {
        'NIDiasABP': 10.,
        'NISysABP': 10.,
        'NIMAP': 10.,
        'Height': 90.,
        'Weight': 30.,
        'Gender': 0.,
    }
    std_rows = []
    # Iterating through tqdm sizes the bar from the real number of files; the
    # previous fixed total=100 with update(.025) was only right for 4000 files.
    for file in tqdm(sorted(files)):
        with open(file) as f:
            contents = [line.rstrip().split(',') for line in f]
        # contents[0] is the header line.
        filtered = [x for x in contents[1:]
                    if not (x[1] in minimum_valid and float(x[2]) < minimum_valid[x[1]])]
        data = pd.DataFrame(filtered)
        data[2] = data[2].astype(float)
        # Select the value column explicitly: column 0 is text, and current
        # pandas raises on std() instead of silently dropping such columns.
        std_rows.append(data.groupby(1)[[2]].std().transpose())
    if not std_rows:
        return pd.DataFrame()
    # A single concat replaces DataFrame.append inside the loop, which copied
    # the accumulated frame on every file and no longer exists in pandas 2.x.
    return pd.concat(std_rows, sort=True)

In [75]:
# Summarise every file in all_files; read_mountain_data returns [means, counts].
data = read_mountain_data(all_files)
# NOTE(review): despite its name, train_data is built from all_files (sets a, c and b),
# not from the training files alone.
train_data = data[0]
counts = data[1]

300.000000000001it [04:01,  1.24it/s]                           


In [76]:
# Persist the per-file means and counts. The with-blocks close each file even
# when pickle.dump raises, which the bare open()/close() pairs did not.
with open("all_feature_means.pickle","wb") as pickle_out:
    pickle.dump(train_data, pickle_out)
with open("all_feature_counts.pickle","wb") as pickle_out:
    pickle.dump(counts, pickle_out)

More cleaning of impossible values: a blood pressure of (near) zero — all patients were alive during the 48 hours, so it must simply mean a machine was disconnected; the code treats any reading below 10 as invalid.
Also, all patients were adults (at least 15 years old), so impossible heights and weights (under 90 cm and under 30 kg) are removed.

In [77]:
# Blank out implausible values so they are treated as missing instead of as
# real measurements. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0 (the values written are identical).
train_data.loc[(train_data.Height < 90),'Height'] = np.nan
train_data.loc[(train_data.Gender < 0),'Gender'] = np.nan
train_data.loc[(train_data.Weight < 30),'Weight'] = np.nan

In [78]:
# Drop MechVent: the author found it identical for every patient, so it adds no information.
# NOTE(review): train_data is reassigned here, so re-running this cell raises KeyError
# once the column is gone.
train_data = train_data.drop(columns = 'MechVent')

In [79]:
#Merging Invasive/Noninvasive BP's by keeping invasive if available, replacing with NI only when necessary
# Invasive blood-pressure columns and, position by position, their
# non-invasive counterparts.
bp = ['DiasABP','SysABP','MAP']
NIbp = ['NIDiasABP','NISysABP','NIMAP']

def merge_BP(df):
    """Fold the non-invasive blood-pressure columns into the invasive ones, in place.

    For each pair in ``bp`` / ``NIbp`` the invasive column keeps its own values
    and takes the non-invasive value only where it is missing; the
    non-invasive column is then deleted from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``bp`` and ``NIbp``. Modified in place.

    Returns
    -------
    None
    """
    for invasive, non_invasive in zip(bp, NIbp):
        # Assign the filled column back rather than calling
        # df[col].fillna(..., inplace=True): that form mutates a temporary
        # under pandas Copy-on-Write and would leave df unchanged.
        df[invasive] = df[invasive].fillna(df[non_invasive])
        del df[non_invasive]

In [80]:
# Merge the blood-pressure columns of train_data in place; the NI* columns are removed.
merge_BP(train_data)

In [81]:
#Removing impossibly low BP values while at it
# Any merged blood-pressure mean below 10 is treated as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
train_data.loc[(train_data.DiasABP < 10),'DiasABP'] = np.nan
train_data.loc[(train_data.SysABP < 10),'SysABP'] = np.nan
train_data.loc[(train_data.MAP < 10),'MAP'] = np.nan

In [82]:
#Read in outcomes and pretty them up for use
def find_outcomes(text):
    """Load an outcomes file into a two-column integer DataFrame.

    The file at path ``text`` is read as comma-separated text and its first
    line is skipped as a header. From every remaining line, field 0 becomes
    ``RecordID`` and field 5 becomes ``Outcome``; both are cast to int.

    Returns the resulting DataFrame (one row per data line).
    """
    with open(text) as handle:
        lines = handle.readlines()
    # Split every data line first, then pick the two fields that are kept.
    split_lines = [line.rstrip().split(',') for line in lines[1:]]
    records = []
    for fields in split_lines:
        records.append([fields[0], fields[5]])
    frame = pd.DataFrame(records).rename(columns = {0:'RecordID',1:'Outcome'})
    return frame.astype(int)

In [83]:
# RecordID/Outcome table read from the set-a outcomes file.
train_outcomes = find_outcomes('data/Outcomes-a.txt')

In [84]:
#Reducing Features down
# Columns carried forward into imputation and modelling. RecordID is kept
# because the outcomes are later merged on it.
categories = ['Age','BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3',
       'HCT', 'HR', 'ICUType', 'K', 'MAP', 'Mg',
       'Na', 'PaCO2', 'PaO2', 'Platelets','Gender',
       'RecordID', 'SysABP', 'Temp', 'Urine', 'WBC', 'Weight', 'pH']

In [85]:
# Take an explicit copy so the dtype change below is applied to for_impute
# itself; assigning onto the bare column selection of train_data is what
# triggered the SettingWithCopyWarning.
for_impute = train_data[categories].copy()
for_impute['RecordID'] = for_impute['RecordID'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [86]:
# Save the pre-imputation table; the with-block closes the file even if dump raises.
with open("for_impute","wb") as pickle_out:
    pickle.dump(for_impute, pickle_out)

In [87]:
# Baseline table: fill every gap with its column mean, then attach the outcomes by RecordID.
# pd.merge defaults to an inner join, so only records that have an outcome row are kept.
regtrain = pd.merge(for_impute.fillna(for_impute.mean()),train_outcomes, on='RecordID')

In [88]:
# Save the mean-imputed baseline table; the with-block closes the file even if dump raises.
with open("regtrain.pickle","wb") as pickle_out:
    pickle.dump(regtrain, pickle_out)

### Use MICE to impute missing data

In [89]:
# Reload the pre-imputation table. The original left the file handle open;
# the with-block closes it as soon as the object is loaded.
with open("for_impute","rb") as pickle_in:
    for_impute = pickle.load(pickle_in)

In [90]:
# Fit the iterative imputer on the feature columns and fill the missing values.
imp = IterativeImputer(max_iter=10, random_state=0)
# RecordID is removed before fitting, so it is not used as a feature.
# NOTE(review): for_impute is overwritten here, so re-running this cell on its own raises
# KeyError because RecordID has already been dropped.
for_impute = for_impute.drop(columns=['RecordID'])
imputed = imp.fit_transform(for_impute)
# Wrap the result back into a DataFrame carrying the original column names.
imputed = pd.DataFrame(imputed, columns=for_impute.columns)

In [91]:
# Save the fitted imputer (predict() reloads imputer.p). The original passed an
# anonymous open() that was never closed, so the write was only flushed
# whenever the handle happened to be garbage-collected.
with open("imputer.p","wb") as imputer_file:
    pickle.dump(imp, imputer_file)

In [92]:
# Show the imputed feature table (RecordID was dropped before imputation).
imputed

Unnamed: 0,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Glucose,HCO3,HCT,HR,ICUType,K,MAP,Mg,Na,PaCO2,PaO2,Platelets,Gender,SysABP,Temp,Urine,WBC,Weight,pH
0,54.0,10.500000,0.750000,50.147059,0.550904,14.923077,160.000000,27.000000,32.500000,70.810811,4.0,4.200000,71.559118,1.700000,136.500000,45.680866,144.352932,203.000000,0.0,114.382353,37.357143,171.052632,10.300000,83.175702,7.445505
1,76.0,18.333333,1.100000,58.897059,0.560000,13.333333,125.500000,22.333333,28.655556,80.794118,2.0,3.900000,76.940299,2.300000,137.000000,38.857143,210.142857,178.600000,1.0,113.411765,36.939130,151.560976,11.266667,80.670588,7.395000
2,44.0,4.666667,0.333333,67.125000,0.500000,5.923077,134.333333,25.000000,28.460000,83.759259,3.0,4.260000,90.437500,1.720000,138.333333,35.500000,134.500000,89.666667,0.0,125.687500,37.800000,124.951220,4.700000,56.700000,7.495000
3,68.0,17.666667,0.766667,65.051724,0.599456,14.944444,117.333333,27.666667,37.442857,70.983333,3.0,4.000000,83.885517,2.033333,139.333333,43.653200,136.788088,330.000000,1.0,121.551724,36.223077,545.833333,9.400000,84.600000,7.538804
4,88.0,35.000000,1.000000,45.720930,0.554402,15.000000,102.500000,19.000000,29.550000,74.958333,3.0,4.320000,74.946512,1.550000,139.500000,34.773717,146.533925,103.000000,0.0,133.395349,36.880000,62.131579,4.300000,66.577589,7.451807
5,64.0,16.750000,0.975000,73.622222,0.466667,8.666667,204.666667,19.750000,37.225000,88.531915,1.0,4.150000,88.688889,2.000000,137.750000,35.142857,110.000000,210.750000,1.0,115.688889,37.577778,136.333333,16.100000,114.000000,7.405714
6,68.0,32.500000,3.600000,79.000000,0.510290,15.000000,105.000000,24.666667,31.600000,68.338983,3.0,3.775000,109.301887,1.900000,139.000000,40.117290,128.618458,329.666667,0.0,166.500000,36.630769,62.970588,6.366667,87.000000,7.478648
7,78.0,64.600000,0.680000,39.266667,0.536364,11.846154,126.200000,13.600000,33.233333,70.945205,3.0,4.380000,64.766667,2.633333,139.600000,30.533333,130.400000,96.333333,0.0,125.550000,37.005556,43.810811,20.000000,48.400000,7.274000
8,64.0,22.000000,0.700000,64.478261,0.615678,15.000000,112.500000,23.000000,28.300000,127.239130,3.0,4.200000,84.477391,1.650000,139.000000,40.550552,100.544280,696.000000,0.0,124.478261,36.900000,240.000000,15.200000,60.700000,7.642809
9,74.0,19.333333,1.133333,58.410714,0.633333,14.083333,110.000000,24.666667,29.100000,85.189655,2.0,4.350000,79.517857,1.800000,140.000000,44.875000,219.000000,145.666667,1.0,124.892857,36.683333,108.085106,10.166667,68.582759,7.350000


In [93]:
# Save the imputed feature table; the with-block closes the file even if dump raises.
with open("imputed.pickle","wb") as pickle_out:
    pickle.dump(imputed, pickle_out)

In [94]:
# Attach the outcome labels to the imputed features column-wise; rows are paired by index label.
# NOTE(review): imputed is derived from all_files (sets a, c and b) while train_outcomes comes
# from Outcomes-a.txt only, so rows without a matching outcome presumably get a NaN Outcome,
# and the pairing relies on both tables being in the same order — confirm this is intended.
all_train = pd.concat([imputed,train_outcomes['Outcome']], axis=1)

In [95]:
# Save the features-plus-outcome table; the with-block closes the file even if dump raises.
with open("all_train.pickle","wb") as pickle_out:
    pickle.dump(all_train, pickle_out)

In [96]:
# Reload the features-plus-outcome table. The original left the file handle
# open; the with-block closes it as soon as the object is loaded.
with open("all_train.pickle","rb") as pickle_in:
    all_train = pickle.load(pickle_in)

In [97]:
# List the columns of the reloaded table.
all_train.columns

Index(['Age', 'BUN', 'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3',
       'HCT', 'HR', 'ICUType', 'K', 'MAP', 'Mg', 'Na', 'PaCO2', 'PaO2',
       'Platelets', 'Gender', 'SysABP', 'Temp', 'Urine', 'WBC', 'Weight', 'pH',
       'Outcome'],
      dtype='object')

In [100]:
# Build a one-row sample input: the feature values of row 1 of all_train, without the Outcome.
x = pd.DataFrame(all_train.loc[1,:]).transpose().drop(columns=["Outcome"])
# Relabel the row as 0 so it lines up with the one-row frames created below.
x.index=[0]
# Hard-coded RecordID (2) and Timestamp (1400) for the sample.
y = pd.DataFrame(pd.Series(2), columns=["RecordID"])
z = pd.DataFrame(pd.Series(1400), columns=["Timestamp"])
# Final column order: Timestamp, RecordID, then the features.
x = pd.concat([z,y,x], axis=1)

In [101]:
# Show the assembled one-row sample.
x

Unnamed: 0,Timestamp,RecordID,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Glucose,HCO3,HCT,HR,ICUType,K,MAP,Mg,Na,PaCO2,PaO2,Platelets,Gender,SysABP,Temp,Urine,WBC,Weight,pH
0,1400,2,76.0,18.333333,1.1,58.897059,0.56,13.333333,125.5,22.333333,28.655556,80.794118,2.0,3.9,76.940299,2.3,137.0,38.857143,210.142857,178.6,1.0,113.411765,36.93913,151.560976,11.266667,80.670588,7.395


In [102]:
# Write the sample row to data.csv without an index column; predict() reads this file.
x.to_csv("data.csv",index=None)

In [104]:
# Read the sample back from disk to check the round trip.
x = pd.read_csv("data.csv")

In [105]:
# Column order as read back from data.csv.
x.columns

Index(['Timestamp', 'RecordID', 'Age', 'BUN', 'Creatinine', 'DiasABP', 'FiO2',
       'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'ICUType', 'K', 'MAP', 'Mg',
       'Na', 'PaCO2', 'PaO2', 'Platelets', 'Gender', 'SysABP', 'Temp', 'Urine',
       'WBC', 'Weight', 'pH'],
      dtype='object')

In [106]:
# Show the row as read back from data.csv.
x

Unnamed: 0,Timestamp,RecordID,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Glucose,HCO3,HCT,HR,ICUType,K,MAP,Mg,Na,PaCO2,PaO2,Platelets,Gender,SysABP,Temp,Urine,WBC,Weight,pH
0,1400,2,76.0,18.333333,1.1,58.897059,0.56,13.333333,125.5,22.333333,28.655556,80.794118,2.0,3.9,76.940299,2.3,137.0,38.857143,210.142857,178.6,1.0,113.411765,36.93913,151.560976,11.266667,80.670588,7.395


In [107]:
# Check whether Glucose is missing in the first row of the sample.
x['Glucose'].isnull()[0]

False

In [110]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import pickle as p

def scale(X_train, X_test, X_val = np.empty([0,])):
    """Apply the pickled scaler stored in ``scaler.p`` to two or three feature tables.

    Parameters
    ----------
    X_train, X_test : objects exposing ``.values`` (DataFrames in this notebook)
        Tables to transform.
    X_val : optional
        A third table of the same kind. The default empty array acts as a
        "not supplied" marker: it is only transformed when its ``size`` is non-zero.

    Returns
    -------
    tuple
        ``(X_tr, X_te)``, or ``(X_tr, X_te, X_va)`` when ``X_val`` was supplied;
        each element is the output of the scaler's ``transform``.
    """
    # The module is referenced as `pickle`, not through the alias `p`: a later
    # cell rebinds `p` to a probability, after which `p.load` fails.
    # NOTE: unpickling can execute arbitrary code; only load a scaler.p you trust.
    with open("scaler.p","rb") as scaler_file:
        std = pickle.load(scaler_file)
    X_tr = std.transform(X_train.values)
    X_te = std.transform(X_test.values)

    # A third table was supplied: transform and return it as well.
    if X_val.size != 0 :
        X_va = std.transform(X_val.values)
        return X_tr, X_te, X_va
    return X_tr, X_te

def predict():
    """Score the sample stored in ``data.csv`` and describe the result.

    Reads ``data.csv``, notes which of the vitals listed in ``mif.p`` are
    missing in its first row, blanks out implausible values, imputes with the
    pickled imputer (``imputer.p``), scales with ``scale`` and scores with the
    pickled model (``XGB.pickle.dat``).

    Returns
    -------
    list
        ``[outcome, probability, missing_iv, time_p]`` for the first row:
        whether the probability in column 1 of ``predict_proba`` reaches 0.311,
        that probability, the list of missing vitals, and a message string.

    NOTE: the pickled files are loaded with ``pickle.load``, which can execute
    arbitrary code; only use files from a trusted source.
    """
    data = pd.read_csv("data.csv")
    timestamp = data['Timestamp'][0]
    # The module is referenced as `pickle`, not through the alias `p`: a later
    # cell rebinds `p` to a probability, after which `p.load` fails.
    with open("mif.p","rb") as mif_file:
        mif = pickle.load(mif_file)
    # Missing vitals are recorded before any cleaning or imputation.
    missing_iv = [vital for vital in mif if data[vital].isnull()[0]]
    data = data.drop(columns=['Timestamp', 'RecordID'])
    # Implausible values are treated as missing (np.nan: the np.NaN alias was
    # removed in NumPy 2.0).
    data.loc[(data.Gender < 0),'Gender'] = np.nan
    data.loc[(data.Weight < 30),'Weight'] = np.nan
    data.loc[(data.DiasABP < 10),'DiasABP'] = np.nan
    data.loc[(data.SysABP < 10),'SysABP'] = np.nan
    data.loc[(data.MAP < 10),'MAP'] = np.nan
    with open("imputer.p","rb") as imputer_file:
        imp = pickle.load(imputer_file)
    data_t = imp.transform(data)
    data_t = pd.DataFrame(data_t, columns=data.columns)
    # scale() needs two tables; the same one is passed twice and one copy kept.
    data_t, _ = scale(data_t, data_t)
    with open("XGB.pickle.dat","rb") as model_file:
        model = pickle.load(model_file)
    # Score once and reuse the result for both the flag and the probability.
    prob = model.predict_proba(data_t)
    outcome = (prob[:, 1] >= .311)
    # Test the first row explicitly: `if outcome==True` on an array only works
    # while the array has exactly one element.
    if outcome[0]:
        time_p = "The patient is still in a critical stage. Right now, the most accurate prediction in terms of survival cannot be made."
    else:
        time_p = "The patient can now safely survive according to the given vital. This is after "+str(timestamp)+" hrs post admission"
    return [outcome[0],prob[0, 1], missing_iv, time_p]

In [111]:
# Unpack predict()'s four return values: flag, probability, missing vitals, message.
# NOTE(review): binding the probability to `p` overwrites the `import pickle as p` alias,
# so any code that still calls `p.load(...)` after this cell fails until the import is re-run.
o,p,y,w = predict()

In [112]:
# Message string returned by predict().
w

'The patient can now safely survive according to the given vital. This is after 1400 hrs post admission'

In [114]:
# Probability returned by predict(); at this point `p` no longer refers to the pickle module.
p

0.008871699

In [None]:
def predict(recordID=1):
    """Score one record of ``master.csv`` and describe the result.

    The rows of ``master.csv`` whose ``RecordID`` equals ``recordID`` are
    averaged into a single row, which is then cleaned, imputed
    (``imputer.p``), scaled (``scale``) and scored (``XGB.pickle.dat``).

    NOTE(review): this notebook defines predict() twice; once this cell runs,
    this definition replaces the earlier one that reads data.csv.

    Parameters
    ----------
    recordID : int, optional
        Record to score. Defaults to 1, the value the original hard-coded.

    Returns
    -------
    list
        ``[outcome, probability, missing_iv, time_p]``: whether the probability
        in column 1 of ``predict_proba`` reaches 0.311, that probability, the
        vitals from ``mif.p`` that are missing in the averaged row, and a
        message string.

    Raises
    ------
    ValueError
        If ``master.csv`` has no row for ``recordID``.

    NOTE: the pickled files are loaded with ``pickle.load``, which can execute
    arbitrary code; only use files from a trusted source.
    """
    master = pd.read_csv("master.csv")
    # The original wrote data.loc['RecordID'==1, :], which compares a string
    # with an int (always False) instead of filtering on the column.
    record_rows = master.loc[master['RecordID'] == recordID, :]
    if record_rows.empty:
        raise ValueError("master.csv has no rows for RecordID " + str(recordID))
    # NOTE(review): the original took the Timestamp of the file's first row;
    # the first row of the selected record is used here — confirm whether the
    # latest timestamp is wanted instead.
    timestamp = record_rows['Timestamp'].iloc[0]
    # Collapse the record's rows into one row of column means (the original
    # referenced .mean without calling it and never used the result).
    data = record_rows.mean(numeric_only=True).to_frame().transpose()
    # The module is referenced as `pickle`, not through the alias `p`: an
    # earlier cell rebinds `p` to a probability, after which `p.load` fails.
    with open("mif.p","rb") as mif_file:
        mif = pickle.load(mif_file)
    # Missing vitals are recorded before any cleaning or imputation.
    missing_iv = [vital for vital in mif if data[vital].isnull().iloc[0]]
    data = data.drop(columns=['Timestamp', 'RecordID'])
    # Implausible values are treated as missing (np.nan: the np.NaN alias was
    # removed in NumPy 2.0).
    data.loc[(data.Gender < 0),'Gender'] = np.nan
    data.loc[(data.Weight < 30),'Weight'] = np.nan
    data.loc[(data.DiasABP < 10),'DiasABP'] = np.nan
    data.loc[(data.SysABP < 10),'SysABP'] = np.nan
    data.loc[(data.MAP < 10),'MAP'] = np.nan
    with open("imputer.p","rb") as imputer_file:
        imp = pickle.load(imputer_file)
    data_t = imp.transform(data)
    data_t = pd.DataFrame(data_t, columns=data.columns)
    # scale() needs two tables; the same one is passed twice and one copy kept.
    data_t, _ = scale(data_t, data_t)
    with open("XGB.pickle.dat","rb") as model_file:
        model = pickle.load(model_file)
    # Score once and reuse the result for both the flag and the probability.
    prob = model.predict_proba(data_t)
    outcome = (prob[:, 1] >= .311)
    if outcome[0]:
        time_p = "The patient is still in a critical stage. Right now, the most accurate prediction in terms of survival cannot be made."
    else:
        time_p = "The patient can now safely survive according to the given vital. This is after "+str(timestamp)+" hrs post admission"
    return [outcome[0],prob[0, 1], missing_iv, time_p]

In [135]:
# Select the rows of master.csv that belong to one record.
recordID = 1
master = pd.read_csv("master.csv")
data = master[master['RecordID']==recordID]

In [60]:
# Collapse data into a single row holding each column's mean.
# NOTE(review): data is overwritten, so running this cell twice averages an already averaged row.
data = pd.DataFrame(data.mean()).transpose()

In [61]:
# Timestamp column of data as a plain Python list.
ts = data['Timestamp'].tolist()

In [62]:
# Sort the timestamps in ascending order.
ts = sorted(ts)

In [63]:
# Largest timestamp (the last element after sorting).
timestamp = ts[-1]

In [136]:
# Show the current contents of data.
data

Unnamed: 0,Timestamp,RecordID,Age,BUN,Creatinine,DiasABP,FiO2,GCS,Glucose,HCO3,HCT,HR,ICUType,K,MAP,Mg,Na,PaCO2,PaO2,Platelets,Gender,SysABP,Temp,Urine,WBC,Weight,pH
0,1200,1,44,4.666667,0.333333,67.125,0.5,5.923077,,25,28.46,83.759259,3,4.26,90.4375,,138.333333,35.5,134.5,89.666667,0,125.6875,37.8,124.951219,,56.7,7.495
1,1200,1,44,4.666667,0.333333,67.125,0.5,5.923077,,25,28.46,83.759259,3,4.26,90.4375,,138.333333,35.5,134.5,89.666667,0,125.6875,37.8,124.951219,,56.7,7.495


In [137]:
# Render data as an HTML table with one table row per column of data.
# table_id sets the id attribute directly; the original smuggled it in by
# embedding quote characters inside `classes`, which only works as long as
# pandas writes that string into the tag unescaped.
data.transpose().to_html(classes='table table-striped', table_id='a_nice_table', border=0, header=False, bold_rows=True)

'<table border="0" class="dataframe table table-striped" id = "a_nice_table">\n  <tbody>\n    <tr>\n      <th>Timestamp</th>\n      <td>1200.000000</td>\n      <td>1200.000000</td>\n    </tr>\n    <tr>\n      <th>RecordID</th>\n      <td>1.000000</td>\n      <td>1.000000</td>\n    </tr>\n    <tr>\n      <th>Age</th>\n      <td>44.000000</td>\n      <td>44.000000</td>\n    </tr>\n    <tr>\n      <th>BUN</th>\n      <td>4.666667</td>\n      <td>4.666667</td>\n    </tr>\n    <tr>\n      <th>Creatinine</th>\n      <td>0.333333</td>\n      <td>0.333333</td>\n    </tr>\n    <tr>\n      <th>DiasABP</th>\n      <td>67.125000</td>\n      <td>67.125000</td>\n    </tr>\n    <tr>\n      <th>FiO2</th>\n      <td>0.500000</td>\n      <td>0.500000</td>\n    </tr>\n    <tr>\n      <th>GCS</th>\n      <td>5.923077</td>\n      <td>5.923077</td>\n    </tr>\n    <tr>\n      <th>Glucose</th>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>HCO3</th>\n      <td>25.000000</td>\n      <

In [134]:
# NOTE(review): the recorded output of this cell is an HTML table string, so x was presumably
# rebound to the to_html() result in a cell that is no longer in the notebook; with the visible
# definitions x is the one-row DataFrame read from data.csv.
print(x)

<table border="0" class="dataframe table table-striped" id = "a_nice_table">
  <tbody>
    <tr>
      <th>Timestamp</th>
      <td>1200.000000</td>
      <td>1200.000000</td>
    </tr>
    <tr>
      <th>RecordID</th>
      <td>1.000000</td>
      <td>1.000000</td>
    </tr>
    <tr>
      <th>Age</th>
      <td>44.000000</td>
      <td>44.000000</td>
    </tr>
    <tr>
      <th>BUN</th>
      <td>4.666667</td>
      <td>4.666667</td>
    </tr>
    <tr>
      <th>Creatinine</th>
      <td>0.333333</td>
      <td>0.333333</td>
    </tr>
    <tr>
      <th>DiasABP</th>
      <td>67.125000</td>
      <td>67.125000</td>
    </tr>
    <tr>
      <th>FiO2</th>
      <td>0.500000</td>
      <td>0.500000</td>
    </tr>
    <tr>
      <th>GCS</th>
      <td>5.923077</td>
      <td>5.923077</td>
    </tr>
    <tr>
      <th>Glucose</th>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>HCO3</th>
      <td>25.000000</td>
      <td>25.000000</td>
    </tr>
    <tr>
      <th>HCT<