In [None]:
# This notebook reads in the discretised input data and then preprocesses the model features
# Firstly, values deemed excessively high/low are capped
# Relevant binary features and normally/log-normally features are standardised accordingly
# Training and test sets are split - 70% train, 10% validation, 20% test
# Resulting datasets are saved to file.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame

In [2]:
disc_inp_data = pd.read_csv("../data/discretised_input_data.csv")

In [3]:
# add rewards - sparsely for now; reward function shaping comes in a separate script
disc_inp_data['reward'] = 0
for i in disc_inp_data.index:
    if i == 0:
        continue
    else:
        if disc_inp_data.ix[i, 'icustayid'] != disc_inp_data.ix[i-1, 'icustayid']:
            if disc_inp_data.ix[i-1, 'died_in_hosp'] == 1:
                disc_inp_data.ix[i-1,'reward'] = -100
            elif disc_inp_data.ix[i-1, 'died_in_hosp'] == 0:
                disc_inp_data.ix[i-1,'reward'] = 100
            else:
                print "error in row", i-1
if disc_inp_data.ix[len(disc_inp_data)-1, 'died_in_hosp'] == 1:
    disc_inp_data.ix[len(disc_inp_data)-1, 'reward'] = -100
elif disc_inp_data.ix[len(disc_inp_data)-1, 'died_in_hosp'] == 0:
     disc_inp_data.ix[len(disc_inp_data)-1, 'reward'] = 100
print disc_inp_data['reward'].value_counts()

 0      224552
 100     15583
-100      2315
Name: reward, dtype: int64


In [4]:
# now split into train/validation/test sets
import random
unique_ids = disc_inp_data['icustayid'].unique()
random.shuffle(unique_ids)
train_sample = 0.7
val_sample = 0.1
test_sample = 0.2
train_num = int(len(unique_ids) * 0.7)
val_num = int(len(unique_ids)*0.1) + train_num
train_ids = unique_ids[:train_num]
val_ids = unique_ids[train_num:val_num]
test_ids = unique_ids[val_num:]

In [5]:
train_set = DataFrame()
train_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(train_ids)]

val_set = DataFrame()
val_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(val_ids)]

test_set = DataFrame()
test_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(test_ids)]

In [6]:
# cap values in train and test
caps = pd.read_csv("capping_values.csv")
for i in caps.index:
    param = caps.ix[i,'parameter'][1:-1]
    maxval = caps.ix[i,'limsup']
    minval = caps.ix[i,'liminf']
    train_set[param][train_set[param] >= maxval] = maxval
    train_set[param][train_set[param] <= minval] = minval
    val_set[param][val_set[param] >= maxval] = maxval
    val_set[param][val_set[param] <= minval] = minval
    test_set[param][test_set[param] >= maxval] = maxval
    test_set[param][test_set[param] <= minval] = minval

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.h

In [7]:
binary_fields = ['gender','mechvent','re_admission']
norm_fields= ['age','Weight_kg','GCS','HR','SysBP','MeanBP','DiaBP','RR','Temp_C','FiO2_1',
    'Potassium','Sodium','Chloride','Glucose','Magnesium','Calcium',
    'Hb','WBC_count','Platelets_count','PTT','PT','Arterial_pH','paO2','paCO2',
    'Arterial_BE','HCO3','Arterial_lactate','SOFA','SIRS','Shock_Index',
    'PaO2_FiO2','cumulated_balance_tev', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca']
log_fields = ['max_dose_vaso','SpO2','BUN','Creatinine','SGOT','SGPT','Total_bili','INR',
              'input_total_tev','input_4hourly_tev','output_total','output_4hourly']

In [8]:
# normalise binary fields
train_set[binary_fields] = train_set[binary_fields] - 0.5 
val_set[binary_fields] = val_set[binary_fields] - 0.5 
test_set[binary_fields] = test_set[binary_fields] - 0.5 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [9]:
# normal distn fields
for item in norm_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
    test_set[item] = (test_set[item] - av) / std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [2]:
# log normal fields
train_set[log_fields] = np.log(0.1 + train_set[log_fields])
val_set[log_fields] = np.log(0.1 + val_set[log_fields])
test_set[log_fields] = np.log(0.1 + test_set[log_fields])
for item in log_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
    test_set[item] = (test_set[item] - av) / std

In [11]:
train_set.head()

Unnamed: 0.1,Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,...,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input,reward
0,0,1,3,7245052800,-0.5,-0.972456,-1.807766,-0.5,0,1,...,0.0,-0.379552,0.133952,1.254772,-0.380319,0.423948,0.09069,0.0,4.0,0
1,1,2,3,7245067200,-0.5,-0.972456,-1.807766,-0.5,0,1,...,0.0,-0.379552,0.319865,1.240192,-0.040929,0.646407,0.317665,0.0,4.0,0
2,2,3,3,7245081600,-0.5,-0.972456,-1.807766,-0.5,0,1,...,0.0,-0.379552,0.327648,0.368842,0.324865,1.034249,0.095431,0.0,2.0,0
3,3,4,3,7245096000,-0.5,-0.972456,-1.807766,-0.5,0,1,...,0.0,-0.379552,0.33203,0.203508,0.397886,0.735431,0.012464,0.0,2.0,0
4,4,5,3,7245110400,-0.5,-0.972456,-1.807766,-0.5,0,1,...,0.0,-0.379552,0.336366,0.203508,0.439922,0.620478,-0.042354,0.0,2.0,0


In [12]:
train_set.to_csv('../data/rl_train_set_unscaled.csv',index = False)
val_set.to_csv('./data/rl_val_set_unscaled.csv', index = False)
test_set.to_csv('./data/rl_test_set_unscaled.csv', index = False)

In [None]:
# scale features to [0,1] in train set, similar in val and test
import copy
scalable_fields = copy.deepcopy(binary_fields)
scalable_fields.extend(norm_fields)
scalable_fields.extend(log_fields)
scalable_fields.extend(missed_fields)
for col in scalable_fields:
    minimum = min(train_set[col])
    maximum = max(train_set[col])
    train_set[col] = (train_set[col] - minimum)/(maximum-minimum)
    val_set[col] = (val_set[col] - minimum)/(maximum-minimum)
    test_set[col] = (test_set[col] - minimum)/(maximum-minimum)

In [None]:
train_set.to_csv('../data/rl_train_set_scaled.csv',index = False)
val_set.to_csv('../data/rl_val_set_scaled.csv', index = False)
test_set.to_csv('../data/rl_test_set_scaled.csv', index = False)