In [56]:
import pandas as pd
import numpy as np
from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.utils import compute_class_weight
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

## Load Patient Data

In [62]:
# Load the raw per-observation tables (heart rate, respiration, SaO2 and GCS
# motor score, going by the file and variable names).
hr_all = pd.read_csv("../data/alpaca_hr.csv")
resp_all = pd.read_csv("../data/alpaca_resp.csv")
sao2_all = pd.read_csv("../data/alpaca_sao2.csv")
gcs_all = pd.read_csv("../data/patient_motor.csv")

# Divide the offsets by 60 so they can be treated as hours downstream
# (assumes the raw offsets are in minutes -- TODO confirm against the data dictionary).
hr_all['offset'] /= 60
resp_all['offset'] /= 60
sao2_all['offset'] /= 60
gcs_all['observationoffset'] /= 60

# drop patients with >1 month stay, <12 hrs stay
# Length of stay is approximated by each patient's last heart-rate offset.
max_stays = hr_all.groupby('patientunitstayid')['offset'].max()

# One row per patient, in order of first appearance in the heart-rate table.
patients = hr_all.drop(columns=['offset', 'value']).drop_duplicates()

# Look the stay up by patient id instead of assigning the array positionally:
# the groupby result is sorted by id while drop_duplicates keeps file order, so
# a positional assignment is only correct when the CSV is already sorted by id.
patients['length_of_stay_hrs'] = patients['patientunitstayid'].map(max_stays)

# Keep stays in [12 h, 720 h).
patients = patients[patients['length_of_stay_hrs'] < 720]
patients = patients[patients['length_of_stay_hrs'] >= 12]
display(patients)

[1. 6. 5. 4. 2. 3.]


Unnamed: 0,patientunitstayid,length_of_stay_hrs
0,143274,47.516667
571,144116,69.166667
1402,145997,25.916667
1714,146455,21.583333
1974,147899,126.333333
...,...,...
3499999,3335182,42.200000
3500307,3335520,60.966667
3500461,3339866,41.633333
3500892,3344456,282.716667


#### Bin Data

In [20]:
# Width of one time bin, in hours. Observation offsets are floor-divided by this
# value to obtain each row's `offset_bin`.
binning_interval = 1

In [64]:
# assign bins
hr_all['offset_bin'] = hr_all['offset'] // binning_interval
resp_all['offset_bin'] = resp_all['offset'] // binning_interval
sao2_all['offset_bin'] = sao2_all['offset'] // binning_interval
gcs_all['offset_bin'] = gcs_all['observationoffset'] // binning_interval


def bin_mean(raw_df):
    """Average `raw_df` per (patient, bin) and left-join it onto the kept patients.

    Args:
        raw_df: frame with `patientunitstayid`, `offset`, `offset_bin` and the
            measurement column(s) to average.

    Returns:
        A frame with one row per (patient, bin) for every patient in `patients`;
        patients without any observation get a single all-NaN row.
    """
    per_bin = (
        raw_df.drop(columns='offset')
        .groupby(['patientunitstayid', 'offset_bin'])
        .mean()
        .reset_index()
    )
    return patients[['patientunitstayid']].merge(per_bin, how='left')


# keep only patients in patient list, calculate bin mean for each patient
hr_binned = bin_mean(hr_all)
resp_binned = bin_mean(resp_all)
sao2_binned = bin_mean(sao2_all)

#TODO maybe keep gcs mean as the X feature but latest gcs score as the label
# GCS is aggregated with the most frequent value in each bin (first entry of
# value_counts) rather than a mean; the same rule is applied to every column.
gcs_mode = (
    gcs_all.drop(columns='observationoffset')
    .groupby(['patientunitstayid', 'offset_bin'])
    .agg(lambda x: x.value_counts().index[0])
    .reset_index()
)
gcs_binned = patients[['patientunitstayid']].merge(gcs_mode, how='left')

print(gcs_binned['Value'].unique())

# Number of bins needed to cover the longest retained stay.
max_bins = patients['length_of_stay_hrs'].max() // binning_interval + 1

# reindex helper method
def rein(df):
    """Index one patient's rows by `offset_bin` and expand to every bin in [0, max_bins)."""
    return df.set_index('offset_bin').reindex(np.arange(max_bins))

# reindex hr df, use this as merging table for other values
hr_binned = hr_binned.groupby('patientunitstayid').apply(rein).drop(columns='patientunitstayid').reset_index()

# format column names to allow merging
hr_binned = hr_binned.rename(columns={'value': 'hr'})
resp_binned = resp_binned.rename(columns={'value': 'resp'})
sao2_binned = sao2_binned.rename(columns={'value': 'sao2'})
gcs_binned = gcs_binned.rename(columns={'Value': 'gcs'})

# merge data
ts_binned = hr_binned.merge(resp_binned, how='left').merge(sao2_binned, how='left').merge(gcs_binned, how='left')

# Back-fill gaps within each patient, then drop the rows that are still
# incomplete (bins after a patient's last observation of some signal).
# NOTE(review): back-filling copies later observations into earlier bins, so an
# input window can contain values recorded after it; consider a forward fill if
# that look-ahead matters for the prediction task.
ts_binned = ts_binned.groupby('patientunitstayid').apply(lambda x: x.bfill()).dropna()

# add a gcs_label column (to store untransformed gcs score)
ts_binned['gcs_label'] = ts_binned['gcs']

# display data
display(ts_binned)

print(ts_binned['gcs_label'].unique())

[nan  1.  6.  5.  4.  2.  3.]


Unnamed: 0,patientunitstayid,offset_bin,hr,resp,sao2,Key,gcs,origin,gcs_label
19236,172448,0.0,89.090909,14.545455,97.272727,Motor,1.0,nurse_charting,1.0
19237,172448,1.0,93.500000,14.583333,96.583333,Motor,1.0,nurse_charting,1.0
19238,172448,2.0,92.500000,16.083333,94.750000,Motor,1.0,nurse_charting,1.0
73509,242714,0.0,102.181818,37.875000,94.545455,Motor,6.0,nurse_charting,6.0
73510,242714,1.0,105.583333,37.875000,93.916667,Motor,6.0,nurse_charting,6.0
...,...,...,...,...,...,...,...,...,...
1911934,3352819,13.0,81.416667,31.000000,98.750000,Motor,6.0,nurse_charting,6.0
1911935,3352819,14.0,81.666667,30.916667,97.616667,Motor,6.0,nurse_charting,6.0
1911936,3352819,15.0,86.333333,27.583333,97.175000,Motor,6.0,nurse_charting,6.0
1911937,3352819,16.0,80.166667,30.916667,97.833333,Motor,6.0,nurse_charting,6.0


[1. 6. 5. 4. 2. 3.]


#### Create Dataset and DataLoader Classes

In [97]:
# Window sizes, counted in rows (bins) of the binned time series:
#   memory_window     - number of consecutive rows used as the model input
#   prediction_window - how many rows past the end of the input window the
#                       label row lies
prediction_window = 6
memory_window = 12

In [98]:
# scale data (to be applied when initializing dataset)
def _like_input(X, values):
    """Wrap scaled `values` in a DataFrame shaped like `X` when `X` is a DataFrame."""
    if isinstance(X, pd.DataFrame):
        return pd.DataFrame(values, index=X.index, columns=X.columns)
    return values


def transform(X, train=True, scaler=None):
    """Standardize feature columns and return the scaled values.

    Args:
        X: feature matrix (DataFrame or array-like). It is not modified.
        train: when True, fit a new StandardScaler on `X`; when False, apply the
            provided `scaler`.
        scaler: an already fitted scaler; required when `train` is False.

    Returns:
        `(X_scaled, scaler)` when `train` is True, otherwise `X_scaled`.
        `X_scaled` is a DataFrame with the index and columns of `X` when `X` is
        a DataFrame, and the raw scaler output otherwise.

    Raises:
        AttributeError: if `train` is False and no scaler is supplied.
    """
    if train:
        scaler = StandardScaler()
        # fit_transform returns a new array and leaves X untouched, so its
        # result has to be kept (returning X would return unscaled data).
        return _like_input(X, scaler.fit_transform(X)), scaler

    if scaler is None:
        raise AttributeError("Must provide fitted scaler when testing")

    return _like_input(X, scaler.transform(X))
    
class PatientsTrain(Dataset):
    """Sliding-window training dataset over the binned patient time series.

    Sample `idx` uses rows `idx .. idx + memory_window - 1` of the four feature
    columns as input and the `gcs_label` of the row `prediction_window` rows
    after the end of that window as target.

    NOTE(review): windows are taken positionally over the whole frame, so a
    window can start in one patient's rows and end in the next patient's.

    Args:
        ts_df: frame with `hr`, `resp`, `sao2`, `gcs` and `gcs_label` columns.
        prediction_window: distance, in rows, from the end of the input window
            to the label row.
        memory_window: number of rows in each input window.
        transform: optional callable taking the feature frame and returning
            `(scaled_features, fitted_scaler)`.
    """

    # Feature columns, in the order they are flattened into X.
    FEATURE_COLS = ['hr', 'resp', 'sao2', 'gcs']
    # Column holding the untransformed target.
    LABEL_COL = 'gcs_label'

    def __init__(self, ts_df, prediction_window, memory_window, transform=None):
        self.prediction_window = prediction_window
        self.memory_window = memory_window
        self.scaler = None

        if transform:
            # Scale a private copy so the caller's frame is left unchanged.
            self.ts_df = ts_df.copy()
            self.ts_df[self.FEATURE_COLS], self.scaler = transform(self.ts_df[self.FEATURE_COLS])
        else:
            self.ts_df = ts_df

        # Cache plain arrays once; slicing the DataFrame per item is far slower.
        self._features = self.ts_df[self.FEATURE_COLS].values
        self._labels = self.ts_df[self.LABEL_COL].values

    def __len__(self):
        # The last usable start leaves room for the input window plus the
        # prediction offset; clamp at 0 because len() must not be negative.
        usable = len(self.ts_df) - self.memory_window - self.prediction_window + 1
        return max(0, usable)

    def __getitem__(self, idx):
        window_end = idx + self.memory_window
        # Column-major flatten: all hr values, then resp, sao2 and gcs.
        X = self._features[idx:window_end].flatten('F')
        y = self._labels[window_end + self.prediction_window - 1]

        return {'X': X, 'y': y}
    
class PatientsTest(Dataset):
    """Sliding-window test dataset over the binned patient time series.

    Sample `idx` uses rows `idx .. idx + memory_window - 1` of the four feature
    columns as input and the `gcs_label` of the row `prediction_window` rows
    after the end of that window as target.

    NOTE(review): windows are taken positionally over the whole frame, so a
    window can start in one patient's rows and end in the next patient's.

    Args:
        ts_df: frame with `hr`, `resp`, `sao2`, `gcs` and `gcs_label` columns.
        prediction_window: distance, in rows, from the end of the input window
            to the label row.
        memory_window: number of rows in each input window.
        transform: optional callable invoked as
            `transform(features, train=False, scaler=scaler)` that returns the
            scaled features.
        scaler: fitted scaler forwarded to `transform`.
    """

    # Feature columns, in the order they are flattened into X.
    FEATURE_COLS = ['hr', 'resp', 'sao2', 'gcs']
    # Column holding the untransformed target.
    LABEL_COL = 'gcs_label'

    def __init__(self, ts_df, prediction_window, memory_window, transform=None, scaler=None):
        self.prediction_window = prediction_window
        self.memory_window = memory_window

        if transform:
            # Scale a private copy so the caller's frame is left unchanged.
            self.ts_df = ts_df.copy()
            self.ts_df[self.FEATURE_COLS] = transform(self.ts_df[self.FEATURE_COLS], train=False, scaler=scaler)
        else:
            self.ts_df = ts_df

        # Cache plain arrays once; slicing the DataFrame per item is far slower.
        self._features = self.ts_df[self.FEATURE_COLS].values
        self._labels = self.ts_df[self.LABEL_COL].values

    def __len__(self):
        # The last usable start leaves room for the input window plus the
        # prediction offset; clamp at 0 because len() must not be negative.
        usable = len(self.ts_df) - self.memory_window - self.prediction_window + 1
        return max(0, usable)

    def __getitem__(self, idx):
        window_end = idx + self.memory_window
        # Column-major flatten: all hr values, then resp, sao2 and gcs.
        X = self._features[idx:window_end].flatten('F')
        y = self._labels[window_end + self.prediction_window - 1]

        return {'X': X, 'y': y}

In [99]:
# split train/test
# Split on patient ids (not on rows) so one patient never appears in both sets.
# random_state pins the split so that re-running the notebook reproduces it.
train_patients, test_patients = train_test_split(
    patients[['patientunitstayid']], test_size=0.2, random_state=42
)
ts_binned_train = train_patients.merge(ts_binned)
ts_binned_test = test_patients.merge(ts_binned)

# create datasets
# The test set reuses the scaler held by the training set.
train_set = PatientsTrain(ts_binned_train, prediction_window=prediction_window, memory_window=memory_window, transform=transform)
test_set = PatientsTest(ts_binned_test, prediction_window=prediction_window, memory_window=memory_window, transform=transform, scaler=train_set.scaler)

# create data loaders
train_loader = DataLoader(train_set, batch_size=512, num_workers=8, shuffle=True)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
test_loader = DataLoader(test_set, batch_size=1024, num_workers=8, shuffle=False)

## Training Loop

In [100]:
# compute class weight
# 'balanced' weights are derived from label frequencies, so the full label
# column must be passed: feeding only the unique labels makes every class look
# equally frequent and yields a weight of 1.0 for all of them.
gcs_classes = np.arange(6) + 1
balanced_weights = compute_class_weight(
    'balanced', classes=gcs_classes, y=ts_binned_train['gcs_label'].values
)
class_weight = {int(label): weight for label, weight in zip(gcs_classes, balanced_weights)}
classes = list(class_weight.keys())

# create model
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' and
# eventually reject 'log'; update the name when upgrading scikit-learn.
clf = SGDClassifier(loss='log', penalty='elasticnet', class_weight=class_weight)

# set training params
num_epochs = 5
progress_bar = tqdm(total=num_epochs, desc='Training')

for i in range(1, num_epochs + 1):
    # run training loop
    running_acc = []
    for batch_num, sample_batch in enumerate(train_loader):
        X = sample_batch['X'].numpy()
        y = sample_batch['y'].numpy()

        # fit model (the class list is needed so the first call knows every label)
        clf.partial_fit(X, y, classes=classes)

        # update metrics
        # Accuracy is measured on the batch that was just fitted, so it is an
        # optimistic training-progress indicator rather than a held-out score.
        running_acc.append(clf.score(X, y))
        progress_bar.set_description('Training: Epoch: {}, acc: {:.2f}'.format(i, np.mean(running_acc)))
        progress_bar.refresh()

    progress_bar.update(1)

# Release the progress-bar widget once training is finished.
progress_bar.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, description='Training', max=5.0, style=ProgressStyle(description_width…