# Functions
Various functions used in data prep and modeling

## Static data cleaning

In [5]:
def add_columns(A, B, r_map, column, prefix=''):
    '''Flags regx matches in one column of B, totals them per HADM_ID and joins
       the totals onto A.

       For every {name: regx} pair in r_map a 0/1 indicator column named
       f'{prefix}{name}' is built from B[column].str.match(regx); missing values
       count as non-matches. The text column is then dropped, every remaining
       column of B is summed per HADM_ID, and the result is left-joined to A.
       Inputs:
           A   (dataframe): Master df; its index is matched against HADM_ID.
           B   (dataframe): Dataframe against which regx will be run. Must hold
                            an HADM_ID column. Not modified by this function.
           r_map    (dict): Dictionary of {column_name: regx}.
           column (string): Column from df B against which regx will be run.
           prefix (string): Prefix to be added to newly created column names.
       Output:
           A.join(B)      : DF A with the newly created count columns appended.
    '''
    # Work on a copy: the caller's frame must not silently gain indicator
    # columns (repeated calls with the same B would otherwise accumulate them).
    B = B.copy()
    for c, regx in r_map.items():
        print(c)
        # na=False turns missing / non-string cells into "no match".
        B[f'{prefix}{c}'] = B[column].str.match(regx, na=False).astype(int)

    B = B.drop(column, axis=1).groupby('HADM_ID').sum()

    # Keep only admissions that exist in the master df.
    B = B[B.index.isin(A.index)]
    return A.join(B)

def get_first(A):
    '''Keeps the earliest charted row for every (HADM_ID, ITEMID) pair.
        Input :
            A (dataframe): Must hold CHARTTIME, HADM_ID and ITEMID columns.
        Output:
            (dataframe): Rows ordered by CHARTTIME, with only the first row of
                         each (HADM_ID, ITEMID) pair retained, indexed by HADM_ID.
    '''
    chronological = A.sort_values(by='CHARTTIME')
    earliest = chronological.drop_duplicates(subset=['HADM_ID', 'ITEMID'],
                                             keep='first')
    return earliest.set_index('HADM_ID')

def fix_height(h):
    '''Maps a recorded height onto cm using value-range heuristics.
        Input :
            h (int or float): Recorded height.
        Output:
            (int or float): Adjusted height. The first matching rule applies:
                5 <= h <= 7   -> multiplied by 30.48 (ft to cm)
                12 <= h <= 30 -> 100 is added
                h < 120       -> multiplied by 2.54 (in to cm)
                h > 400       -> divided by 2.54
                otherwise     -> returned unchanged
    '''
    if 5 <= h <= 7:
        return h * 30.48  # ft to cm
    if 12 <= h <= 30:
        return h + 100
    if h < 120:
        return h * 2.54   # in to cm
    if h > 400:
        return h / 2.54
    return h

def fix_weight(w):
    '''Standardizes a weight record to kg.
        Input :
            w (row-like): Record exposing VALUENUM (numeric weight) and ITEMID
                          (measurement label), e.g. a dataframe row handed over
                          by DataFrame.apply(..., axis=1). Not modified.
        Output:
            (int or float): VALUENUM divided by 2.2 for the lb label, by 35.274
                            for the oz label, and unchanged for any other label.
    '''
    value = w.VALUENUM
    # Labels are compared verbatim, including the double space before the unit.
    if w.ITEMID == 'Present Weight  (lb)':
        return value / 2.2
    if w.ITEMID == 'Present Weight  (oz)':
        return value / 35.274
    return value

def get_descriptor(s,descriptor):
    '''Reports whether the tail of s equals descriptor.
        Inputs:
            s (string): Value from series cell.
            descriptor (string): Suffix to look for.
        Output:
            Int: 1 if the last len(descriptor) characters of s equal descriptor,
                 0 otherwise.
    '''
    tail = s[-len(descriptor):]
    return int(tail == descriptor)

def strip_descriptor(s,descriptor):
    '''Cuts a trailing descriptor, plus the single character before it, off s.
        Inputs:
            s (string): Value from series cell.
            descriptor (string): Suffix to remove.
        Output:
            s (string): s without the suffix and the one character preceding it
                        when s ends with descriptor; s unchanged otherwise.
    '''
    width = len(descriptor)
    if s[-width:] != descriptor:
        return s
    # Drop the descriptor together with the character in front of it.
    return s[:-(width + 1)]


## Dynamic data cleaning

In [6]:
def prep_df(A):
    '''Drops rows without an HADM_ID and formats column datatypes.
        Input : A (dataframe): Must hold HADM_ID and CHARTTIME columns.
        Output: (dataframe): Copy of the rows with a non-null HADM_ID, with
                             CHARTTIME parsed by pd.to_datetime and HADM_ID
                             cast to int.
    '''
    has_admission = A['HADM_ID'].notna()
    cleaned = A.loc[has_admission].copy()
    cleaned['CHARTTIME'] = pd.to_datetime(cleaned['CHARTTIME'])
    cleaned['HADM_ID'] = cleaned['HADM_ID'].astype(int)
    return cleaned

def F_to_C(t):
    '''Converts a Fahrenheit temperature to Celsius.
        Input : t (int or float): Temperature in Fahrenheit.
        Output: (float): Temperature in Celsius, computed as (t - 32) / 1.8.
    '''
    above_freezing = t - 32
    return above_freezing / 1.8

def convert_temp(A):
    '''Standardizes temperature units to Celsius and addresses entry errors.

       Rows whose VALUENUM lies outside [15, 120] are discarded. A TEMP column
       is then derived from VALUENUM in three steps:
         1. readings with ITEMID 223761 are converted with F_to_C;
         2. anything still above 46 is converted with F_to_C
            (Fahrenheit value filed under a Celsius item);
         3. anything now below 15 falls back to the raw VALUENUM
            (Celsius value filed under the Fahrenheit item).
       The steps are vectorized, so a frame that is empty after filtering simply
       yields an empty TEMP column.
        Input : A (dataframe): Must hold numeric VALUENUM and an ITEMID column.
        Output: A (dataframe): Filtered copy of A with a float TEMP column added.
    '''
    A = A[A.VALUENUM.between(15,120)].copy()
    raw = A['VALUENUM'].astype(float)
    as_celsius = F_to_C(raw)

    temp = raw.mask(A['ITEMID'] == 223761, as_celsius)
    temp = temp.mask(temp > 46, as_celsius)  # C recorded as F
    temp = temp.mask(temp < 15, raw)         # F recorded as C
    A['TEMP'] = temp
    return A

def fix_oxy(o):
    '''Fixes SaO2 typographical errors: flips negative readings positive and
       divides readings above 200 by 100.
        Input : o (int or float)
        Output: o (int or float)

       NOTE(review): the earlier description of this function spoke of a "*10"
       entry error, while the code scales by 100 -- confirm the intended factor.
    '''
    if o < 0:
        o = -o
    if o > 200:
        return o / 100
    return o

def get_ts(A,column,c_map,get_labs=False): 
    '''Creates long-form dataframe of time series of daily mean, count and
       standard deviation for the specified measurement.

       VALUENUM is cleaned (temperatures standardized to Celsius for 'TEMP',
       typos repaired for 'SaO2'), values outside [c_map['min'], c_map['max']]
       are dropped, and the remainder is resampled per HADM_ID into 1440-minute
       bins that start at midnight of the earliest CHARTTIME date. Gaps are
       forward-filled within each HADM_ID and the result is also written to
       mimic_iii_data/{column}_ts.csv.
        Inputs :
            A   (dataframe): Must hold HADM_ID, CHARTTIME (datetime) and
                             VALUENUM columns; ITEMID is needed when get_labs
                             is true or column is 'TEMP'.
            column (string): Name used for the output columns and CSV file;
                             'TEMP' and 'SaO2' trigger extra cleaning.
            c_map    (dict): Dictionary with outlier cutoffs under 'min' and
                             'max' and, when get_labs is true, the ITEMID under
                             'code'.
            get_labs (bool): If true, narrows A to rows with ITEMID == c_map['code'].
        Outputs:
            A   (dataframe): Long-form df, indexed as (HADM_ID,DATE), with
                             columns {column}_MEAN, {column}_COUNT, {column}_SD.
    '''
    lower, upper = c_map['min'], c_map['max'] # Outlier values
    A = A[A.ITEMID == c_map['code']].copy() if get_labs else A.copy()

    if column == 'TEMP':
        A = convert_temp(A)
        # Aggregate the standardized Celsius readings; without this the TEMP
        # column computed by convert_temp is never used and raw mixed-unit
        # values would be averaged.
        A['VALUENUM'] = A['TEMP']
    elif column == 'SaO2':
        A['VALUENUM'] = A['VALUENUM'].map(fix_oxy)

    A = A[A['VALUENUM'].between(lower,upper)]
    # Anchor the bins at midnight of the first observed day.
    first = str(A.CHARTTIME.min().date())

    A = A.set_index(['HADM_ID','CHARTTIME']) \
         .groupby('HADM_ID') \
         .resample('1440min', # Possible to resample at smaller intervals
                   level='CHARTTIME',
                   origin=f'{first} 00:00:00') \
         .agg({'VALUENUM':['mean','count','std']})
    # Drop the outer 'VALUENUM' level so only mean/count/std remain.
    A.columns = A.columns.droplevel()
    A.rename(columns={'mean' :f'{column}_MEAN' ,
                      'count':f'{column}_COUNT',
                      'std'  :f'{column}_SD'  },
             inplace=True)
    # Carry the last known values forward within each admission.
    A = A.groupby(['HADM_ID']).ffill()
    print(f'saving {column}')
    A.to_csv(f'mimic_iii_data/{column}_ts.csv')

    A =  A.reset_index()\
          .set_index(['HADM_ID','CHARTTIME'])
    A.index.names = ['HADM_ID','DATE']

    return A

def add_rx(A,rx,route,rx_map):
    '''Creates long-form time series dataframe of the days on which the
       specified drug class was administered.

       Rows whose DRUG matches rx_map[rx] are expanded into one row per calendar
       day between STARTDATE and ENDDATE (inclusive). Duplicates are removed per
       (HADM_ID, DATE) pair, so two admissions receiving the drug on the same
       date both keep their row.
        Inputs :
            A  (dataframe): Must hold DRUG, STARTDATE and ENDDATE columns; the
                            index values are used as HADM_ID
                            (presumably A is indexed by HADM_ID -- verify
                            against caller).
            rx    (string): Drug class for which ts will be created.
            route (string): Specifies drug route when naming column.
            rx_map  (dict): Dictionary with regx for each drug class.
        Outputs:
            A (dataframe) : Long-form df, indexed as (HADM_ID,DATE), with a
                            single column f'{route}_{rx}' equal to 1.
    '''
    print(rx)
    regx = rx_map[rx]
    # na=False keeps rows with a missing DRUG from breaking the boolean mask.
    A = A[A['DRUG'].str.match(regx, na=False)]

    # One list of calendar days per prescription row. Built with a plain
    # comprehension so an empty selection yields an empty result instead of
    # tripping over DataFrame.apply on an empty frame.
    days = pd.Series([list(pd.date_range(start, end, freq='1D').date)
                      for start, end in zip(A['STARTDATE'], A['ENDDATE'])],
                     index=A.index, dtype=object)

    long = days.explode().rename('DATE').rename_axis('HADM_ID').reset_index()
    # De-duplicate on the (HADM_ID, DATE) pair, not on the date alone.
    long = long.drop_duplicates(subset=['HADM_ID', 'DATE'])
    long[f'{route}_{rx}'] = 1
    return long.set_index(['HADM_ID','DATE'])


## Model Metrics

In [7]:
def RMSE(y,y_hat):
    '''Calculates and returns Root Mean Square Error between input arrays.
        Inputs:
            y, y_hat (array-like): Observed and predicted values supporting
                                   element-wise arithmetic and .sum().
        Output:
            (float): Square root of the mean squared difference.
    '''
    mean_squared_error = ((y-y_hat)**2).sum()/len(y)
    # The square root turns the mean squared error into the RMSE.
    return mean_squared_error ** 0.5

def MAE(y,y_hat):
    '''Calculates and returns Mean Absolute Error between input arrays'''
    absolute_errors = abs(y - y_hat)
    return absolute_errors.sum() / len(y)

def evaluate_regression(model,X_train,X_test,y_train,y_test):
    '''Fits and evaluates performance of input regression model and returns test 
       set predictions.

       Prints the value of model.score on the training and on the test data.
        Inputs:
            model   (sklearn model): Object exposing fit, predict and score.
            X_train, X_test    (df): DF from which model predictions will be obtained
            y_train,y_test (series): Observed train and test targets
        Output:
            predicted              : Values predicted from test set 
            model   (sklearn model): fit regression model
    '''
    model     = model.fit(X_train,y_train)
    # Predict once and reuse the result instead of running the model twice.
    predicted = model.predict(X_test)
    print(f'Training R2:\n\t{model.score(X_train,y_train)}')
    print(f'Test R2:\n\t{model.score(X_test,y_test)}')
    return predicted,model

def evaluate_classifier(y_hat,y_pred):
    '''Prints the confusion matrix together with accuracy, recall and precision
       of the predictions, and returns the three scores.
        Inputs:
            y_hat  (array-like): Labels treated as the observed values (rows of
                                 the confusion matrix).
            y_pred (array-like): Predicted labels (columns of the matrix).
        Output:
            (tuple): (accuracy, recall, precision); recall and precision are
                     computed for the positive label 1.
    '''
    # `m` comes from the enclosing module -- presumably sklearn.metrics.
    confusion = pd.crosstab(y_hat,y_pred)
    confusion.index.name = 'Observed'
    confusion.columns.name = 'Predicted'

    scores = (m.accuracy_score(y_hat,y_pred),
              m.recall_score(y_hat,y_pred,pos_label=1),
              m.precision_score(y_hat,y_pred,pos_label=1))

    print(confusion)
    headings = ('Accuracy:\n\t', 'Recall:\n\t', 'Precision:\n\t')
    for heading, score in zip(headings, scores):
        print(heading, score)
    return scores

## LSTM

In [8]:
def create_batches(i,batch_size):
    '''Creates list of batches, each holding the HADM_IDs to be used in that batch.

       The unique HADM_IDs are cut, in order of first appearance, into
       consecutive chunks of batch_size; only the last chunk may be shorter.
       No empty batch is produced, and fewer ids than batch_size simply give a
       single batch.
        Inputs:
            i (dataframe): Object whose reset_index() exposes an HADM_ID column,
                           e.g. a df indexed by ('HADM_ID','DATE').
            batch_size (int): Maximum number of HADM_IDs per batch (>= 1).
        Outputs:
            batches (list): List of arrays, each containing the HADM_IDs for one batch
            batch_n  (int): number of batches
        Raises:
            ValueError: If batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    hadm_id = i.reset_index().HADM_ID.unique()
    batches = [hadm_id[start:start + batch_size]
               for start in range(0, len(hadm_id), batch_size)]
    return batches, len(batches)

def batch_generator(X,y,target,batches,columns):
    '''Creates an endless batch generator/formatter that cycles through batches.

       Each yielded feature batch is padded with 9999 so that every HADM_ID has
       max(y.LOS) rows, and is reshaped to
       (number of HADM_IDs, max(y.LOS), len(columns)).
        Inputs:
            X          (df): Feature df; X.loc[batch, columns] followed by
                             reset_index() must expose HADM_ID and DATE columns.
            y          (df): Target df with DAY_NUMBER and LOS columns plus the
                             target columns.
            target   (list): list of columns from y to use as training target
            batches (array): Array of HADM_IDs for each batch
            columns  (list): List of columns from X to use in training
        Outputs (yielded on every iteration):
            X_batch (array): Single padded feature batch,
                             shape (hadm_n, row_n, col_n)
            y_batch (array): Targets of the batch, shape (hadm_n, len(target))
    '''
    i = 0
    while True:
        # Wrap around after the last batch so the generator never ends.
        i = 0 if i==len(batches) else i
        
        batch = batches[i]
        X_batch = X.loc[batch,columns]
        y_batch = y.loc[batch]

        X_batch.reset_index(inplace=True)
        hadm_batch = X_batch.HADM_ID.unique()
        X_batch.drop('DATE',axis=1,inplace=True)
        # Re-index the features by (HADM_ID, DAY_NUMBER), with DAY_NUMBER taken
        # from y_batch.
        # NOTE(review): presumably relies on X_batch and y_batch rows lining up
        # one-to-one in the same order -- confirm.
        X_batch.set_index(['HADM_ID',y_batch.DAY_NUMBER],inplace=True)

        # row_n is taken over the whole of y, so all batches share one length.
        hadm_n,col_n,row_n = len(hadm_batch),len(columns),max(y.LOS)

        # Scaffold with row_n rows per HADM_ID, numbered by cumcount().
        index = pd.DataFrame({'HADM_ID':np.repeat(hadm_batch,row_n)})
        index['DAY_NUMBER'] = index.groupby('HADM_ID').cumcount()
        index.set_index(['HADM_ID','DAY_NUMBER'],inplace=True)

        # Targets are read from the rows where DAY_NUMBER equals 1.
        # NOTE(review): the reshape assumes exactly one such row per HADM_ID in
        # the batch -- confirm.
        y_batch = y_batch[y_batch.DAY_NUMBER==1][target]
        y_batch = y_batch.values.reshape(hadm_n,len(target))

        # Left-join the features onto the scaffold; rows without data become 9999.
        X_batch = index.join(X_batch).fillna(9999)
        X_batch = X_batch.values.reshape(hadm_n,row_n,col_n)

        i+=1    
        yield X_batch,y_batch
        
def create_lstm(target,columns,batch_size,shape,loss,activations,metrics):
    '''Builds and compiles the stacked LSTM network.

       Layer order: Masking (mask value 9999) -> LSTM(16, returning sequences)
       -> LSTM(16) -> BatchNormalization -> Dense(16) -> Dropout(0.3)
       -> Dense(len(target)). The model is compiled with the 'adam' optimizer.
        Inputs:
            target     (list): list of targets; its length sets the output width
            columns    (list): list of features (not used inside this function)
            batch_size  (int): batch size (not used inside this function)
            shape     (tuple): input shape handed to the Masking and first LSTM layer
            loss     (string): desired loss function
            activations (list): [hidden-layer activation, output activation]
            metrics    (list): List of metrics to calculate during training
        Outputs:
            model (keras model): compiled model ready to be fit
    '''
    hidden_activation = activations[0]
    output_activation = activations[1]

    model = Sequential()
    layers = [
        Masking(mask_value=9999, input_shape=shape),
        LSTM(16,
             activation=hidden_activation,
             input_shape=shape,
             return_sequences=True),
        LSTM(16, activation=hidden_activation),
        BatchNormalization(),
        Dense(16, activation=hidden_activation),
        Dropout(0.3),
        Dense(len(target), activation=output_activation),
    ]
    for layer in layers:
        model.add(layer)

    model.compile(loss=loss, optimizer='adam', metrics=metrics)
    return model

def unpack_prediction(prediction,target):
    '''Flattens an LSTM prediction and returns it as a series labelled
       '<target>_PREDICTED_PROB', pairing targets with values in order.'''
    labels = [f'{t}_PREDICTED_PROB' for t in target]
    flat_values = prediction.ravel()
    return pd.Series(dict(zip(labels, flat_values)))

def run_model(X,y,target,columns,batch_size,loss,activations,metrics):
    '''Creates and trains LSTM model, returns the model together with the
       predictions for the training and validation rows.

       Reads the module-level names `train` and `val` (they are not parameters):
       both are passed to create_batches and their indexes select the rows of X
       that are predicted after training. Training runs for 10 epochs.
        Inputs:
            X              (df): Feature df
            y              (df): Target df
            target       (list): List of target variables to be used in training
            columns      (list): List of features to be used in training
            batch_size    (int): Desired batch size
            loss       (string): Desired loss function
            activations  (list): [LSTM activations, final activation]
            metrics      (list): List of metrics to calculate during training
        Outputs:
            model (keras model): Trained LSTM
            predicted_test (df): Predictions for the rows of X selected by train.index
            predicted_val  (df): Predictions for the rows of X selected by val.index
    '''
    def _predict_rows(row_index):
        # One model.predict call per row; the row is wrapped as a
        # single-sample, single-step input.
        return X.loc[row_index,columns].apply(
            lambda x: unpack_prediction(model.predict(np.asarray([[x]])), target),
            axis=1)

    train_batches, train_batch_n = create_batches(train,batch_size)
    val_batches, val_batch_n = create_batches(val,batch_size)

    input_shape = (None,len(columns))
    model = create_lstm(target, columns, batch_size, input_shape,
                        loss, activations, metrics)

    train_generator = batch_generator(X, y, target, train_batches, columns)
    val_generator = batch_generator(X, y, target, val_batches, columns)

    model.fit(x=train_generator,
              validation_data=val_generator,
              validation_steps=val_batch_n,
              epochs=10,
              steps_per_epoch=train_batch_n)

    predicted_test = _predict_rows(train.index)
    predicted_val = _predict_rows(val.index)

    return model, predicted_test, predicted_val