In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.layers.experimental import preprocessing

import seaborn as sns
import matplotlib.pyplot as plt

import inspect
import os
import sys
import math

from data.feature_processing import get_category_encoding_layer, get_normalization_layer


# Preprocessing

In [2]:
# Base directory for data paths: the current working directory of the kernel.
WORKFOLDER = os.getcwd()


In [3]:
# Load the labelled event log, cast the outcome label to int, and give the
# case-id / timestamp columns the short names used in the rest of the notebook.
filepath_2 = os.path.join(WORKFOLDER, 'dataset', 'sepsis_labeled.csv')
raw_df = (
    pd.read_csv(filepath_2)
    .astype({"label": int})
    .rename(
        columns={"case:concept:name": "CaseID", "time:timestamp": "Timestamps"},
        errors="raise",  # fail loudly if either source column is missing
    )
    .rename_axis("index")
)

In [4]:
def create_index(log_df, column):
    """Map each distinct value of a column to an integer code.

    Args:
        log_df: dataframe holding the column.
        column: name of the column to index.
    Returns:
        dict mapping every distinct value, taken in sorted order, to a
        consecutive integer. Codes start at 0 for the 'label' column and
        at 1 for any other column.
    """
    distinct_values = sorted({row[0] for row in log_df[[column]].values.tolist()})
    # Only the 'label' column is numbered from 0; every other column from 1.
    first_code = 0 if column == 'label' else 1
    return {
        value: code
        for code, value in enumerate(distinct_values, start=first_code)
    }

In [5]:
# Index creation for activity
# NOTE(review): the commented-out column list below does not match the columns
# displayed for this dataset; presumably left over from another log - confirm
# before removing.
# column_names = ['CaseID', 'Activity', 'Department', 'Timestamps', 'Activity code', 'Number of executions', 'Producer code', 'Section', 'Age', 'Diagnosis code', 'Treatment code', 'Month', 'Day']

# Activity name -> integer code (create_index numbers non-label columns from 1).
ac_index = create_index(raw_df, 'concept:name')
ac_index['start'] = 0  # code 0 is used for the synthetic 'start' marker
ac_index['end'] = len(ac_index)  # evaluated after 'start' was added
index_ac = {v: k for k, v in ac_index.items()}  # inverse lookup: code -> activity name

print(ac_index)
print(index_ac)

# Index creation for department/role

# Same scheme for the 'org:group' column.
res_index = create_index(raw_df, 'org:group')
res_index['start'] = 0
res_index['end'] = len(res_index)  # evaluated after 'start' was added
index_res = {v: k for k, v in res_index.items()}  # inverse lookup: code -> group name


#mapping the dictionary values as columns in the dataframe
# (adds the 'ac_index' and 'res_index' columns to raw_df in place)
raw_df['ac_index'] = raw_df['concept:name'].map(ac_index)
raw_df['res_index'] = raw_df['org:group'].map(res_index)

print(res_index)
print(index_res)
raw_df.head()


{'Admission NC': 1, 'CRP': 2, 'ER Registration': 3, 'ER Sepsis Triage': 4, 'ER Triage': 5, 'IV Antibiotics': 6, 'IV Liquid': 7, 'LacticAcid': 8, 'Leucocytes': 9, 'Release A': 10, 'Release B': 11, 'Release C': 12, 'Release D': 13, 'Release E': 14, 'Return ER': 15, 'start': 0, 'end': 16}
{1: 'Admission NC', 2: 'CRP', 3: 'ER Registration', 4: 'ER Sepsis Triage', 5: 'ER Triage', 6: 'IV Antibiotics', 7: 'IV Liquid', 8: 'LacticAcid', 9: 'Leucocytes', 10: 'Release A', 11: 'Release B', 12: 'Release C', 13: 'Release D', 14: 'Release E', 15: 'Return ER', 0: 'start', 16: 'end'}
{'?': 1, 'A': 2, 'B': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'G': 8, 'H': 9, 'I': 10, 'J': 11, 'K': 12, 'L': 13, 'M': 14, 'N': 15, 'O': 16, 'P': 17, 'Q': 18, 'R': 19, 'S': 20, 'T': 21, 'U': 22, 'V': 23, 'W': 24, 'X': 25, 'Y': 26, 'start': 0, 'end': 27}
{1: '?', 2: 'A', 3: 'B', 4: 'C', 5: 'D', 6: 'E', 7: 'F', 8: 'G', 9: 'H', 10: 'I', 11: 'J', 12: 'K', 13: 'L', 14: 'M', 15: 'N', 16: 'O', 17: 'P', 18: 'Q', 19: 'R', 20: 'S', 21: '

Unnamed: 0_level_0,CaseID,Timestamps,concept:name,org:group,CRP,LacticAcid,Leucocytes,Diagnose,DiagnosticArtAstrup,DiagnosticBlood,...,SIRSCritHeartRate,SIRSCritLeucos,SIRSCritTachypnea,SIRSCritTemperature,SIRSCriteria2OrMore,Age,label,elapsed_time,ac_index,res_index
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,2014-10-22 09:15:41+00:00,ER Registration,A,,,,A,True,True,...,True,False,True,True,True,85.0,0,0.0,3,2
1,A,2014-10-22 09:27:00+00:00,Leucocytes,B,,,9.6,A,True,True,...,True,False,True,True,True,85.0,0,11.316667,9,3
2,A,2014-10-22 09:27:00+00:00,CRP,B,21.0,,9.6,A,True,True,...,True,False,True,True,True,85.0,0,11.316667,2,3
3,A,2014-10-22 09:27:00+00:00,LacticAcid,B,21.0,2.2,9.6,A,True,True,...,True,False,True,True,True,85.0,0,11.316667,8,3
4,A,2014-10-22 09:33:37+00:00,ER Triage,C,21.0,2.2,9.6,A,True,True,...,True,False,True,True,True,85.0,0,17.933333,5,4


In [6]:
# =============================================================================
# Split an event log dataframe to perform split-validation
# =============================================================================

import random

def split_train_test(df, percentage):
    """Split an event log into train, test and validation sets by case.

    All events of a case end up in the same split. Half of the held-out
    cases go to the test set and the other half to the validation set.

    Args:
        df: event log dataframe with 'CaseID' and 'Timestamps' columns.
        percentage: fraction of cases to hold out in total (test + validation).

    Returns:
        tuple: (df_train, df_test, df_val), each sorted by 'Timestamps' with a
        fresh 0..n-1 index.

    Raises:
        ValueError: if more cases are requested than are available
            (raised by random.sample).
    """
    # Sample from a sorted list rather than a set: set iteration order depends
    # on string hashing, so the seeded split would differ between sessions,
    # and random.sample() rejects sets on Python >= 3.11.
    # Case ids must be mutually comparable for the sort.
    cases = sorted(df.CaseID.unique().tolist())

    num_test_cases = int(np.round(len(cases) * percentage) / 2)  # per held-out split

    test_cases = set(random.Random(2021).sample(cases, num_test_cases))
    rest_cases = [case for case in cases if case not in test_cases]
    val_cases = set(random.Random(2021).sample(rest_cases, num_test_cases))
    train_cases = [case for case in rest_cases if case not in val_cases]

    def _events_of(case_ids):
        # One boolean mask per split instead of appending case by case
        # (DataFrame.append no longer exists in pandas 2.x). The stable sort
        # keeps the original row order for events sharing a timestamp.
        subset = df[df.CaseID.isin(list(case_ids))]
        return (subset
                .sort_values('Timestamps', ascending=True, kind='stable')
                .reset_index(drop=True))

    df_train = _events_of(train_cases)
    df_test = _events_of(test_cases)
    df_val = _events_of(val_cases)

    return df_train, df_test, df_val



In [7]:
# Hold out 30% of the cases; split_train_test divides them evenly between
# the test and validation sets.
log_df_train, log_df_test, log_df_val = split_train_test(raw_df, 0.3) # 70%/15%/15%

In [8]:
# Number of distinct cases per outcome label in the training split.
print('Distribution of cases in log_df_train \n')
print(log_df_train.groupby(['label'])['CaseID'].nunique())

Distribution of cases in log_df_train 

label
0    467
1     80
Name: CaseID, dtype: int64


In [9]:
# Number of distinct cases per outcome label in the test split.
print('Distribution of cases in log_df_test\n')
print(log_df_test.groupby(['label'])['CaseID'].nunique())

Distribution of cases in log_df_test

label
0    103
1     14
Name: CaseID, dtype: int64


In [10]:
# Number of distinct cases per outcome label in the validation split.
# The heading previously named log_df_test although log_df_val is printed.
print('Distribution of cases in log_df_val \n')
print(log_df_val.groupby(['label'])['CaseID'].nunique())

Distribution of cases in log_df_test 

label
0    104
1     13
Name: CaseID, dtype: int64


In [11]:
def normalize_events(log_df, features):
    """Add log-transformed and standardised versions of numerical features.

    For every feature ``f`` two columns are written to ``log_df``:
    ``f_log`` = log(1 + f) and ``f_norm`` = (f_log - mean) / std, where mean
    and (population) std are computed from the ``f_log`` column of this frame.

    Args:
        log_df (DataFrame): event log data; modified in place.
        features (list): names of the numerical columns to normalise.

    Returns:
        DataFrame: the same ``log_df`` object with the new columns added.

    Raises:
        ValueError: if a feature contains a value <= -1, for which
            log(1 + x) is undefined.
    """
    for feature in features:
        log_col = '%s_log' % (feature)
        norm_col = '%s_norm' % (feature)

        values = log_df[feature]
        # np.log1p would silently yield nan/-inf here; keep failing loudly as
        # the row-wise math.log1p did.
        if (values <= -1).any():
            raise ValueError("math domain error: '%s' contains values <= -1" % (feature))

        # Vectorised over the whole column instead of a Python call per row.
        log_df[log_col] = np.log1p(values)
        mean_feature = log_df[log_col].mean()
        std_feature = log_df[log_col].std(ddof=0)  # population std, as np.std computes
        log_df[norm_col] = (log_df[log_col] - mean_feature) / std_feature
    return log_df

In [12]:
# Columns to log-transform and standardise.
numerical_features = ['elapsed_time']
# NOTE(review): normalize_events derives the mean/std from the frame it is
# given, so each split below is scaled with its own statistics rather than the
# training split's - confirm this is intended.
log_df_train = normalize_events(log_df_train, numerical_features)
log_df_test = normalize_events(log_df_test, numerical_features)
log_df_val = normalize_events(log_df_val, numerical_features)
log_df_train.head()

Unnamed: 0,CaseID,Timestamps,concept:name,org:group,CRP,LacticAcid,Leucocytes,Diagnose,DiagnosticArtAstrup,DiagnosticBlood,...,SIRSCritTachypnea,SIRSCritTemperature,SIRSCriteria2OrMore,Age,label,elapsed_time,ac_index,res_index,elapsed_time_log,elapsed_time_norm
0,XJ,2013-11-07 07:18:29+00:00,ER Registration,A,53.0,1.2,18.2,C,False,True,...,False,True,True,90.0,0,0.0,3,2,0.0,-1.703966
1,XJ,2013-11-07 07:29:18+00:00,ER Triage,C,53.0,1.2,18.2,C,False,True,...,False,True,True,90.0,0,10.816667,5,4,2.469511,-0.895124
2,XJ,2013-11-07 07:37:32+00:00,ER Sepsis Triage,A,53.0,1.2,18.2,C,False,True,...,False,True,True,90.0,0,19.05,4,2,2.998229,-0.721953
3,XJ,2013-11-07 07:51:00+00:00,LacticAcid,B,53.0,1.4,18.2,C,False,True,...,False,True,True,90.0,0,32.516667,8,3,3.512043,-0.553663
4,XJ,2013-11-07 07:51:00+00:00,CRP,B,16.0,1.4,296.2,C,False,True,...,False,True,True,90.0,0,32.516667,2,3,3.512043,-0.553663


In [13]:
# ==============================================================================
# Reformat events: converts the dataframe into a numerical dictionary
# ==============================================================================

import itertools

def reformat_events(log_df, ac_index, res_index):
    """Turn an event log into one record per case with its event sequences.

    Events are ordered by ('CaseID', 'Timestamps') and grouped by case. Each
    sequence is wrapped with a start marker in front and an end marker at
    the back.

    Args:
        log_df: dataframe with 'CaseID', 'Timestamps', 'ac_index',
            'res_index', 'elapsed_time_norm' and 'label' columns.
        ac_index (dict): index of activities; supplies the 'start'/'end' codes.
        res_index (dict): index of roles; supplies the 'start'/'end' codes.
    Returns:
        list: one dict per case (ordered by case id) with keys 'caseid',
        'ac_order', 'res_order', 'elapsed_time' and 'label'.
    """
    events = log_df.to_dict('records')
    events.sort(key=lambda event: (event['CaseID'], event['Timestamps']))

    traces = []
    for case_id, case_events in itertools.groupby(events, key=lambda event: event['CaseID']):
        case_events = list(case_events)

        # Dynamic features, one entry per event of the case.
        activities = [event['ac_index'] for event in case_events]
        resources = [event['res_index'] for event in case_events]
        times = [event['elapsed_time_norm'] for event in case_events]

        # Outcome: the largest label value seen on any event of the case.
        outcome = max(event['label'] for event in case_events)

        # Surround each sequence with the start/end markers; the elapsed time
        # of both markers is 0.
        traces.append({
            'caseid': case_id,
            'ac_order': [ac_index['start']] + activities + [ac_index['end']],
            'res_order': [res_index['start']] + resources + [res_index['end']],
            'elapsed_time': [0] + times + [0],
            'label': outcome,
        })

    return traces


In [14]:
# Convert each split into a list of per-case records (one dict per case).
log_train = reformat_events(log_df_train, ac_index, res_index)
log_test = reformat_events(log_df_test, ac_index, res_index)
log_val = reformat_events(log_df_val, ac_index, res_index)

In [15]:
# Spot-check one reformatted training record (index 13 appears to be an arbitrary pick).
print(log_train[13])

{'caseid': 'BA', 'ac_order': [0, 3, 5, 4, 7, 6, 9, 8, 2, 1, 2, 9, 10, 15, 16], 'res_order': [0, 2, 4, 2, 2, 2, 3, 3, 3, 7, 3, 3, 6, 1, 27], 'elapsed_time': [0, -1.7039663367690556, -0.5980739355156732, -0.5947336452189546, -0.5910617325237895, -0.5903322641109698, -0.543086019227653, -0.543086019227653, -0.543086019227653, 0.05032816479963957, 0.8404174649290114, 0.8404174649290114, 1.1837123077010634, 2.135538230600206, 0], 'label': 0}


In [16]:
# Support function for Vectorization

# This function returns the maximum trace length (trc_len), and the number of cases for train and test sets (cases)
# The maximum out of trc_len for train and test sets will be used to define the trace length of the dataset that is fed to lstm

def lengths (log):
  """Return the longest trace length and a case counter for a log.

  Args:
      log: list of per-case dicts, each with an 'ac_order' sequence.
  Returns:
      tuple: (trc_len, cases). trc_len is the largest len(entry['ac_order']),
      never less than 1. cases is len(log) + 1, because the counter is seeded
      with 1 before the traces are counted.
      NOTE(review): callers in this notebook pass `cases` on to
      `vectorization`; confirm the downstream impact before changing the
      off-by-one.
  """
  longest = 1
  for trace in log:
    longest = max(longest, len(trace['ac_order']))

  return longest, len(log) + 1


In [17]:
#Obtain the trc_len and cases for each set

trc_len_train, cases_train = lengths(log_train)
trc_len_test, cases_test = lengths(log_test)
trc_len_val, cases_val = lengths(log_val)

# Pad to the longest trace found in ANY of the three splits. The previous
# if/else nesting skipped the validation length whenever the test length
# already exceeded the training length.
trc_len = max(trc_len_train, trc_len_test, trc_len_val)

print("trace_length: "+str(trc_len)+", training cases: "+str(cases_train)+", test cases: "+str(cases_test) +", val cases: "+str(cases_val))

trace_length: 62, training cases: 548, test cases: 118, val cases: 118


In [18]:
def vectorization(log, ac_index, res_index, trc_len, cases):
    """Pad every trace to a common length and stack the traces into 2-D arrays.

    Each trace contributes exactly one row. Previously the first trace (and
    its label) was stored twice, because the initialisation branch fell
    through into the concatenation.

    Args:
        log: list of per-case dicts with 'ac_order', 'res_order' and
            'elapsed_time' sequences of equal length, plus a 'label'.
        ac_index (dict): index of activities (not used here; kept for the
            existing call signature).
        res_index (dict): index of roles (not used here; kept for the
            existing call signature).
        trc_len (int): length every sequence is zero-padded to.
        cases: kept for backward compatibility and ignored; the number of
            rows is taken from len(log).

    Returns:
        dict: {'prefixes': {'x_ac_inp', 'x_res_inp', 'xt_inp'}, 'label'}.
        Each prefix entry is a float array of shape (len(log), trc_len);
        'label' is a 1-D array with one label per trace.

    Raises:
        ValueError: if a trace is longer than trc_len, or the sequences of a
            trace differ in length.
    """
    num_traces = len(log)
    # Pre-filled with zeros, so anything past the end of a trace is padding.
    x_ac_inp = np.zeros((num_traces, trc_len))
    x_res_inp = np.zeros((num_traces, trc_len))
    xt_inp = np.zeros((num_traces, trc_len))
    labels = []

    for row, trace in enumerate(log):
        length = len(trace['ac_order'])
        if length > trc_len:
            raise ValueError(
                "trace %d has %d events, which exceeds trc_len=%d" % (row, length, trc_len))
        x_ac_inp[row, :length] = trace['ac_order']
        x_res_inp[row, :length] = trace['res_order']
        xt_inp[row, :length] = trace['elapsed_time']
        labels.append(trace['label'])

    vec = {
        'prefixes': {
            'x_ac_inp': x_ac_inp,
            'x_res_inp': x_res_inp,
            'xt_inp': xt_inp,
        },
        'label': np.array(labels),
    }
    return vec


In [19]:
# Vectorise each split with its own case count; the validation set was
# previously given cases_test.
vec_train = vectorization(log_train,ac_index,res_index,trc_len,cases_train)
vec_test = vectorization(log_test,ac_index,res_index,trc_len,cases_test)
vec_val = vectorization(log_val,ac_index,res_index,trc_len,cases_val)

In [22]:
from tensorflow.keras import utils as ku 

# One-hot encode the sorted activity codes; the number of classes equals the
# size of the activity index, which includes the 'start' and 'end' markers.
ac_weights = ku.to_categorical(sorted(index_ac.keys()), len(ac_index))
# print('AC_WEIGHTS', ac_weights)
# Same encoding for the resource (org:group) codes.
res_weights =  ku.to_categorical(sorted(index_res.keys()), len(res_index))
# print('RL_WEIGHTS', res_weights)

# Deep learning models