In [None]:
import os
import random
import pandas as pd
import numpy as np
import copy
import operator
from collections import Counter
import pickle
import joblib
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
random.seed(42)
optuna.logging.set_verbosity(optuna.logging.WARNING)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Truncate Trace

In [None]:
def case_trun(log, min_thres, max_thres_p):
    """Drop cases that are too short or too long.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log; cases are identified by the module-level ``caseid`` column.
    min_thres : number
        Requested lower bound on the number of events per case.
    max_thres_p : float
        Quantile (0..1) of the case-length distribution used as upper bound.

    Returns
    -------
    (log, min_len, max_len)
        The filtered log with a fresh 0..n-1 index, and the bounds that were
        applied. A case is kept when ``min_len < length <= max_len``.
    """
    events_per_case = log[caseid].value_counts()
    lengths = events_per_case.values

    max_len = np.quantile(lengths, max_thres_p)
    # If a quarter of the cases are already shorter than the requested
    # threshold, fall back to a lower bound of 2.
    min_len = 2 if np.quantile(lengths, 0.25) < min_thres else min_thres

    within_bounds = (events_per_case > min_len) & (events_per_case <= max_len)
    kept_cases = events_per_case[within_bounds].index.tolist()

    log = log[log[caseid].isin(kept_cases)]
    log = log.reset_index(drop=True)
    return log, min_len, max_len

# Time feature

In [None]:
def time_feat(log, comb):
    """Select which time features stay active by zeroing out the others.

    ``comb`` is one of:
      * ``'all'``      - keep every time column untouched;
      * ``'tssc_tsp'`` - zero the ``wk`` and ``tmd`` columns;
      * ``'none'``     - zero all four time columns.

    The frame is modified in place and returned. Any other ``comb`` prints
    "Wrong Input" and returns ``None``.
    """
    if comb == 'all':
        return log

    if comb == 'tssc_tsp':
        muted_cols = [wk, tmd]
    elif comb == 'none':
        muted_cols = [wk, tmd, tssc, tsp]
    else:
        print("Wrong Input")
        return None

    log[muted_cols] = 0
    return log

# Sequence Encoding

In [None]:
def seq_enc(log, method, win_size, pad_size):
    """Turn every case into one model input per prefix position.

    For each case (grouped by the module-level ``caseid`` column) and each
    position ``ts`` in ``range(win_size, len(case))`` one input is emitted and
    tagged with a running integer in the ``'Input ID'`` column.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log, rows of a case in chronological order.
    method : {'cont', 'prfx', 'se'}
        * ``'cont'`` - sliding window: the ``win_size`` events before ``ts``.
        * ``'prfx'`` - the full prefix ``[:ts]`` padded with zero rows (activity
          ``'zero_pad'``) up to ``int(pad_size)`` rows.
        * ``'se'``   - single event: only the event at ``ts - 1``.
    win_size : int
        Shortest prefix that is encoded and, for ``'cont'``, the window length.
        (Previously ignored: the loop read the global ``min_size`` and the
        window was hard-coded to 5, which produced empty or wrapped slices
        whenever the minimum length was smaller than 5.)
    pad_size : number
        Target length for ``'prfx'`` padding. (Previously ignored in favour of
        the global ``max_size``.)

    Returns
    -------
    pandas.DataFrame
        All encoded inputs stacked, with the extra ``'Input ID'`` column. An
        unknown ``method`` prints "Wrong Input" and yields an empty frame.
    """
    input_id = 'Input ID'
    out_cols = log.columns.tolist() + [input_id]

    if method not in ('cont', 'prfx', 'se'):
        print("Wrong Input")
        return pd.DataFrame(columns=out_cols)

    # Collect the pieces and concatenate once; growing a DataFrame with
    # pd.concat inside the loop is quadratic in the number of inputs.
    pieces = []
    input_idx = 0
    for _, trace in log.groupby(caseid):
        for ts in range(win_size, len(trace)):
            if method == 'cont':
                piece = trace.iloc[ts - win_size:ts].copy()
            elif method == 'prfx':
                piece = trace.iloc[:ts].copy()
                n_pad = int(pad_size - piece.shape[0])
                if n_pad > 0:
                    pad_rows = pd.DataFrame(
                        np.zeros((n_pad, piece.shape[1])), columns=piece.columns
                    )
                    pad_rows[act] = 'zero_pad'
                    piece = pd.concat([piece, pad_rows], ignore_index=True)
                # Padding rows must carry the case id as well.
                piece[caseid] = trace[caseid].iloc[ts]
            else:  # 'se'
                piece = trace.iloc[[ts - 1]].copy()

            piece[input_id] = input_idx
            pieces.append(piece)
            input_idx += 1

    if not pieces:
        return pd.DataFrame(columns=out_cols)
    return pd.concat(pieces, ignore_index=True)

# MinMax Scaler

In [None]:
def time_minmax(log, comb):
    """Min-max scale the four time columns (``wk``, ``tmd``, ``tssc``, ``tsp``).

    Every accepted ``comb`` value (``'all'``, ``'tssc_tsp'``, ``'none'``) is
    scaled the same way; the argument only acts as a validity check. The
    frame is modified in place and returned. Any other value prints
    "Wrong Input" and returns ``None``.
    """
    if comb not in ('all', 'tssc_tsp', 'none'):
        print("Wrong Input")
        return None

    time_cols = [wk, tmd, tssc, tsp]
    log[time_cols] = MinMaxScaler().fit_transform(log[time_cols])
    return log



# Event Encoding1

In [None]:
def evt_enc1(log, comb):
    """Encode the activity column of a sequence-encoded log.

    Parameters
    ----------
    log : pandas.DataFrame
        Output of the sequence encoding; must contain the module-level ``act``
        column and, for ``'frq'``, ``'Input ID'``, ``label``, ``caseid`` and
        the four time columns.
    comb : {'emb', 'oh', 'frq'}
        * ``'emb'`` - replace each activity by an integer index (order of first
          appearance). The input frame is modified in place.
        * ``'oh'``  - one-hot encode the activity column.
        * ``'frq'`` - one-hot encode, then aggregate per ``'Input ID'``:
          activity counts and time columns are summed, ``label`` and ``caseid``
          take the group maximum. The result has one row per input.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. An unrecognised ``comb`` returns ``log`` unchanged.
    """
    if comb == 'emb':
        act_to_idx = {a: i for i, a in enumerate(log[act].unique())}
        log[act] = log[act].map(act_to_idx)
    elif comb in ('oh', 'frq'):
        org_col = set(log.columns)
        # get_dummies returns a new frame, so the caller's log is untouched.
        log = pd.get_dummies(log, columns=[act], dtype=int)
        if comb == 'frq':
            # Keep the dummy columns in frame order; going through a set made
            # the column order depend on string hashing and vary between runs.
            oh_act = [c for c in log.columns if c not in org_col]
            time_cols = [wk, tmd, tssc, tsp]
            frq_df1 = log[[label, 'Input ID', caseid]].groupby('Input ID').max()
            frq_df2 = log[oh_act + ['Input ID'] + time_cols].groupby('Input ID').sum()
            log = frq_df1.join(frq_df2).reset_index()
            # Guarantee every time column exists after the aggregation.
            for col in time_cols:
                if col not in log.columns:
                    log[col] = 0
    return log

# Train Test split

In [None]:
def set_split(log1, log2, ratio):
    """Split an encoded log into train / test / validation sets by case.

    Parameters
    ----------
    log1 : pandas.DataFrame
        The fully encoded log that is actually split and returned.
    log2 : pandas.DataFrame
        A log that still has the raw activity column (``act``); it is used
        only to detect activities that occur in test cases but in no training
        case.
    ratio : float
        Fraction of the (shuffled) cases assigned to training before the
        validation split.

    Returns
    -------
    (train_log, test_log, val_log)
        Row subsets of ``log1``. Cases containing an activity unseen in
        training are removed from all three sets. A quarter of the training
        cases (rounded) is moved to validation.

    Notes
    -----
    Re-seeds the global ``random`` generator with 42 so the split is
    reproducible. ``log1`` is no longer modified in place.
    """
    random.seed(42)
    shuffled_cases = list(log1[caseid].unique())
    random.shuffle(shuffled_cases)
    train_cases = shuffled_cases[:int(len(shuffled_cases) * ratio)]

    in_train_raw = log2[caseid].isin(train_cases)
    train_acts = set(log2.loc[in_train_raw, act].unique())
    test_acts = set(log2.loc[~in_train_raw, act].unique())

    # Activities that appear only on the test side. Plain set difference is
    # used instead of comparing sorted lists, which raised TypeError when
    # labels mixed ints with the 'zero_pad' string.
    unseen_acts = test_acts - train_acts
    if unseen_acts:
        affected_cases = log2.loc[log2[act].isin(list(unseen_acts)), caseid].values
        log1 = log1[~log1[caseid].isin(affected_cases)]

    in_train = log1[caseid].isin(train_cases)
    train_log = log1[in_train]
    test_log = log1[~in_train]

    n_val = int(round(train_log[caseid].nunique() / 4, 0))
    val_case = random.sample(list(train_log[caseid].unique()), n_val)
    in_val = train_log[caseid].isin(val_case)
    val_log = train_log[in_val]
    train_log = train_log[~in_val]
    return train_log, test_log, val_log

# DataLoader

In [None]:
def tensorset(log, comb, cols):
    """Build a shuffled ``DataLoader`` (batch size 16) from an encoded log.

    Rows are grouped by ``'Input ID'``; each group becomes one sample made of
    three tensors ``(activity features, time features, label)``:

    * ``comb == 'emb'`` - activity feature is the ``act`` column (int64); the
      label holds the distinct label values of the group.
    * ``comb in ('oh', 'frq')`` - activity feature is ``cols`` flattened
      row-major (float); the label is the most common label in the group.

    The time tensor is always the four time columns flattened row-major.
    """
    time_cols = [wk, tmd, tssc, tsp]
    act_rows, time_rows, label_rows = [], [], []

    for _, grp in log.groupby('Input ID'):
        if comb == 'emb':
            act_rows.append(grp[act].values.reshape(1, -1).tolist())
            distinct_labels = np.array(list(set(grp[label].values)))
            label_rows.append(distinct_labels.reshape(1, -1).tolist())
        elif comb == 'oh' or comb == 'frq':
            act_rows.append(grp[cols].values.reshape(1, -1).tolist())
            majority = Counter(grp[label].values.tolist()).most_common(1)[0][0]
            label_rows.append([[majority]])
        else:
            continue
        time_rows.append(grp[time_cols].values.reshape(1, -1).tolist())

    # Only the dtype of the activity tensor depends on the encoding.
    if comb == 'emb':
        act_dtype = torch.int64
    elif comb == 'oh' or comb == 'frq':
        act_dtype = torch.float

    act_tensor = torch.tensor(act_rows, dtype=act_dtype)
    label_tensor = torch.tensor(label_rows, dtype=torch.float)
    time_tensor = torch.tensor(time_rows, dtype=torch.float)
    dataset = TensorDataset(act_tensor, time_tensor, label_tensor)
    return DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
def mldataset(log):
    """Flatten an encoded log into ``(X, y)`` arrays for the classical models.

    Each ``'Input ID'`` group yields one feature row (the module-level
    ``flat_col`` columns flattened row-major) and one target (the most common
    ``label`` value of the group, wrapped in a length-1 list).
    """
    features, targets = [], []
    for _, grp in log.groupby('Input ID'):
        majority = Counter(grp[label].values.tolist()).most_common(1)[0][0]
        targets.append([majority])
        features.append(grp[flat_col].values.reshape(-1).tolist())
    return np.array(features), np.array(targets)

# Model

In [None]:
class MyLSTMModel(nn.Module):
    """Stack of (LSTM, BatchNorm1d) pairs followed by a linear read-out.

    ``comb3`` selects how the flattened activity tensor is interpreted:
    ``'oh'`` reshapes it back into a sequence, ``'frq'`` uses it as is.
    For any other value no recurrent layers are created.
    """

    def __init__(self, act_size, time_feature_dim, hidden_dim, output_dim, num_layers, comb2, comb3):
        super().__init__()
        self.mod = comb2
        self.evmod = comb3
        self.main_layers = nn.ModuleList()

        if self.evmod in ('oh', 'frq'):
            for depth in range(num_layers):
                if depth > 0:
                    in_dim = hidden_dim
                elif self.evmod == 'oh':
                    # The time tensor carries 4 values per step, so
                    # time_feature_dim / 4 is the sequence length and
                    # act_size / seq_len is the per-step feature width.
                    in_dim = int(act_size/(time_feature_dim/4))
                else:
                    in_dim = act_size
                self.main_layers.append(nn.LSTM(in_dim, hidden_dim, batch_first=True))
                self.main_layers.append(nn.BatchNorm1d(hidden_dim))

        self.fc = nn.Linear(hidden_dim, output_dim)
        self.actf = nn.Tanh()

        # Xavier-initialise the input and recurrent weight matrices.
        for layer in self.main_layers:
            if not isinstance(layer, nn.LSTM):
                continue
            for pname, weight in layer.named_parameters():
                if 'weight_ih' in pname or 'weight_hh' in pname:
                    nn.init.xavier_uniform_(weight.data)

    def forward(self, feature1, feature2):
        """Return the read-out of the last time step, shape (batch, output_dim)."""
        if self.evmod =='oh':
            seq_len = int(feature2.shape[2]/4)
            step_width = int(feature1.shape[2]/(feature2.shape[2]/4))
            layer_out = feature1.view(feature1.shape[0], seq_len, step_width)
        elif self.evmod == 'frq':
            layer_out = feature1

        stack = list(self.main_layers)
        for lstm, bn in zip(stack[0::2], stack[1::2]):
            layer_out, _ = lstm(layer_out)
            # BatchNorm1d normalises over dim 1, so move channels there and back.
            layer_out = bn(layer_out.permute(0, 2, 1)).permute(0, 2, 1)

        return self.fc(layer_out[:, -1, :])

In [None]:
class MyMLPModel(nn.Module):
    """Stack of (Linear, BatchNorm1d, Tanh) blocks followed by a linear read-out.

    ``forward`` expects a 3-D input (it permutes dims 1 and 2 around the batch
    norm and reads index -1 of dim 1).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.main_layers = nn.ModuleList()

        in_features = input_size
        for _ in range(num_layers):
            self.main_layers.append(nn.Linear(in_features, hidden_size))
            self.main_layers.append(nn.BatchNorm1d(hidden_size))
            in_features = hidden_size

        self.fc = nn.Linear(hidden_size, output_size)
        self.actf = nn.Tanh()

        # Xavier weights and zero biases for the hidden linear layers.
        for layer in self.main_layers:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return the read-out of the last position along dim 1."""
        hidden = x
        stack = list(self.main_layers)
        for linear, bn in zip(stack[0::2], stack[1::2]):
            hidden = linear(hidden)
            # BatchNorm1d normalises over dim 1, so move channels there and back.
            hidden = bn(hidden.permute(0, 2, 1)).permute(0, 2, 1)
            hidden = self.actf(hidden)
        return self.fc(hidden[:, -1, :])

In [None]:
def lstmobjective(trial):
    """Optuna objective: train an LSTM and score it on the validation set.

    Reads the module-level ``act_in_shape``, ``time_in_shape``, ``model_mtd``,
    ``ev_mtd``, ``device``, ``train_loader``, ``valid_loader`` and
    ``model_loc``. The best-by-validation-loss weights are written to
    ``model_loc + 'lstm_model_<params>.pt'`` so ``lstmtest`` can reload them.

    Returns
    -------
    float
        Mean of validation ROC-AUC and accuracy (to be maximised).
    """
    hidden_size = trial.suggest_int('hidden_size', 16, 256,step = 16, log=False)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    lr = trial.suggest_float('lr', 1e-4, 1e-2,step = 5*1e-5, log=False)
    model = MyLSTMModel(act_size = act_in_shape, time_feature_dim=time_in_shape, hidden_dim=hidden_size,
                        output_dim=1, num_layers=num_layers,  comb2 = model_mtd, comb3 = ev_mtd).to(device)
    prstr = '_'.join(str(value) for value in trial.params.values())
    ckpt_path = model_loc+f'lstm_model_{prstr}.pt'
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    early_loss = np.inf  # np.Inf was removed in NumPy 2.0
    early_count = 0
    for epoch in range(50):
        model.train()
        for feature1, feature2, lab_load in train_loader:
            feature1, feature2, lab_load = feature1.to(device), feature2.to(device), lab_load.to(device)
            optimizer.zero_grad()
            # Flatten to (batch,) explicitly: squeeze() collapses a batch of
            # one to a 0-dim tensor and makes logits/targets shapes disagree.
            logits = model(feature1, feature2).reshape(-1)
            loss = criterion(logits, lab_load.reshape(-1).float())
            loss.backward()
            optimizer.step()

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for feature1, feature2, lab_load in valid_loader:
                feature1, feature2, lab_load = feature1.to(device), feature2.to(device), lab_load.to(device)
                logits = model(feature1, feature2).reshape(-1)
                loss = criterion(logits, lab_load.reshape(-1).float())
                valid_loss += loss.item() * feature1.size(0)
        valid_loss = valid_loss / len(valid_loader.dataset)

        if valid_loss < early_loss:
            early_loss = valid_loss
            torch.save(model.state_dict(), ckpt_path)
        else:
            # NOTE(review): the counter is cumulative (never reset on
            # improvement), so training stops after 7 non-improving epochs in
            # total, not 7 in a row. Kept as in the original - confirm intent.
            if early_count > 5:
                break
            early_count += 1

    # Score the best checkpoint on the validation set.
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.eval()
    probs, preds, labels = [], [], []
    with torch.no_grad():
        for feature1, feature2, lab_load in valid_loader:
            feature1, feature2, lab_load = feature1.to(device), feature2.to(device), lab_load.to(device)
            batch_probs = torch.sigmoid(model(feature1, feature2)).reshape(-1)
            probs.extend(batch_probs.cpu().tolist())
            preds.extend(batch_probs.round().cpu().tolist())
            labels.extend(lab_load.reshape(-1).cpu().tolist())

    accuracy = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probs)
    return (auc+accuracy)/2

In [None]:
def mlpobjective(trial):
    """Optuna objective: train an MLP and score it on the validation set.

    Reads the module-level ``act_in_shape``, ``device``, ``train_loader``,
    ``valid_loader`` and ``model_loc``. The best-by-validation-loss weights
    are written to ``model_loc + 'mlp_model_<params>.pt'`` so ``mlptest`` can
    reload them.

    Returns
    -------
    float
        Mean of validation ROC-AUC and accuracy (to be maximised).
    """
    hidden_size = trial.suggest_int('hidden_size', 16, 256,step = 16, log=False)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    lr = trial.suggest_float('lr', 1e-4, 1e-2,step = 5*1e-5, log=False)

    model = MyMLPModel(input_size=act_in_shape, hidden_size=hidden_size, num_layers=num_layers, output_size=1).to(device)

    prstr = '_'.join(str(value) for value in trial.params.values())
    ckpt_path = model_loc+f'mlp_model_{prstr}.pt'
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    early_loss = np.inf  # np.Inf was removed in NumPy 2.0
    early_count = 0
    for epoch in range(50):
        model.train()
        for batch in train_loader:
            # The loaders yield (features, time, label); the MLP only needs
            # the first and last element. Indexing also accepts 2-tuples.
            feature1, lab_load = batch[0].to(device), batch[-1].to(device)
            optimizer.zero_grad()
            # Flatten to (batch,) explicitly: squeeze() collapses a batch of
            # one to a 0-dim tensor and makes logits/targets shapes disagree.
            logits = model(feature1).reshape(-1)
            loss = criterion(logits, lab_load.reshape(-1).float())
            loss.backward()
            optimizer.step()

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in valid_loader:
                feature1, lab_load = batch[0].to(device), batch[-1].to(device)
                logits = model(feature1).reshape(-1)
                loss = criterion(logits, lab_load.reshape(-1).float())
                valid_loss += loss.item() * feature1.size(0)
        valid_loss = valid_loss / len(valid_loader.dataset)

        if valid_loss < early_loss:
            early_loss = valid_loss
            torch.save(model.state_dict(), ckpt_path)
        else:
            # NOTE(review): the counter is cumulative (never reset on
            # improvement), so training stops after 7 non-improving epochs in
            # total, not 7 in a row. Kept as in the original - confirm intent.
            if early_count > 5:
                break
            early_count += 1

    # Score the best checkpoint on the validation set.
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.eval()
    probs, preds, labels = [], [], []
    with torch.no_grad():
        for batch in valid_loader:
            feature1, lab_load = batch[0].to(device), batch[-1].to(device)
            batch_probs = torch.sigmoid(model(feature1)).reshape(-1)
            probs.extend(batch_probs.cpu().tolist())
            preds.extend(batch_probs.round().cpu().tolist())
            labels.extend(lab_load.reshape(-1).cpu().tolist())

    accuracy = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probs)
    return (auc+accuracy)/2

In [None]:
def xgbobjective(trial):
    """Optuna objective for XGBoost.

    Fits on the module-level ``train_x`` / ``train_y``, saves the model to
    ``model_loc + 'xgb_model_<params>.json'`` and returns the mean of
    validation accuracy and ROC-AUC on ``valid_x`` / ``valid_y``.
    """
    # Fixed settings first, then the searched hyper-parameters. The order of
    # the suggest_* calls fixes the order of trial.params, which the file
    # name below is built from.
    booster_kwargs = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'max_depth': trial.suggest_int('max_depth', 1, 32),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1.0,step = 5*1e-5, log=False),
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    booster = xgb.XGBClassifier(**booster_kwargs)
    booster.fit(train_x, train_y.ravel())

    prstr = '_'.join(map(str, trial.params.values()))
    booster.save_model(model_loc+f'xgb_model_{prstr}.json')

    valid_labels = booster.predict(valid_x)
    valid_scores = booster.predict_proba(valid_x)[:, 1]
    auc = roc_auc_score(valid_y, valid_scores)
    acc = accuracy_score(valid_y, valid_labels)
    return (acc+auc)/2

def adbobjective(trial):
    """Optuna objective for AdaBoost over decision trees.

    Fits on the module-level ``train_x`` / ``train_y``, dumps the model to
    ``model_loc + 'adb_model_<params>.pkl'`` and returns the mean of
    validation accuracy and ROC-AUC on ``valid_x`` / ``valid_y``.
    """
    max_depth = trial.suggest_int('max_depth', 1, 32)
    n_estimators = trial.suggest_int('n_estimators', 10, 300,step = 5, log=False)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1.0,step = 5*1e-5, log=False)

    # Previously this tree was built and then ignored while an identical
    # second one was passed to AdaBoost. It is passed positionally so the
    # call does not depend on the keyword name of that argument.
    base_estimator = DecisionTreeClassifier(max_depth=max_depth)

    clf = AdaBoostClassifier(
        base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42)

    clf.fit(train_x, train_y.ravel())

    prstr = '_'.join(str(value) for value in trial.params.values())
    joblib.dump(clf, model_loc+f'adb_model_{prstr}.pkl')

    preds_acc = clf.predict(valid_x)
    preds_auc = clf.predict_proba(valid_x)[:, 1]
    auc = roc_auc_score(valid_y, preds_auc)
    acc = accuracy_score(valid_y, preds_acc)
    return (acc+auc)/2


def rfobjective(trial):
    """Optuna objective for a random forest.

    Fits on the module-level ``train_x`` / ``train_y``, dumps the model to
    ``model_loc + 'rf_model_<params>.pkl'`` and returns the mean of
    validation accuracy and ROC-AUC on ``valid_x`` / ``valid_y``.
    """
    # The order of the suggest_* calls fixes the order of trial.params,
    # which the file name below is built from.
    forest_kwargs = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 300,step = 5, log=False),
        'max_depth': trial.suggest_int('max_depth', 1, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
    }

    forest = RandomForestClassifier(random_state=42, **forest_kwargs)
    forest.fit(train_x, train_y.ravel())

    prstr = '_'.join(map(str, trial.params.values()))
    joblib.dump(forest, model_loc+f'rf_model_{prstr}.pkl')

    valid_labels = forest.predict(valid_x)
    valid_scores = forest.predict_proba(valid_x)[:, 1]
    auc = roc_auc_score(valid_y, valid_scores)
    acc = accuracy_score(valid_y, valid_labels)
    return (acc+auc)/2


def dtobjective(trial):
    """Optuna objective for a single decision tree.

    Fits on the module-level ``train_x`` / ``train_y``, dumps the model to
    ``model_loc + 'dt_model_<params>.pkl'`` and returns the mean of
    validation accuracy and ROC-AUC on ``valid_x`` / ``valid_y``.
    """
    # The order of the suggest_* calls fixes the order of trial.params,
    # which the file name below is built from.
    tree_kwargs = {
        'max_depth': trial.suggest_int('max_depth', 1, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }

    tree = DecisionTreeClassifier(random_state=42, **tree_kwargs)
    tree.fit(train_x, train_y.ravel())

    prstr = '_'.join(map(str, trial.params.values()))
    joblib.dump(tree, model_loc+f'dt_model_{prstr}.pkl')

    valid_labels = tree.predict(valid_x)
    valid_scores = tree.predict_proba(valid_x)[:, 1]
    auc = roc_auc_score(valid_y, valid_scores)
    acc = accuracy_score(valid_y, valid_labels)
    return (acc+auc)/2

# EarlyStop

In [None]:
class EarlyStoppingCallback(object):
    """Optuna callback that stops a study once its best value stagnates.

    After every trial the study's ``best_value`` is compared with the best
    value seen by this callback. When it has not improved for
    ``early_stopping_rounds`` consecutive trials, ``study.stop()`` is called.

    Parameters
    ----------
    early_stopping_rounds : int
        Number of consecutive non-improving trials tolerated.
    direction : {'minimize', 'maximize'}
        Optimisation direction of the study.

    Raises
    ------
    ValueError
        If ``direction`` is neither ``'minimize'`` nor ``'maximize'``.
    """

    def __init__(self, early_stopping_rounds: int, direction: str) -> None:
        self.early_stopping_rounds = early_stopping_rounds
        self._iter = 0  # consecutive trials without improvement
        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # The exception used to be constructed but never raised, leaving
            # the callback without _operator/_score and failing later.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study: "optuna.Study", trial: "optuna.Trial") -> None:
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1
        if self._iter >= self.early_stopping_rounds:
            study.stop()

# Optuna Test

## lstm

In [None]:
def lstmtest(lstmparam):
    """Evaluate the saved LSTM for the given hyper-parameters on the test set.

    ``lstmparam`` is the ``params`` dict of an Optuna trial; its values (in
    order) identify the checkpoint written by ``lstmobjective``. Reads the
    module-level ``test_loader``, ``act_in_shape``, ``time_in_shape``,
    ``model_mtd``, ``ev_mtd``, ``device`` and ``model_loc``.

    Returns
    -------
    (auc, acc)
        ROC-AUC and accuracy on the test loader.
    """
    prteststr = '_'.join(str(value) for value in lstmparam.values())
    model = MyLSTMModel(act_size=act_in_shape, time_feature_dim=time_in_shape, hidden_dim=lstmparam['hidden_size'],
                        num_layers=lstmparam['num_layers'], output_dim=1,  comb2 = model_mtd, comb3 = ev_mtd).to(device)
    model.load_state_dict(torch.load(model_loc+f'lstm_model_{prteststr}.pt', map_location=device))
    model.eval()

    probs, preds, labels = [], [], []
    with torch.no_grad():
        for feature1, feature2, lab_load in test_loader:
            feature1, feature2, lab_load = feature1.to(device), feature2.to(device), lab_load.to(device)
            # reshape(-1) keeps a 1-D tensor even for a batch of one; the
            # former squeeze().tolist() produced a bare float there and
            # list.extend() crashed on it.
            batch_probs = torch.sigmoid(model(feature1, feature2)).reshape(-1)
            probs.extend(batch_probs.cpu().tolist())
            preds.extend(batch_probs.round().cpu().tolist())
            labels.extend(lab_load.reshape(-1).cpu().tolist())

    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probs)
    return auc,acc

## mlp

In [None]:
def mlptest(mlpparam):
    """Evaluate the saved MLP for the given hyper-parameters on the test set.

    ``mlpparam`` is the ``params`` dict of an Optuna trial; its values (in
    order) identify the checkpoint written by ``mlpobjective``. Reads the
    module-level ``test_loader``, ``act_in_shape``, ``device`` and
    ``model_loc``.

    Returns
    -------
    (auc, acc)
        ROC-AUC and accuracy on the test loader.
    """
    prteststr = '_'.join(str(value) for value in mlpparam.values())
    model = MyMLPModel(input_size=act_in_shape, hidden_size=mlpparam['hidden_size'], num_layers=mlpparam['num_layers'], output_size=1).to(device)
    model.load_state_dict(torch.load(model_loc+f'mlp_model_{prteststr}.pt', map_location=device))
    model.eval()

    probs, preds, labels = [], [], []
    with torch.no_grad():
        for batch in test_loader:
            # The loaders yield (features, time, label); unpacking into two
            # names raised ValueError. Indexing also accepts 2-tuples.
            feature1, lab_load = batch[0].to(device), batch[-1].to(device)
            # reshape(-1) keeps a 1-D tensor even for a batch of one.
            batch_probs = torch.sigmoid(model(feature1)).reshape(-1)
            probs.extend(batch_probs.cpu().tolist())
            preds.extend(batch_probs.round().cpu().tolist())
            labels.extend(lab_load.reshape(-1).cpu().tolist())

    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probs)
    return auc,acc

## machine learning

In [None]:
def xgbtest(xgbparam):
    """Reload the XGBoost model saved for ``xgbparam`` and score it on the test set.

    Returns ``(auc, acc)`` computed on the module-level ``test_x`` / ``test_y``.
    """
    prteststr = '_'.join(map(str, xgbparam.values()))
    booster = xgb.XGBClassifier()
    booster.load_model(model_loc+f'xgb_model_{prteststr}.json')
    test_scores = booster.predict_proba(test_x)[:, 1]
    test_labels = booster.predict(test_x)
    return roc_auc_score(test_y, test_scores), accuracy_score(test_y, test_labels)

def adbtest(adbparam):
    """Reload the AdaBoost model saved for ``adbparam`` and score it on the test set.

    Returns ``(auc, acc)`` computed on the module-level ``test_x`` / ``test_y``.
    """
    prteststr = '_'.join(map(str, adbparam.values()))
    booster = joblib.load(model_loc+f'adb_model_{prteststr}.pkl')
    test_scores = booster.predict_proba(test_x)[:, 1]
    test_labels = booster.predict(test_x)
    return roc_auc_score(test_y, test_scores), accuracy_score(test_y, test_labels)

def rftest(rfparam):
    """Reload the random forest saved for ``rfparam`` and score it on the test set.

    Returns ``(auc, acc)`` computed on the module-level ``test_x`` / ``test_y``.
    """
    prteststr = '_'.join(map(str, rfparam.values()))
    forest = joblib.load(model_loc+f'rf_model_{prteststr}.pkl')
    test_scores = forest.predict_proba(test_x)[:, 1]
    test_labels = forest.predict(test_x)
    return roc_auc_score(test_y, test_scores), accuracy_score(test_y, test_labels)

def dttest(dtparam):
    """Reload the decision tree saved for ``dtparam`` and score it on the test set.

    Returns ``(auc, acc)`` computed on the module-level ``test_x`` / ``test_y``.
    """
    prteststr = '_'.join(map(str, dtparam.values()))
    tree = joblib.load(model_loc+f'dt_model_{prteststr}.pkl')
    test_scores = tree.predict_proba(test_x)[:, 1]
    test_labels = tree.predict(test_x)
    return roc_auc_score(test_y, test_scores), accuracy_score(test_y, test_labels)


# Loop

In [None]:
# Run configuration: which event log to use and where things live on disk.
eventlog = 'SEPSIS.csv'
model_loc =  'model_bin/'
dir =  os.getcwd()
data_loc = '/datasets/'

# Column-name constants shared by every function in this notebook.
caseid = 'Case ID'
label = 'label'
tssc = 'timesincecasestart'
tsp = 'timesincelastevent'
wk = 'weekday'
tmd = 'timesincemidnight'
inputid = 'Input ID'

# The activity column is named differently in BPIC11.
if eventlog == 'BPIC11.csv':
    act = 'Activity code'
elif eventlog in ('BPIC12.csv', 'BPIC15.csv', 'SEPSIS.csv'):
    act = 'Activity'
else:
    raise Exception("Wrong Eventlog Input")

# Load the log and keep only the columns the pipeline uses.
df = pd.read_csv(dir+data_loc+eventlog,delimiter=';')
df = df[[caseid, label, act, tssc, tsp, wk, tmd]]

# Binary target: 'regular' -> 1, 'deviant' -> 0.
df[label] = df[label].replace({'regular': 1, 'deviant': 0})

In [None]:
# Option grids explored by the experiment loop.
sq_mtd_bin = ['cont','prfx','se']                      # sequence encodings
time_bin = ['all','tssc_tsp','none']                   # time-feature subsets
ev_mtd_bin = ['oh','frq']                              # event encodings
model_mtd_bin = ['dt','rf','xgb','adb','lstm','mlp']   # classifiers

# Remove very short / very long cases; the returned bounds are reused as
# window and padding sizes by the sequence encoding.
case_trun_df, min_size, max_size = case_trun(df.copy(deep=True), min_thres=5, max_thres_p=0.75)

In [None]:
# Result columns: one entry per evaluated (sequence, event, time, model) setup.
time_list = []
sq_list = []
ev_list = []

model_list = []
acc_list = []
auc_list = []

# model name -> (Optuna objective, test-set evaluator, early-stopping patience
# in trials). Replaces six copy-pasted branches that differed only in these.
model_runners = {
    'mlp': (mlpobjective, mlptest, 5),
    'lstm': (lstmobjective, lstmtest, 5),
    'xgb': (xgbobjective, xgbtest, 15),
    'adb': (adbobjective, adbtest, 15),
    'rf': (rfobjective, rftest, 15),
    'dt': (dtobjective, dttest, 15),
}
non_feature_cols = (label, 'Input ID', caseid)

for sq_mtd in sq_mtd_bin:
    seq_df = seq_enc(copy.deepcopy(case_trun_df),sq_mtd,min_size,max_size)
    for ev_mtd in ev_mtd_bin:
        evt_df1 = evt_enc1(copy.deepcopy(seq_df),ev_mtd)
        for time_mtd in time_bin:
            time_df = time_feat(copy.deepcopy(evt_df1),time_mtd)
            time_df = time_minmax(copy.deepcopy(time_df),time_mtd)

            train_df, test_df, valid_df = set_split(copy.deepcopy(time_df),copy.deepcopy(seq_df),0.8)
            # Keep the frame's own column order. Building this list through a
            # set made the feature order depend on string hashing, so it
            # changed from one interpreter run to the next.
            flat_col = [c for c in time_df.columns if c not in non_feature_cols]

            # Flat arrays for the classical models (read as globals by the
            # objectives and test helpers).
            train_x, train_y = mldataset(train_df)
            valid_x, valid_y = mldataset(valid_df)
            test_x, test_y = mldataset(test_df)

            # Tensor loaders for the neural models.
            train_loader = tensorset(train_df,ev_mtd,flat_col)
            test_loader = tensorset(test_df,ev_mtd,flat_col)
            valid_loader = tensorset(valid_df,ev_mtd,flat_col)

            # Peek at one batch to learn the input widths the models need.
            for a,t,l in train_loader:
                act_in_shape = a.shape[2]
                time_in_shape = t.shape[2]
                break

            for model_mtd in model_mtd_bin:
                if model_mtd not in model_runners:
                    print("wrong input")
                    break

                objective, evaluate, patience = model_runners[model_mtd]
                early_stopping = EarlyStoppingCallback(patience, direction='maximize')
                study = optuna.create_study(direction='maximize')
                study.optimize(objective, callbacks=[early_stopping], timeout=600)
                trial = study.best_trial
                best_auc,best_acc = evaluate(trial.params)
                auc_list.append(best_auc)
                acc_list.append(best_acc)

                time_list.append(time_mtd)
                sq_list.append(sq_mtd)
                ev_list.append(ev_mtd)
                model_list.append(model_mtd)

                print(sq_mtd,ev_mtd,time_mtd,model_mtd,best_auc,best_acc)

In [None]:
# Collect the per-configuration results into one table.
config = pd.DataFrame({
    'Sequence': sq_list,
    'Event': ev_list,
    'Time': time_list,
    'Models': model_list,
    'AUC': auc_list,
    'Accuracy': acc_list,
})

# Make sure the target folder exists before writing.
outcome_dir = dir + '/config_outcome/' + eventlog[:-4]
os.makedirs(outcome_dir, exist_ok=True)

with open(file= outcome_dir + '/{}_outcome.pickle'.format(eventlog[:-4]), mode='wb') as f:
    # Persist the results table. The raw event log ``df`` used to be dumped
    # here by mistake, so the experiment outcome was never saved.
    pickle.dump(config, f)
