In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [2]:
import os
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
import time
import joblib

In [11]:
def get_lead_ts(log):
    """Attach to every event the timestamp of the next event of the same case.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log containing the columns ``case_id``, ``event_id``,
        ``activity`` and ``ts``.

    Returns
    -------
    pandas.DataFrame
        ``log`` merged on ``event_id`` with an additional ``ts_next`` column.
        ``ts_next`` is missing for the last event of every case and for
        every ``'<EOS>'`` event. The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's log is left untouched.
    ordered = log.sort_values(['case_id', 'event_id'])

    # Shift inside each case: a plain ``shift(-1)`` over the whole frame would
    # hand the first timestamp of the *next* case to the last event of a case
    # whenever that case does not end with '<EOS>'.
    ordered['ts_next'] = ordered.groupby('case_id')['ts'].shift(-1)

    # End-of-sequence markers never have a successor.
    ordered.loc[ordered['activity'] == '<EOS>', 'ts_next'] = np.nan

    return log.merge(ordered[['event_id', 'ts_next']], on='event_id')

def get_all_configs(conf_dir = 'configs/lead_configs/'):
    """Load and merge every configuration file found in ``conf_dir``.

    Parameters
    ----------
    conf_dir : str
        Directory to scan. Only entries whose file name contains
        ``'configuration'`` are loaded. A trailing path separator is optional.

    Returns
    -------
    dict
        The union of all loaded dictionaries. Files are processed in sorted
        file-name order, so when two files define the same key the
        alphabetically later file wins.
    """
    configurations = {}

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # of overlapping keys reproducible across machines and runs.
    for file_name in sorted(os.listdir(conf_dir)):

        if 'configuration' in file_name:
            # NOTE(security): joblib.load unpickles the file, which can execute
            # arbitrary code -- only point conf_dir at trusted files.
            # os.path.join keeps the path valid when conf_dir has no trailing
            # separator.
            configurations.update(joblib.load(os.path.join(conf_dir, file_name)))

    return configurations


def fill_na_configs(configs, na_val = 60*24):
    """Replace the third entry of configuration tuples whose first entry is missing.

    Parameters
    ----------
    configs : dict
        Two-level mapping ``{location: {key: values}}`` where ``values`` is an
        indexable with at least two entries.
    na_val : int, optional
        Value written into the third slot when ``values[0]`` is NA.
        Defaults to ``60*24``.

    Returns
    -------
    dict
        The same ``configs`` object, modified in place. Entries that are
        touched are rewritten as the 3-tuple ``(values[0], values[1], na_val)``.
    """
    # Iterate over the nested dicts directly: looking the location up again
    # through its string representation fails for non-string keys.
    for location_cfg in configs.values():

        for key, val in location_cfg.items():

            if pd.isna(val[0]):
                # Re-assigning an existing key does not resize the dict, so it
                # is safe while iterating over it.
                location_cfg[key] = (val[0], val[1], na_val)

    return configs
            
            

def get_lead_location(log, location):
    """Count which activities occur directly after ``location``.

    "Directly after" is defined through the event ids: for every event whose
    activity equals ``location`` the event with ``event_id + 1`` is looked up.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with the columns ``event_id`` and ``activity``.
    location : str
        Activity whose successors are counted.

    Returns
    -------
    pandas.Series
        ``value_counts`` of the successor activities (most frequent first).
    """
    ids_at_location = log.loc[log['activity'] == location, 'event_id']
    successor_ids = np.array(ids_at_location) + 1

    successor_activities = log.loc[log['event_id'].isin(successor_ids), 'activity']
    return successor_activities.value_counts()


def get_lead_location_dict(log, lead):
    """Map every activity in ``log`` to its most frequent successor activities.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with the columns ``event_id`` and ``activity``.
    lead : int
        Zero-based rank of the successor of interest; the ``lead + 1`` most
        frequent successors are kept per activity.

    Returns
    -------
    dict
        ``{activity: [successor, ...]}`` in ``value_counts`` order of the
        activities. ``'<EOS>'`` maps to ``['NoLeadLocation']``.
    """
    def _top_successors(activity):
        # The end-of-sequence marker has no successor by definition.
        if activity == '<EOS>':
            return ['NoLeadLocation']
        ranked = get_lead_location(log, activity)
        return list(ranked.index[:lead + 1])

    return {
        activity: _top_successors(activity)
        for activity in log.activity.value_counts().index
    }



def comp_loadstate_optdur(x, previous, configs, lead):
    """Count the events of ``previous`` that fall into a look-back window before ``x``.

    The window is ``[x.ts - minutes, x.ts)`` where ``minutes`` is the third
    entry of the ``lead``-th configuration of ``x``'s activity.

    Parameters
    ----------
    x : pandas.Series
        One event row providing ``event_id``, ``activity`` and ``ts``.
    previous : pandas.DataFrame
        Events to count; its index is compared against timestamps and it must
        have a ``ts`` column.
    configs : dict
        ``{activity: {key: (.., .., minutes)}}``.
    lead : int
        Position (in key order) of the configuration entry to use.

    Returns
    -------
    pandas.Series
        ``[event_id, count]``; the count is 0 when ``previous`` is empty.
    """
    if len(previous) == 0:
        return pd.Series([x.event_id, 0])

    location_cfg = configs[x['activity']]
    lead_key = list(location_cfg.keys())[lead]

    offset = pd.DateOffset(minutes=0)
    window = pd.DateOffset(minutes=location_cfg[lead_key][2])

    window_start = x['ts'] - window - offset
    window_end = x['ts'] - offset

    in_window = (previous.index >= window_start) & (previous.index < window_end)
    return pd.Series([x.event_id, previous.loc[in_window].ts.count()])
    
    
def comp_loadstate_activecases(x, previous):
    """Count the events of ``previous`` that are active at the time of ``x``.

    An event counts as active when ``ts <= x.ts <= ts_next``; rows with a
    missing ``ts_next`` never match.

    Parameters
    ----------
    x : pandas.Series
        One event row providing ``event_id`` and ``ts``.
    previous : pandas.DataFrame
        Events with the columns ``ts`` and ``ts_next``.

    Returns
    -------
    pandas.Series
        ``[event_id, count]``.
    """
    already_started = previous.ts <= x.ts
    not_yet_left = previous.ts_next >= x.ts
    active = previous.loc[already_started & not_yet_left]

    return pd.Series([x.event_id, active.ts.count()])
    
    
def compute_lead_load(log, location, lead_dic, lead, load_state):
    """Compute, for every event at ``location``, the load at its ``lead``-th most likely next location.

    ``lead = 0`` returns the load state at the most likely next location.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with at least ``activity``, ``event_id`` and ``ts``
        (plus ``ts_next`` for ``load_state='actcase'``).
    location : str
        Activity whose events are scored.
    lead_dic : dict
        ``{activity: [successor activities ordered by frequency]}``.
    lead : int
        Zero-based rank of the successor to look at.
    load_state : {'actcase', 'optdur'}
        ``'actcase'`` counts active events at the successor location,
        ``'optdur'`` counts successor events inside a configured look-back
        window (configurations are loaded from disk on every call).

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns ``event_id`` and ``lead_{lead+1}_load``, or
        ``None`` when ``log`` contains no event at ``location``.

    Raises
    ------
    ValueError
        If ``load_state`` is not one of the supported values.
    """
    # Fail early with a clear message instead of an AttributeError on None
    # after all the work below has been done.
    if load_state not in ('actcase', 'optdur'):
        raise ValueError(
            "load_state must be 'actcase' or 'optdur', got {!r}".format(load_state)
        )

    if len(lead_dic[location]) -1 >= lead:
        print(location, '->', lead_dic[location][lead])
        previous = log.loc[log.activity == lead_dic[location][lead]]

    else:
        print(' {} likely next location does not exist for the location {}, return load = 0 for all events:'.format(lead, location))
        # Selects an empty reference frame (unless an activity actually starts
        # with 'NoNextEvent'), so every event gets a load of 0.
        previous = log.loc[log.activity.str.startswith('NoNextEvent')]

    target_log = log.loc[(log.activity == location)]

    if len(target_log) == 0:
        print('target log empty')
        return None

    if load_state == 'actcase':
        load_comp = target_log.apply(lambda x: comp_loadstate_activecases(x, previous), axis=1)

    else:  # 'optdur'
        configs = get_all_configs()
        configs = fill_na_configs(configs)

        load_comp = target_log.apply(lambda x: comp_loadstate_optdur(x, previous, configs, lead), axis=1)

    load_comp.columns = ['event_id', 'lead_{}_load'.format(lead+1)]

    return load_comp
    
def get_lead_loads(log, lead=0, load_state='actcase'):
    """Add the load at the ``lead``-th most likely next location to every event of ``log``.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with at least ``case_id``, ``event_id``, ``activity`` and ``ts``.
    lead : int, optional
        Zero-based rank of the successor location to evaluate (default 0,
        i.e. the most likely next location).
    load_state : {'actcase', 'optdur'}, optional
        Load definition, forwarded to ``compute_lead_load``.

    Returns
    -------
    pandas.DataFrame
        ``log`` (extended by ``ts_next`` for ``'actcase'``) inner-merged on
        ``event_id`` with the new column ``lead_{lead+1}_load``.
    """
    load_col = 'lead_{}_load'.format(lead+1)
    lead_dic = get_lead_location_dict(log, lead)

    if load_state == 'actcase':
        log = get_lead_ts(log)

    # Collect the per-location results and concatenate once:
    # DataFrame.append was removed in pandas 2.0 and copies the whole frame
    # on every iteration.
    parts = []
    for location in tqdm(lead_dic):

        load_comp = compute_lead_load(log, location, lead_dic, lead, load_state)
        # compute_lead_load returns None when there is nothing to score.
        if load_comp is not None:
            parts.append(load_comp)

    if parts:
        load_df = pd.concat(parts, ignore_index=True)
    else:
        load_df = pd.DataFrame(columns=['event_id', load_col])

    return log.merge(load_df, on='event_id')
            

In [9]:
# Read the raw event log and discard its first column.
log_csv = pd.read_csv('evlog.csv', sep=',')
log_csv = log_csv.drop(columns=log_csv.columns[0])
# Let pm4py convert the timestamp columns.
log_csv = dataframe_utils.convert_timestamp_columns_in_df(log_csv)
# Order chronologically and index by timestamp while keeping `ts` as a column.
log_csv = log_csv.sort_values('ts').set_index('ts', drop=False)

In [10]:
# Score every event with the load at its most likely next location (lead=0),
# using the 'optdur' load definition; adds the column `lead_1_load`.
load_log = get_lead_loads(log_csv, lead=0, load_state='optdur')

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))

W_Completeren aanvraag -> W_Completeren aanvraag
W_Nabellen offertes -> W_Nabellen offertes
W_Nabellen incomplete dossiers -> W_Nabellen incomplete dossiers
W_Valideren aanvraag -> W_Nabellen incomplete dossiers
W_Afhandelen leads -> W_Afhandelen leads
A_SUBMITTED -> A_PARTLYSUBMITTED
<BOS> -> A_SUBMITTED
<EOS> -> NoLeadLocation
A_PARTLYSUBMITTED -> A_PREACCEPTED
A_DECLINED -> <EOS>
A_PREACCEPTED -> W_Completeren aanvraag
O_SELECTED -> O_CREATED
O_CREATED -> O_SENT
O_SENT -> W_Nabellen offertes
A_ACCEPTED -> O_SELECTED
A_FINALIZED -> O_CREATED
O_CANCELLED -> O_SELECTED
O_SENT_BACK -> W_Valideren aanvraag
A_CANCELLED -> O_CANCELLED
A_APPROVED -> A_ACTIVATED
A_ACTIVATED -> W_Valideren aanvraag
A_REGISTERED -> A_ACTIVATED
O_ACCEPTED -> A_REGISTERED
O_DECLINED -> A_DECLINED
W_Beoordelen fraude -> W_Beoordelen fraude
W_Wijzigen contractgegevens -> W_Wijzigen contractgegevens



In [12]:
# Mean, median and standard deviation of the lead-1 load.
print(load_log.lead_1_load.mean(), load_log.lead_1_load.median(), load_log.lead_1_load.std())

154.26157351217518 68.0 163.97547369819748


In [8]:
# NOTE(review): same statement as the previous cell, but with an earlier
# execution count (In [8]) and a slightly different recorded mean/std --
# presumably left over from a run before the functions were edited. Confirm
# which result is current and remove the stale cell.
print(load_log.lead_1_load.mean(), load_log.lead_1_load.median(), load_log.lead_1_load.std())

154.2582340987745 68.0 163.97423537584413
