# Objective
Notebook provides a pipeline to compare the results of the simulation model against the true model discovered from the original eventlog

In [118]:
import simpy
from outputs.run_definition import run_definition 
import outputs.process_class as pc
from outputs.utils import process_path_probability
import pickle
import pandas as pd
import pm4py
from pm4py.statistics.variants.log import get as pml
from pm4py.objects.conversion.log.variants import df_to_event_log_nv 

# For simulation

In [119]:
env = simpy.Environment()

with open('outputs/data/variants.pkl', 'rb') as f:
     variants = pickle.load(f)


process_scenario_object = pc.process_definition(env)
process_path_probabilities = process_path_probability(variants).run()
event_log = process_scenario_object.log_list

with open('outputs/data/log_sim.pkl', 'wb') as f:
     pickle.dump(event_log, f)

    
env.process(run_definition(env, process_scenario_object,  process_path_probabilities))
env.run(30000)



In [120]:
log_sim_pm = pd.DataFrame(event_log, columns = ['case_id', 'event_time', 'Activity'])
log_pm = pm4py.format_dataframe(log_sim_pm, case_id = 'case_id', activity_key = 'Activity', timestamp_key = 'event_time')

In [121]:
#process_mode = pm4py.discover_bpmn_inductive(log_pm)

In [122]:
event_log_renamed = log_pm

In [123]:
event_log_renamed

Unnamed: 0,case:concept:name,time:timestamp,concept:name,@@index
30,10,20,aannamelaboratoriumonderzoek,30
49,10,26,aannamelaboratoriumonderzoek,49
55,10,28,creatininespoed,55
61,10,30,natriumvlamfotometrischspoed,61
68,10,32,kaliumvlamfotometrischspoed,68
...,...,...,...,...
1035,99,224,administratieftariefeerstepol,1035
1041,99,226,cytologischonderzoekectocervix,1041
1066,99,232,vervolgconsultpoliklinisch,1066
1073,99,234,administratieftariefeerstepol,1073


In [124]:
event_log_renamed_sim = df_to_event_log_nv.apply(event_log_renamed)

In [125]:
variants_simulation = pml.get_variants_sorted_by_count(pml.get_variants(event_log_renamed_sim))

In [126]:
len(variants_simulation)

96

# For original model

In [127]:
event_log = pm4py.read_xes('Dataset/Hospital_log.xes')

parsing log, completed traces ::   0%|          | 0/1143 [00:00<?, ?it/s]

In [128]:
event_log_pd = pm4py.convert_to_dataframe(event_log)

In [129]:
event_log_pd_filtered = event_log_pd[['concept:name', 'time:timestamp', 'case:concept:name']]

In [130]:
event_log_pd_filtered['concept:name'] = event_log_pd_filtered['concept:name'].str.replace('[^a-zA-Z]', '')

  event_log_pd_filtered['concept:name'] = event_log_pd_filtered['concept:name'].str.replace('[^a-zA-Z]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  event_log_pd_filtered['concept:name'] = event_log_pd_filtered['concept:name'].str.replace('[^a-zA-Z]', '')


In [131]:
event_log = df_to_event_log_nv.apply(event_log_pd_filtered)

In [132]:
variants_original = pml.get_variants_sorted_by_count(pml.get_variants(event_log))

In [133]:
len(variants_original)

981

In [134]:
# Quantifying the accuracy

In [135]:
variants_simulation_component = []
for i in range(len(variants_simulation)):
    variants_simulation_component.append(variants_simulation[i][-2]) 

In [136]:
len(variants_simulation_component)

96

In [137]:
variants_original_component = []
for i in range(len(variants_original)):
    variants_original_component.append(variants_original[i][-2]) 

In [138]:
count_true_positives = 0
for variant in variants_simulation_component:
    if variant in variants_original_component:
        count_true_positives = count_true_positives + 1

In [139]:
count_true_positives

96

In [140]:
T_P_T_real = (count_true_positives/len(variants_original))*100

In [141]:
T_N_T_real = ((len(variants_simulation) - count_true_positives)/len(variants_original))*100

In [142]:
T_P_T_sim = (count_true_positives/len(variants_simulation))*100

In [143]:
T_N_T_sim = ((len(variants_simulation) - count_true_positives)/len(variants_simulation))*100

count of variants that did not show in the simulation

In [144]:
variants_left = ((len(variants_original) - count_true_positives)/len(variants_original))*100 # count of variants that did not show in the simulation

In [145]:
T_P_T_real = (count_true_positives/len(variants_original))*100

In [146]:
T_P_T_real

9.785932721712538

In [147]:
T_N_T_real

0.0

In [148]:
T_P_T_sim

100.0

In [149]:
T_N_T_sim

0.0

In [150]:
variants_left

90.21406727828746