In [1]:
import pm4py
pm4py.__version__

'1.3.0'

# Inductive Miner

> In this question you should discover a model for the given event log with a special focus on the Inductive Miner implemented in PM4Py.

## Data loading

In [2]:
import os
import pandas as pd

PROJ_ROOT = os.path.abspath(os.path.pardir)
data_path = os.path.join(PROJ_ROOT, 'data', 'log.csv')

assert os.path.exists(data_path)

df_log = pd.read_csv(data_path)

df_log.head()

Unnamed: 0,Patient,Activity,Resource,PatientName,Age,Insurance,start_timestamp,Timestamp,@@duration
0,1,Register,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:00:00,2020-06-01 06:08:53,533.0
1,1,Initial Exam,Anna,Hermann the 1.,51,STAT,2020-06-01 06:10:48,2020-06-01 06:25:43,895.0
2,1,Initial Exam Decision,"Amelie,Anna",Hermann the 1.,51,STAT,2020-06-01 06:26:43,2020-06-01 06:31:52,309.0
3,1,Inform about Isolation,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:33:45,2020-06-01 06:33:45,0.0
4,1,Test III,Anna,Hermann the 1.,51,STAT,2020-06-01 06:35:35,2020-06-01 07:03:47,1692.0


In [3]:
from pandas_profiling import ProfileReport

log_profile = ProfileReport(df_log, title="Raw Log Profile")

log_profile.to_notebook_iframe()

ModuleNotFoundError: No module named 'pandas_profiling'

We first verify if the `Patient` has repeated traces, i.e., if it is used again in case the patient returns to the institution. We can test this through the `PatientName` column.

In [4]:
patients_ids = df_log.groupby('PatientName')['Patient'].unique()

(patients_ids.apply(len) > 1).any()

False

So no patient has more than one ID, which means we can use the `Patient` column as an identifier of the traces.

Let's take a look at some traces:

In [5]:
import numpy as np

# group activities into traces
traces = df_log.groupby('Patient')['Activity'].apply(np.array)
traces.sample(10).values

array([array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Control Call', 'Control Call',
       'Control Call', 'Test III', 'Test III Decision', 'Control Call',
       'Inform Authority Fill Form', 'Inform Authority Send Form',
       'Control Call', 'Control Call', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Control Call', 'Control Call',
       'Referral', 'Decide Treatment', 'Treatment B', 'Check Treatment B',
       'Treatment B', 'Check Treatment B', 'Treatment B',
       'Check Treatment B', 'Treatment B', 'Check Treatment B',
       'Treatment B', 'Check Treatment B', 'Discharge'], dtype=object),
       array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Discharge Init Exam'], dtype=object),
       array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Discharge Init Exam'], dtype=object),
       array(['Regist

In [6]:
from pm4py.objects.conversion.log import factory as log_converter
from pm4py.util import constants

# map dataset columns to PM4Py keys
param_keys={constants.PARAMETER_CONSTANT_CASEID_KEY: 'Patient',
            constants.PARAMETER_CONSTANT_RESOURCE_KEY: 'Resource', 
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: 'Activity',
            constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: 'Timestamp',
            constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY: 'start_timestamp'}

event_log = log_converter.apply(df_log, parameters=param_keys)

event_log

  # This is added back by InteractiveShellApp.init_path()


[{'attributes': {'concept:name': 1}, 'events': [{'Patient': 1, 'Activity': 'Register', 'Resource': 'Alexander', 'PatientName': 'Hermann the 1.', 'Age': 51, 'Insurance': 'STAT', 'start_timestamp': '2020-06-01 06:00:00', 'Timestamp': '2020-06-01 06:08:53', '@@duration': 533.0}, '..', {'Patient': 1, 'Activity': 'Discharge', 'Resource': 'Brigitte', 'PatientName': 'Hermann the 1.', 'Age': 51, 'Insurance': 'STAT', 'start_timestamp': '2020-06-06 10:58:34', 'Timestamp': '2020-06-06 11:04:55', '@@duration': 381.0}]}, '....', {'attributes': {'concept:name': 1500}, 'events': [{'Patient': 1500, 'Activity': 'Register', 'Resource': 'Alexander', 'PatientName': 'Ulrich the 1500.', 'Age': 51, 'Insurance': 'PRIV', 'start_timestamp': '2020-07-12 11:23:10', 'Timestamp': '2020-07-12 11:33:39', '@@duration': 629.0}, '..', {'Patient': 1500, 'Activity': 'Discharge Test', 'Resource': 'Alexander', 'PatientName': 'Ulrich the 1500.', 'Age': 51, 'Insurance': 'PRIV', 'start_timestamp': '2020-07-14 14:21:11', 'Times

## a)

> Apply the Inductive Miner implemented in PM4Py to the given event log and describe the process. Furthermore, give and reason about the fitness and precision results, respectively. On a high level, describe the potential problems of the model and reason how they were caused by the algorithm and the log.

In [7]:
from pm4py.algo.discovery.inductive import factory as inductive_miner
from pm4py.visualization.petrinet import factory as pn_visualizer

net, initial_marking, final_marking = inductive_miner.apply(event_log, parameters=param_keys)

net_graph = pn_visualizer.apply(net,
                                initial_marking=initial_marking,
                                final_marking=final_marking)

figures_dir = os.path.join(PROJ_ROOT, 'report', 'figures')

# fix place size
import numpy as np
body = np.array(net_graph.body)
body[body ==  '\tnode [fixedsize=true shape=circle width=0.75]'] = '\tnode [fixedsize=true shape=circle width=1]'
net_graph.body = body

net_graph.render(os.path.join(figures_dir, 'q1_a_petrinet'),
                 format='pdf',
                 view=True)

  after removing the cwd from sys.path.
  


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_a_petrinet.pdf'

In [23]:
from pm4py.visualization.process_tree import factory as pt_visualizer

tree = inductive_miner.apply_tree(event_log, parameters=param_keys)

tree_graph = pt_visualizer.apply(tree)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_a_tree'),
                 format='pdf',
                 view=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  """


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_a_tree.pdf'

Let's look for noise on the data.

In [8]:
df_log.groupby('Patient')['Activity'].first().value_counts()

Register                 1438
Initial Exam               60
Initial Exam Decision       2
Name: Activity, dtype: int64

In [9]:
initial_trace = np.array(['Register', 'Initial Exam', 'Initial Exam Decision'])

# compares the beginning of all traces to the `initial_trace`
is_initial_trace = traces.apply(lambda t: np.array_equal(t[:len(initial_trace)], initial_trace))

is_initial_trace.value_counts()

True     1304
False     196
Name: Activity, dtype: int64

We see that the majority of the traces start with the initial trace.

In [10]:
traces[~is_initial_trace]

Patient
6       [Register, Initial Exam Decision, Discharge In...
7       [Register, Initial Exam Decision, Inform about...
21      [Register, Initial Exam, Inform about Isolatio...
31      [Register, Initial Exam, Inform about Isolatio...
38      [Initial Exam, Initial Exam Decision, Inform a...
                              ...                        
1458                  [Initial Exam, Discharge Init Exam]
1466    [Register, Initial Exam Decision, Discharge In...
1475    [Initial Exam, Initial Exam Decision, Inform a...
1478    [Register, Initial Exam Decision, Inform about...
1479    [Register, Initial Exam, Inform about Isolatio...
Name: Activity, Length: 196, dtype: object

And that the ones that don't are likely because of the noisy absence of one of the activities.

As the Inductive miner guarantess perfect fitness to the generated model, we can notice the influence of noise in the results. As tested, there is plenty of noise in the data, which resulted in several silent transitions (another characteristic of the IM). Another characteristic of the IM is its inability to capture non-local dependencies, as it shows by failing to constrain the `Check Treatment *` activities to follow, not necessarily directly, the respective treatment activity. Still, the model indeed is possible to describe the whole event log.

## b)

> From the process owner we know that patients are called in order to control the quarantine and that there are two potential quarantine phases, i.e., before and after a positive test. Implement a function that resolves the duplicate activity Control Call by context sensitive renaming. Discuss the impact on the discovered model.

Let's assume that `Test III` is the test that distinguishes the two `Control Call` possibilities. We'll rename the one after the positive test as `Control Call (+)`.

In [11]:
new_cc_name = 'Control Call (+)'
split_on_activity = 'Test III'

# get `Test III` moment
df_log_test = df_log[df_log['Activity'] == split_on_activity]
split_timestamp = df_log_test.groupby('Patient')['Timestamp'].first()

# map timestamp to whole patient trace
df_renaming = df_log.copy()
df_renaming[split_on_activity + ' Timestamp'] = df_log['Patient'].map(split_timestamp)

# renames `Control Call` activities that happen after `Test III`
new_cc = df_renaming['Activity'].str.replace('Control Call', new_cc_name)
df_renaming['Activity'] = new_cc.where(
    df_renaming['Timestamp'] > df_renaming[split_on_activity + ' Timestamp'],
    df_renaming['Activity']
)
df_renaming['Activity'].value_counts()

Control Call                        5082
Control Call (+)                    4872
Check Treatment A1                  2921
Treatment A1                        2897
Treatment B                         2238
Check Treatment B                   1678
Register                            1438
Initial Exam                        1436
Initial Exam Decision               1419
Check Treatment A2                  1134
Treatment A2                        1133
Test III                             779
Inform about Isolation               758
Test III Decision                    745
Discharge Init Exam                  687
Treatment A3                         595
Check Treatment A3                   584
Inform Authority Send Form           533
Discharge                            532
Referral                             529
Inform Authority Fill Form           529
Decide Treatment                     527
Register Facility                    523
Discharge Test                       216
Prescripe Specia

Check if it worked.

In [12]:
df_renaming['Activity'].value_counts().drop(['Control Call', new_cc_name]) - \
    df_log['Activity'].value_counts().drop('Control Call')

Check Treatment A1                  0
Treatment A1                        0
Treatment B                         0
Check Treatment B                   0
Register                            0
Initial Exam                        0
Initial Exam Decision               0
Check Treatment A2                  0
Treatment A2                        0
Test III                            0
Inform about Isolation              0
Test III Decision                   0
Discharge Init Exam                 0
Treatment A3                        0
Check Treatment A3                  0
Inform Authority Send Form          0
Discharge                           0
Referral                            0
Inform Authority Fill Form          0
Decide Treatment                    0
Register Facility                   0
Discharge Test                      0
Prescripe Special Medication        0
Emergency Send 2 Facility (Wait)    0
Name: Activity, dtype: int64

In [13]:
renamed_traces = df_renaming.groupby('Patient')['Activity'].apply(np.array)

renamed_traces.sample(10).values

array([array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Test III', 'Test III Decision',
       'Discharge Test'], dtype=object),
       array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Test III', 'Test III Decision',
       'Discharge Test'], dtype=object),
       array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Test III', 'Test III Decision', 'Discharge Test'],
      dtype=object),
       array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Control Call', 'Control Call',
       '

No problems detected.

Generate the tree.

In [18]:
renamed_event_log = log_converter.apply(df_renaming, parameters=param_keys)

tree = inductive_miner.apply_tree(renamed_event_log, parameters=param_keys)

tree_graph = pt_visualizer.apply(tree)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_b_tree'),
                 format='pdf',
                 view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


NameError: name 'pt_visualizer' is not defined

We can see that not only the `Control Call` activity was removed from the loop+silent-activity at the bottom, but also the `Test III` and `Test III Decision` were brought up to the first sequence operator, of course still within a exclusive-choice operator with a silent activity, due to the noise on the data.

## c)

> The log contains a considerable amount of noise induced by errors during the event logging. Apply the IM to a DFG filtered for noise. Describe your results and explain why the IM mines a different model. Which type of noise is prominent in the log?

Original DFG:

In [19]:
from pm4py.algo.discovery.dfg import factory as dfg_discovery

og_dfg = dfg_discovery.apply(event_log, parameters=param_keys)

from pm4py.visualization.dfg import factory as dfg_visualization
og_dfg_viz = dfg_visualization.apply(og_dfg, log=event_log, parameters=param_keys)

og_dfg_viz.render(os.path.join(figures_dir, 'q1_c_og_dfg'),
                  format='pdf',
                  view=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_c_og_dfg.pdf'

DFG after context-sensitive renaming:

In [20]:
dfg = dfg_discovery.apply(renamed_event_log, parameters=param_keys)

dfg_viz = dfg_visualization.apply(dfg, log=renamed_event_log, parameters=param_keys)

dfg_viz.render(os.path.join(figures_dir, 'q1_c_dfg_renaming'),
               format='pdf',
               view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_c_dfg_renaming.pdf'

Filter DFG.

In [14]:
df_renaming['Activity'].unique()

array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Test III', 'Test III Decision',
       'Inform Authority Fill Form', 'Referral',
       'Inform Authority Send Form', 'Register Facility',
       'Decide Treatment', 'Treatment B', 'Check Treatment B',
       'Discharge', 'Treatment A1', 'Check Treatment A1', 'Treatment A2',
       'Check Treatment A2', 'Treatment A3', 'Check Treatment A3',
       'Discharge Test', 'Discharge Init Exam', 'Control Call',
       'Prescripe Special Medication', 'Control Call (+)',
       'Emergency Send 2 Facility (Wait)'], dtype=object)

In [21]:
from pm4py.objects.dfg.filtering import dfg_filtering

filt_dfg = dfg_filtering.apply(dfg, parameters=param_keys)

filt_dfg_viz = dfg_visualization.apply(filt_dfg, log=renamed_event_log, parameters=param_keys)

filt_dfg_viz.render(os.path.join(figures_dir, 'q1_c_dfg_filtered'),
                    format='pdf',
                    view=True)

  """


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_c_dfg_filtered.pdf'

We can already notice the improvement in the DFG, which is much more readable. There are way less edges and the flow seems much more linear. Even though, the structure about the treatments A1, A2 and A3 is still unsettled.

In [24]:
filt_tree = inductive_miner.apply_tree_dfg(filt_dfg, parameters=param_keys)

tree_graph = pt_visualizer.apply(filt_tree)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_c_filt_tree'),
                  format='pdf',
                  view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_c_filt_tree.pdf'

The new model generated is much closer to the expected from the superficial analysis of the traces. The noise removed, as already previously stated, comes mostly from skipped activities, as we can notice that the amount of silent activities was drastically reduced. Still, there are plenty of them, caused by the traces related to the `Treatment A*` activities.

In [25]:
treatment_A_traces = renamed_traces[renamed_traces.apply(lambda t: any('Treatment A' in a for a in t))]

treatment_A_traces.shape[0]

123

In [26]:
treatment_A_traces.sample(10).values

array([array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Control Call', 'Test III',
       'Test III Decision', 'Prescripe Special Medication',
       'Control Call (+)', 'Inform Authority Fill Form',
       'Inform Authority Send Form', 'Control Call (+)',
       'Control Call (+)', 'Control Call (+)', 'Control Call (+)',
       'Control Call (+)', 'Control Call (+)', 'Referral',
       'Register Facility', 'Decide Treatment', 'Treatment A2',
       'Check Treatment A2', 'Treatment A3', 'Check Treatment A3',
       'Treatment A1', 'Check Treatment A1', 'Treatment A1',
       'Check Treatment A1', 'Treatment A1', 'Check Treatment A1',
       'Treatment A2', 'Check Treatment A2', 'Treatment A1',
       'Check Treatment A1', 'Treatment A1', 'Check Treatment A1',
       'Treatment A1', 'Check Treatment A1', 'Treatment A3',
       'Check Treatment A3', 'Treatment A2', 'Check T

We can notice how the rarity of traces with `Treatment A*` activities impacted in them being filtered, as the transitions from `Decide Treatment` to the `Treatment A*` were filtered out. But as the `Treatment A*` happen many times in a single trace, the structure was roughly kept.

## d)

> Investigate the DFG of the log after applying the preceding steps. Which activities might be filtered out in order to obtain an improved model that explains most of the process more precisely? Why might this yield better results when applying the IM? Implement a filter and apply the IM to the filtered log.

Perhaps the resulting model can be improved by tuning the noise threshold or by removing the `* Treatment A*` activities completely.

#### Total `* Treatment A*` removal

As the traces in which the A* treatments were applied account for only 123 traces (8,2%), we can say that removing them totally we will still represent the majority of the process.

In [27]:
df_no_A = df_renaming[~df_renaming['Patient'].isin(treatment_A_traces.index)]

df_no_A

Unnamed: 0,Patient,Activity,Resource,PatientName,Age,Insurance,start_timestamp,Timestamp,@@duration,Test III Timestamp
0,1,Register,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:00:00,2020-06-01 06:08:53,533.0,2020-06-01 07:03:47
1,1,Initial Exam,Anna,Hermann the 1.,51,STAT,2020-06-01 06:10:48,2020-06-01 06:25:43,895.0,2020-06-01 07:03:47
2,1,Initial Exam Decision,"Amelie,Anna",Hermann the 1.,51,STAT,2020-06-01 06:26:43,2020-06-01 06:31:52,309.0,2020-06-01 07:03:47
3,1,Inform about Isolation,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:33:45,2020-06-01 06:33:45,0.0,2020-06-01 07:03:47
4,1,Test III,Anna,Hermann the 1.,51,STAT,2020-06-01 06:35:35,2020-06-01 07:03:47,1692.0,2020-06-01 07:03:47
5,1,Test III Decision,"Adrian,Anna",Hermann the 1.,51,STAT,2020-06-01 07:03:47,2020-06-01 07:08:06,259.0,2020-06-01 07:03:47
6,1,Inform Authority Fill Form,Alina,Hermann the 1.,51,STAT,2020-06-01 07:09:57,2020-06-01 07:17:47,470.0,2020-06-01 07:03:47
7,1,Referral,Adrian,Hermann the 1.,51,STAT,2020-06-01 07:09:57,2020-06-01 07:18:48,531.0,2020-06-01 07:03:47
8,1,Inform Authority Send Form,Alina,Hermann the 1.,51,STAT,2020-06-01 07:17:57,2020-06-01 07:20:07,130.0,2020-06-01 07:03:47
9,1,Register Facility,Bernhard,Hermann the 1.,51,STAT,2020-06-01 08:21:45,2020-06-01 08:31:18,573.0,2020-06-01 07:03:47


In [28]:
log_no_A = log_converter.apply(df_no_A, parameters=param_keys)

dfg_no_A = dfg_discovery.apply(log_no_A, parameters=param_keys)

dfg_no_A = dfg_filtering.apply(dfg_no_A, parameters=param_keys)

dfg_viz = dfg_visualization.apply(dfg_no_A, log=log_no_A, parameters=param_keys)

dfg_viz.render(os.path.join(figures_dir, 'q1_d_no_A'),
               format='pdf',
               view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_d_no_A.pdf'

The DFG now looks very concise.

In [29]:
tree_no_A = inductive_miner.apply_tree_dfg(dfg_no_A, parameters=param_keys)

tree_graph = pt_visualizer.apply(tree_no_A)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_d_tree_no_A'),
                  format='pdf',
                  view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_d_tree_no_A.pdf'

We can see that the PT is also much closer to the majority of the traces, representing well the process. Still, it keeps the silent activities.

#### Partial `* Treatment A*` removal

We can also remove only the activities related to these treatments, keeping the traces. This ensures that we do not loose too much data.

In [30]:
df_silent_A = df_renaming[df_renaming['Activity'].str.find('Treatment A') == -1]
df_silent_A

Unnamed: 0,Patient,Activity,Resource,PatientName,Age,Insurance,start_timestamp,Timestamp,@@duration,Test III Timestamp
0,1,Register,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:00:00,2020-06-01 06:08:53,533.0,2020-06-01 07:03:47
1,1,Initial Exam,Anna,Hermann the 1.,51,STAT,2020-06-01 06:10:48,2020-06-01 06:25:43,895.0,2020-06-01 07:03:47
2,1,Initial Exam Decision,"Amelie,Anna",Hermann the 1.,51,STAT,2020-06-01 06:26:43,2020-06-01 06:31:52,309.0,2020-06-01 07:03:47
3,1,Inform about Isolation,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:33:45,2020-06-01 06:33:45,0.0,2020-06-01 07:03:47
4,1,Test III,Anna,Hermann the 1.,51,STAT,2020-06-01 06:35:35,2020-06-01 07:03:47,1692.0,2020-06-01 07:03:47
5,1,Test III Decision,"Adrian,Anna",Hermann the 1.,51,STAT,2020-06-01 07:03:47,2020-06-01 07:08:06,259.0,2020-06-01 07:03:47
6,1,Inform Authority Fill Form,Alina,Hermann the 1.,51,STAT,2020-06-01 07:09:57,2020-06-01 07:17:47,470.0,2020-06-01 07:03:47
7,1,Referral,Adrian,Hermann the 1.,51,STAT,2020-06-01 07:09:57,2020-06-01 07:18:48,531.0,2020-06-01 07:03:47
8,1,Inform Authority Send Form,Alina,Hermann the 1.,51,STAT,2020-06-01 07:17:57,2020-06-01 07:20:07,130.0,2020-06-01 07:03:47
9,1,Register Facility,Bernhard,Hermann the 1.,51,STAT,2020-06-01 08:21:45,2020-06-01 08:31:18,573.0,2020-06-01 07:03:47


In [31]:
log_silent_A = log_converter.apply(df_silent_A, parameters=param_keys)

dfg_silent_A = dfg_discovery.apply(log_silent_A, parameters=param_keys)

dfg_silent_A = dfg_filtering.apply(dfg_silent_A, parameters=param_keys)

dfg_viz = dfg_visualization.apply(dfg_silent_A, log=log_silent_A, parameters=param_keys)

dfg_viz.render(os.path.join(figures_dir, 'q1_d_silent_A'),
               format='pdf',
               view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_d_silent_A.pdf'

We notice this way the `Prescripe Special Medication` activity is kept.

In [32]:
tree_silent_A = inductive_miner.apply_tree_dfg(dfg_silent_A, parameters=param_keys)

tree_graph = pt_visualizer.apply(tree_silent_A)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_d_tree_silent_A'),
                  format='pdf',
                  view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_d_tree_silent_A.pdf'

The biggest difference in the Process Tree is that the `Discharge Test` activity was dragged away from the exclusive choice operator with the other 3 activities to a different exclusive choice with `Prescripe Special Medication`.

#### Noise threshold tuning

For the reasons stated in c), the noise threshold will be tuned.

In [33]:
param_keys['noiseThreshold'] = 0.07

filt_dfg = dfg_filtering.apply(dfg, parameters=param_keys)

filt_dfg_viz = dfg_visualization.apply(filt_dfg, log=renamed_event_log, parameters=param_keys)

filt_dfg_viz.render(os.path.join(figures_dir, 'q1_d_noise_threshold_tuning'),
                    format='pdf',
                    view=True)

  """


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_d_noise_threshold_tuning.pdf'

We can already notice the improvement in the DFG, which is much more readable. There are way less edges and the flow seems much more linear. Even though, the structure about the treatments A1, A2 and A3 is still unsettled.

In [34]:
filt_tree = inductive_miner.apply_tree_dfg(filt_dfg, parameters=param_keys)

tree_graph = pt_visualizer.apply(filt_tree)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_d_noise_threshold_tuning'),
                  format='pdf',
                  view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_d_noise_threshold_tuning.pdf'

Even though the final model is not so readable and can still accept some unreasonable mixes between applying one treatment and checking for another, one can say that it is more complete as to the amount of structures that it presents.

## e)

> Consider the process model for the patients who were prescript the special medication. What do you observe? How is this behavior captured by the complete model in d)?

In [35]:
special_medication_traces = renamed_traces[renamed_traces.apply(lambda t: 'Prescripe Special Medication' in t)]

special_medication_traces.sample(10).values

array([array(['Register', 'Initial Exam', 'Initial Exam Decision',
       'Inform about Isolation', 'Control Call', 'Control Call',
       'Control Call', 'Control Call', 'Test III', 'Test III Decision',
       'Prescripe Special Medication', 'Control Call (+)',
       'Inform Authority Fill Form', 'Inform Authority Send Form',
       'Control Call (+)', 'Control Call (+)', 'Control Call (+)',
       'Control Call (+)', 'Control Call (+)', 'Control Call (+)',
       'Control Call (+)', 'Control Call (+)', 'Control Call (+)',
       'Control Call (+)', 'Control Call (+)', 'Referral',
       'Decide Treatment', 'Treatment A2', 'Check Treatment A2',
       'Treatment A3', 'Check Treatment A3', 'Treatment A1',
       'Check Treatment A1', 'Treatment A1', 'Check Treatment A1',
       'Treatment A1', 'Check Treatment A1', 'Treatment A2',
       'Check Treatment A2', 'Treatment A1', 'Check Treatment A1',
       'Treatment A1', 'Treatment A3', 'Check Treatment A3',
       'Treatment A2', 'Chec

In [36]:
special_medication_traces[special_medication_traces.apply(lambda t: 'Treatment B' in t)]

Series([], Name: Activity, dtype: object)

Through further inspection we can assume that the special medication is only prescribed together with the A treatments, therefore it was damaged by the actions taken in d), as previously highlighted.

In [37]:
df_special = df_renaming[df_renaming['Patient'].isin(special_medication_traces.index)]

df_special

Unnamed: 0,Patient,Activity,Resource,PatientName,Age,Insurance,start_timestamp,Timestamp,@@duration,Test III Timestamp
405,31,Register,Alexander,Alexander the 31.,60,STAT,2020-06-01 15:24:21,2020-06-01 15:33:24,543.0,2020-06-02 10:58:47
406,31,Initial Exam,Anna,Alexander the 31.,60,STAT,2020-06-01 15:35:24,2020-06-01 15:50:21,897.0,2020-06-02 10:58:47
407,31,Inform about Isolation,Anna,Alexander the 31.,60,STAT,2020-06-01 15:58:38,2020-06-01 15:58:38,0.0,2020-06-02 10:58:47
408,31,Control Call,InspectorInis,Alexander the 31.,60,STAT,2020-06-01 19:00:26,2020-06-01 19:05:29,303.0,2020-06-02 10:58:47
409,31,Control Call,InspectorInis,Alexander the 31.,60,STAT,2020-06-02 06:00:00,2020-06-02 06:05:11,311.0,2020-06-02 10:58:47
410,31,Control Call,InspectorInis,Alexander the 31.,60,STAT,2020-06-02 09:05:11,2020-06-02 09:10:02,291.0,2020-06-02 10:58:47
411,31,Test III,Ava,Alexander the 31.,60,STAT,2020-06-02 10:31:21,2020-06-02 10:58:47,1646.0,2020-06-02 10:58:47
412,31,Test III Decision,"Adrian,Ava",Alexander the 31.,60,STAT,2020-06-02 10:58:47,2020-06-02 11:02:00,193.0,2020-06-02 10:58:47
413,31,Prescripe Special Medication,Adrian,Alexander the 31.,60,STAT,2020-06-02 11:03:35,2020-06-02 12:04:06,3631.0,2020-06-02 10:58:47
414,31,Referral,Adrian,Alexander the 31.,60,STAT,2020-06-02 12:05:36,2020-06-02 12:16:11,635.0,2020-06-02 10:58:47


In [38]:
log_special = log_converter.apply(df_special, parameters=param_keys)

dfg_special = dfg_discovery.apply(log_special, parameters=param_keys)

dfg_special = dfg_filtering.apply(dfg_special, parameters=param_keys)

dfg_viz = dfg_visualization.apply(dfg_special, log=log_special, parameters=param_keys)

dfg_viz.render(os.path.join(figures_dir, 'q1_e_special'),
               format='pdf',
               view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_e_special.pdf'

In [39]:
tree_special = inductive_miner.apply_tree_dfg(dfg_special, parameters=param_keys)

tree_graph = pt_visualizer.apply(tree_special)

# fix operations size
tree_graph.body = list(map(lambda r:r.replace('width=0.6', 'width=1'),
                           tree_graph.body))

tree_graph.render(os.path.join(figures_dir, 'q1_e_tree_special'),
                  format='pdf',
                  view=True)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_e_tree_special.pdf'

It is very similar, as previously stated. One can say that the previous models failed to capture the non-local influence of the `Prescripe Special Medication` in the choice of the treatment, which it is possible to assume that must be one of the A treatments.

## f)

> Apply additional miners to the log and compare the results. Which model is the best model?

#### Alpha Miner

In [40]:
from pm4py.algo.discovery.alpha import factory as alpha_miner
from pm4py.objects.dfg.filtering.dfg_filtering import DEFAULT_NOISE_THRESH_DF

param_keys['noiseThreshold'] = DEFAULT_NOISE_THRESH_DF
filt_dfg = dfg_filtering.apply(dfg, parameters=param_keys)

net, initial_marking, final_marking = alpha_miner.apply_dfg(filt_dfg, parameters=param_keys)

net_graph = pn_visualizer.apply(net,
                                initial_marking=initial_marking,
                                final_marking=final_marking)

figures_dir = os.path.join(PROJ_ROOT, 'report', 'figures')

# fix place size
import numpy as np
body = np.array(net_graph.body)
body[body ==  '\tnode [fixedsize=true shape=circle width=0.75]'] = '\tnode [fixedsize=true shape=circle width=1]'
net_graph.body = body

net_graph.render(os.path.join(figures_dir, 'q1_f_alpha_miner'),
                 format='pdf',
                 view=True)

  import sys
  # This is added back by InteractiveShellApp.init_path()


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_f_alpha_miner.pdf'

#### Heuristics Miner

In [41]:
from pm4py.algo.discovery.heuristics import factory as heuristics_miner

net, initial_marking, final_marking = heuristics_miner.apply_dfg(filt_dfg, parameters=param_keys)

net_graph = pn_visualizer.apply(net,
                                initial_marking=initial_marking,
                                final_marking=final_marking)

figures_dir = os.path.join(PROJ_ROOT, 'report', 'figures')

# fix place size
import numpy as np
body = np.array(net_graph.body)
body[body ==  '\tnode [fixedsize=true shape=circle width=0.75]'] = '\tnode [fixedsize=true shape=circle width=1]'
net_graph.body = body

net_graph.render(os.path.join(figures_dir, 'q1_f_heuristics_miner'),
                 format='pdf',
                 view=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q1_f_heuristics_miner.pdf'

The Alpha miner fails completely to most of the structures beyond the initial activities. The Heuristics miner presents a good job up until the treatments start, showing precisely the control calls and its relation with the `Inform Authority*` activities. Both fail to understand non-local influence of the `Prescripe Special Medication` and the sequencing of the A treatments and checks. Therefore, one can say the most precise model is still the one generated by the IM, even though the one generated by the HM is more readable.