In [1]:
import pm4py
pm4py.__version__

'1.2.13'

# Decision Points

> Investigate how patients are referred for further treatment by means of a decision tree. Describe the factors that you observe.

## Data loading

Import the log.

In [35]:
import os
import pandas as pd
import numpy as np
from pm4py.objects.log.importer.xes import factory as xes_import_factory
from pm4py.objects.petri.importer import factory as pnml_importer
from pm4py.objects.conversion.log import factory as log_converter
from pm4py.util import constants

PROJ_ROOT = os.path.abspath(os.path.pardir)

# load csv and rename patieent and activity column
df_log = pd.read_csv(PROJ_ROOT+"/data/log.csv")

# convert timestamp columns to datetime friendly format
df_log['Timestamp'] = pd.to_datetime(df_log['Timestamp'])
df_log['start_timestamp'] = pd.to_datetime(df_log['start_timestamp'])

df_log = df_log.rename(columns={"Age": "case:Age", "Insurance": "case:Insurance", "PatientName": "case:PatientName"})



#import the renamed event log from Q1
#event_log = xes_import_factory.apply(PROJ_ROOT+"/data/processed_log.xes")

In [46]:
#remove all events after a final decision was made
new_df = pd.DataFrame(index=np.arange(0, len(df_log)), columns=["Patient", "Activity", "Resource", "case:PatientName", "case:Age", "case:Insurance", "start_timestamp", "Timestamp", "@@duration"])

current_id = -1
keep = True
count = 0
for row in df_log.itertuples():
    if current_id != row[1]:
        current_id = row[1]
        keep = True

    if keep:
        if row[2] in ["Discharge", "Treatment A1", "Treatment A2", "Treatment B", "Discharge Init Exam", "Discharge Test"]:
            keep = False
            
        new_df.loc[count] = row[1:]
        count += 1         

new_df = new_df.dropna()

In [47]:
            
new_df.head(30)

Unnamed: 0,Patient,Activity,Resource,case:PatientName,case:Age,case:Insurance,start_timestamp,Timestamp,@@duration
0,1,Register,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:00:00,2020-06-01 06:08:53,533
1,1,Initial Exam,Anna,Hermann the 1.,51,STAT,2020-06-01 06:10:48,2020-06-01 06:25:43,895
2,1,Initial Exam Decision,"Amelie,Anna",Hermann the 1.,51,STAT,2020-06-01 06:26:43,2020-06-01 06:31:52,309
3,1,Inform about Isolation,Alexander,Hermann the 1.,51,STAT,2020-06-01 06:33:45,2020-06-01 06:33:45,0
4,1,Test III,Anna,Hermann the 1.,51,STAT,2020-06-01 06:35:35,2020-06-01 07:03:47,1692
5,1,Test III Decision,"Adrian,Anna",Hermann the 1.,51,STAT,2020-06-01 07:03:47,2020-06-01 07:08:06,259
6,1,Inform Authority Fill Form,Alina,Hermann the 1.,51,STAT,2020-06-01 07:09:57,2020-06-01 07:17:47,470
7,1,Referral,Adrian,Hermann the 1.,51,STAT,2020-06-01 07:09:57,2020-06-01 07:18:48,531
8,1,Inform Authority Send Form,Alina,Hermann the 1.,51,STAT,2020-06-01 07:17:57,2020-06-01 07:20:07,130
9,1,Register Facility,Bernhard,Hermann the 1.,51,STAT,2020-06-01 08:21:45,2020-06-01 08:31:18,573


In [48]:
#convert to event log
# map dataset columns to PM4Py keys
param_keys={constants.PARAMETER_CONSTANT_CASEID_KEY: 'Patient',
            constants.PARAMETER_CONSTANT_RESOURCE_KEY: 'Resource', 
            constants.PARAMETER_CONSTANT_ACTIVITY_KEY: 'Activity',
            constants.PARAMETER_CONSTANT_TIMESTAMP_KEY: 'Timestamp',
            constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY: 'start_timestamp'}

event_log = log_converter.apply(new_df, parameters=param_keys)

---

# a)
> Create a decision tree of reasonable complexity using the available attributes in the log. 

In [62]:
from pm4py.objects.log.util import get_log_representation
from pm4py.objects.log.util import get_class_representation

# preprocess the log for decision tree mining
str_trace_attributes = ["Insurance"]
str_event_attributes = []#["Resource"]
num_trace_attributes = ["Age"]
num_event_attributes = []#["@@duration"]

data, feature_names = get_log_representation.get_representation(event_log, str_trace_attributes, str_event_attributes,
                                                              num_trace_attributes, num_event_attributes)

target, classes = get_class_representation.get_class_representation_by_str_ev_attr_value_value(event_log, "Activity")


In [69]:
from sklearn import tree
#calculate the decision tree

classifier = tree.DecisionTreeClassifier(max_depth=2)
classifier.fit(data, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [70]:
from pm4py.visualization.decisiontree import factory as dt_vis_factory
#visualize the obtained decision tree

decision_tree_vis = dt_vis_factory.apply(classifier, feature_names, classes)

figures_dir = os.path.join(PROJ_ROOT, 'report', 'figures')
decision_tree_vis.render(os.path.join(figures_dir, 'q3_perf_petrinet'),
                 format='pdf',
                 view=True)

'/Users/Tom/Documents/Uni/4. Semester M/Advanced Process Mining/Assignments/Assignment 1/APM-A1/report/figures/q3_perf_petrinet.pdf'

---

# b)
> Since it is likely that the resources at the treatment facilities are limited, implement a function that assigns a(n) (estimate) of the number of patients at each facility to each event. To this end, you have to decide which event occurs at which facility based on your analysis in question 2. Create a decision tree of reasonable complexity using this derived attribute.