# Decision Trees for predicting the next event of a case

In [42]:
import csv
import os

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Step up one directory so that the relative './data/...' paths read by the next cell resolve
# (presumably the notebook lives in a sub-folder of the project root -- TODO confirm).
# NOTE: the path is relative, so re-running this cell climbs one more level each time.
os.chdir("../")

In [3]:
def _read_event_log(path):
    """Load an event-log CSV into a DataFrame, dropping malformed rows.

    `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where
    passing it raises ``TypeError``.  The replacement keyword is tried first
    and the legacy one is used only when the installed pandas does not know
    the new spelling.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        # 'warn' drops a bad line and reports it, which is what
        # error_bad_lines=False did together with the old warn_bad_lines default.
        return pd.read_csv(path, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 has no `on_bad_lines` keyword.
        return pd.read_csv(path, error_bad_lines=False)


data_train = _read_event_log('./data/road-train-pre.csv')
data_test = _read_event_log('./data/road-test-pre.csv')

In [4]:
# Parse the event timestamps into datetimes, then order the training log by case and,
# inside each case, by event time.
data_train = (
    data_train
    .assign(**{'event time:timestamp': pd.to_datetime(data_train['event time:timestamp'])})
    .sort_values(by=['case concept:name', 'event time:timestamp'])
)

In [16]:
# Parse the event timestamps into datetimes, then order the test log by case and,
# inside each case, by event time.
data_test = (
    data_test
    .assign(**{'event time:timestamp': pd.to_datetime(data_test['event time:timestamp'])})
    .sort_values(by=['case concept:name', 'event time:timestamp'])
)

# Train Data

In [5]:
# Build the per-case training log by round-tripping the sorted frame through a temporary CSV.
# `log` maps a case id to [event names, timestamps]; both lists follow the row order of
# `data_train`, and every value is kept as the text that `to_csv` wrote.
data_train.to_csv("fixed.csv")

log = dict()
try:
    # newline='' leaves line-ending handling to the csv module; utf-8 is the encoding
    # `to_csv` writes by default.
    with open('fixed.csv', 'r', newline='', encoding='utf-8') as file:
        # csv.reader honours quoting, so a field that contains a comma cannot shift the
        # columns the way a plain split(',') would.
        reader = csv.reader(file)
        next(reader, None)  # skip the header row
        for parts in reader:
            if not parts:  # blank line
                continue
            # Field 0 is the DataFrame index written by to_csv. Fields 2, 3 and 5 are
            # presumably the case id, event name and timestamp columns -- TODO confirm
            # against the column order of the pre-processed CSV.
            caseid = parts[2]
            task = parts[3]
            timestamp = parts[5]

            if caseid not in log:
                log[caseid] = [[], []]

            log[caseid][0].append(task)
            log[caseid][1].append(timestamp)
finally:
    # Remove the temporary file even when parsing fails part-way.
    os.remove('fixed.csv')

In [6]:
# Give every training case its "real next event" targets: the case's event sequence shifted
# left by one, so entry k is the event that followed event k. The last event of a case gets
# no target (no 'New Case' marker is appended).
for case_record in log.values():
    case_record.append(case_record[0][1:])

# Test Data

In [30]:
# Build the per-case test log by round-tripping the sorted frame through a temporary CSV.
# `log_test` maps a case id to [event names, timestamps]; both lists follow the row order of
# `data_test`, and every value is kept as the text that `to_csv` wrote.
data_test.to_csv("fixed_test.csv")

log_test = dict()
try:
    # newline='' leaves line-ending handling to the csv module; utf-8 is the encoding
    # `to_csv` writes by default.
    with open('fixed_test.csv', 'r', newline='', encoding='utf-8') as file:
        # csv.reader honours quoting, so a field that contains a comma cannot shift the
        # columns the way a plain split(',') would.
        reader = csv.reader(file)
        next(reader, None)  # skip the header row
        for parts in reader:
            if not parts:  # blank line
                continue
            # Field 0 is the DataFrame index written by to_csv. Fields 2, 3 and 5 are
            # presumably the case id, event name and timestamp columns -- TODO confirm
            # against the column order of the pre-processed CSV.
            caseid = parts[2]
            task = parts[3]
            timestamp = parts[5]

            if caseid not in log_test:
                log_test[caseid] = [[], []]

            log_test[caseid][0].append(task)
            log_test[caseid][1].append(timestamp)
finally:
    # Remove the temporary file even when parsing fails part-way.
    os.remove('fixed_test.csv')

In [31]:
"""Fixing a bug of cases that are in the test data but are incomplete due to the train-test split."""

# Collect the ids of test cases that consist of a single event; such a case has no
# "next event" to predict. The ids are gathered first so the dict is not modified
# while it is being iterated.
bugs = [case_id for case_id, record in log_test.items() if len(record[0]) == 1]

for case_id in bugs:
    del log_test[case_id]

In [32]:
# Give every test case its "real next event" targets: the case's event sequence shifted
# left by one, so entry k is the event that followed event k. The last event of a case gets
# no target (no 'New Case' marker is appended).
for case_record in log_test.values():
    case_record.append(case_record[0][1:])

# Decision trees for every position

In [33]:
# One list of event names per training case, in the iteration order of `log`.
event_list = [record[0] for record in log.values()]

# Number of events in the longest case, i.e. how many positions have to be covered.
events_longest_event = max(len(trace) for trace in event_list)

# pos_events[p] holds the event at position p of every case that is long enough to have one;
# shorter cases simply contribute nothing to that position.
pos_events = [
    [trace[position] for trace in event_list if position < len(trace)]
    for position in range(events_longest_event)
]

In [34]:
# One list of event names per test case, in the iteration order of `log_test`.
event_list_test = [record[0] for record in log_test.values()]

# Number of events in the longest case, i.e. how many positions have to be covered.
events_longest_event_test = max(len(trace) for trace in event_list_test)

# pos_events_test[p] holds the event at position p of every case that is long enough to have
# one; shorter cases simply contribute nothing to that position.
pos_events_test = [
    [trace[position] for trace in event_list_test if position < len(trace)]
    for position in range(events_longest_event_test)
]

# Encoding Event names into integers

In [36]:
cases = list(data_train['event concept:name'].unique())  # list of all unique event names

le = preprocessing.LabelEncoder()
le.fit(cases)  # encoding all event names into integers
# NOTE: the encoder only knows event names present in the training frame; `le.transform`
# raises ValueError for any name it was not fitted on.


def _first_two_events(traces):
    """Return the first and second event of every trace that has at least two events.

    Both lists are built from the same traces, so element k of one always belongs to
    the same case as element k of the other. Taking position 0 from all cases and
    position 1 only from the longer ones would give lists of different length as soon
    as a single-event case is present.

    Args:
        traces: Iterable of per-case event-name lists.

    Returns:
        A ``(first_events, second_events)`` tuple of equally long lists.
    """
    first_events, second_events = [], []
    for trace in traces:
        if len(trace) >= 2:
            first_events.append(trace[0])
            second_events.append(trace[1])
    return first_events, second_events


first_train, second_train = _first_two_events(event_list)
first_test, second_test = _first_two_events(event_list_test)

# Feature: encoded first event of a case; target: encoded second event of the same case.
# Both are shaped as single-column 2-D arrays.
x_train = le.transform(first_train).reshape(-1, 1)
y_train = le.transform(second_train).reshape(-1, 1)

x_test = le.transform(first_test).reshape(-1, 1)
y_test = le.transform(second_test).reshape(-1, 1)


In [40]:
# Train a decision tree with scikit-learn's default settings on the encoded training pairs
# (x_train -> y_train). `fit` is left as the cell's last expression so the notebook shows
# the fitted estimator's repr as output.
classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [41]:
# Predict the encoded target for every row of the test features.
y_pred = classifier.predict(x_test)

In [43]:
# Fraction of test rows whose predicted label equals the true label.
acc = accuracy_score(y_test, y_pred)

In [44]:
# Bare expression: the notebook prints the accuracy as this cell's output.
acc

0.5781824611032532