# Decision Tree predicting the next event of a case - Full Trace

In [1]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, accuracy_score
from progressbar import ProgressBar
import numpy as np
import time

In [2]:
start_time = time.time()

In [3]:
os.chdir("../")

In [4]:
data_train = pd.read_csv('./data/road-train-pre.csv', error_bad_lines=False)
data_test = pd.read_csv('./data/road-test-pre.csv', error_bad_lines=False)

In [5]:
data_train['event time:timestamp'] = pd.to_datetime(data_train['event time:timestamp'])
data_train = data_train.sort_values(by=['case concept:name', 'event time:timestamp'])

In [6]:
data_test['event time:timestamp'] = pd.to_datetime(data_test['event time:timestamp'])
data_test = data_test.sort_values(by=['case concept:name', 'event time:timestamp'])

# 1. Train Data

In [7]:
data_train.to_csv("fixed.csv")

log = dict()  # dictionary that contains all information for a case - key: case name; values: events, timestamps
with open('fixed.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]

        task = parts[3]
        timestamp = parts[5]

        if caseid not in log:
            log[caseid] = [[],[]]

        log[caseid][0].append(task)  # adding the events as a list into the dictionary
        log[caseid][1].append(timestamp)  # adding the timestamps as a list into the dictionary
        
file.close()

os.remove('fixed.csv')

In [8]:
for i in log.keys():  # updating the dictionary to contain also all next events 
    current = log[i][0]  # recording the cuurent case' events
    
    real_next = current[1:]  # next real events
    real_next.append('New Case')  # adding a 'new case' as real next event for every last event
    
    log[i].append(real_next)  # adding the real next events to the log file

# 2. Test Data

In [9]:
#  Repeating the same process from above on the test data.

data_test.to_csv("fixed_test.csv")

log_test = dict()
with open('fixed_test.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]
        
        task = parts[3]
        timestamp = parts[5]

        if caseid not in log_test:
            log_test[caseid] = [[],[]]

        log_test[caseid][0].append(task)
        log_test[caseid][1].append(timestamp)
        
file.close()

os.remove('fixed_test.csv')

In [10]:
"""Fixing a bug of cases that are in the test data but are incomplete due to the train-test split."""

bugs = []

for i in log_test.keys():  #  recording the cases which have events cut because of the train - test split
    if len(log_test[i][0]) == 1:
        bugs.append(i)
            
for x in bugs:  # deleting the above mentioned events 
    del log_test[x]

In [11]:
for i in log_test.keys():
    current = log_test[i][0]  # current case' events
    
    real_next = current[1:]  # next real events
    real_next.append('New Case')  # adding a 'new case' as real next event for every last event
    log_test[i].append(real_next) 

# 3. Storing the data

In [12]:
#  new dictionary that will contain for every position(key) the observed traces and next events for each trace(values)
#  so case [A, B, C] would be saved as {0:[[A],[B]], 1: [[A,B], [C]], 2: [[A, B, C], [New Case]]} 
train_data = {} 

for i in log.keys():
    for x in log[i][0]:
        case = log[i][0]
        ind = log[i][0].index(x)
        
        if ind not in train_data:  # making the two lists in the dictionary
            train_data[ind] = [[],[]]  # list 1 is all for all traces of the position, list 2 is for all next events
        
        
        train_data[ind][0].append(case[:ind+1])  # appending the trace
        
        if ind < len(case)-1:
            train_data[ind][1].append(case[ind+1])  # appending the next event of the trace
            
        elif ind == len(case)-1:
            train_data[ind][1].append('New Case')

In [13]:
#  repeating the same process on the test data
test_data = {} 

for i in log_test.keys():
    for x in log_test[i][0]:
        case = log_test[i][0]
        ind = log_test[i][0].index(x)
        
        if ind not in test_data:
            test_data[ind] = [[],[]]
        
        
        test_data[ind][0].append(case[:ind+1])  # appending the trace
        
        if ind < len(case)-1:
            test_data[ind][1].append(case[ind+1])  # appending the next event of the trace
            
        elif ind == len(case)-1:
            test_data[ind][1].append('New Case')

# 4. Encoding 

In [14]:
# encoding all unique event names of all the data into integers

cases = list(data_train['event concept:name'].unique()) + list(data_test['event concept:name'].unique())  # all events
cases.append('New Case')  #  adding the 'New Case' because we predict next event is going to be new case
cases = list(set(cases)) 
le = preprocessing.LabelEncoder()
le.fit(cases)  # encoding all event names into integers

LabelEncoder()

TRAIN

In [15]:
pbar = ProgressBar()

for i in pbar(train_data.keys()):  # the dictionaries from above are encoded into integers 
    
    encoded = []
    for trace in train_data[i][0]:  # encoding all strings of a trace, can be multiple if case lenght is more than 2
        local_encoded = []
        for event in trace:
            local_encoded.append(int(le.transform([event])))  # transforming into integer
        encoded.append(local_encoded)
    
    train_data[i][0] = np.array(encoded)  # making the list with integers into array so the tree can take it
    
    
    encoded_next = []  # encoding all strings of next events for a trace, its always length 1 !
    for g in train_data[i][1]:
        encoded_next.append(int(le.transform([g])))  # transforming into integer
                            
                            
    train_data[i][1] = np.array(encoded_next)  #  making the list with integers into array
        

100% |########################################################################|


TEST

In [16]:
# repeating the procedure from above on the test data

pbar = ProgressBar()

for i in pbar(test_data.keys()):
    
    encoded = []
    for trace in test_data[i][0]:
        local_encoded = []
        for event in trace:
            local_encoded.append(int(le.transform([event])))
        encoded.append(local_encoded)
    
    test_data[i][0] = np.array(encoded)
    
    
    encoded_next = []
    for g in test_data[i][1]:
        encoded_next.append(int(le.transform([g])))
                            
                            
    test_data[i][1] = np.array(encoded_next)

100% |########################################################################|


# 5. Training the decision tree

In [17]:
# Function for training decision tree for any given position (as long as the position is in the train data)

def decision_tree(pos):

    x_train= train_data[pos][0]
    y_train= train_data[pos][1]

    classifier = DecisionTreeClassifier()
    classifier.fit(x_train, y_train)
    
    return classifier

In [18]:
predictors = {}  # dictionary to contain all decision trees given the position
#  key - position, value - decision tree for that position

for i in range(len(test_data)):
    if i >= len(train_data) - 1:
        predictors[i] = decision_tree(len(train_data) - 1)
        
    else:
        predictors[i] = decision_tree(i)

# 6. Adding predictions

In [19]:
pbar = ProgressBar()

for i in pbar(log_test.keys()):  # adding an array with the encoding to the log_test dict. for every case in the test 
    current = log_test[i][0]
    
    
    encoded = []  # list will contain all event names encoded into integers
    for g in current:
        encoded.append(int(le.transform([g])))
    encoded = np.array(encoded)
    log_test[i].append(encoded)
    

100% |########################################################################|


In [20]:
pbar = ProgressBar()
for i in pbar(log_test.keys()):  # making predictions for every case in the log_test dict
    
    current_encoded = log_test[i][3]
    predictions = []  # list that will contain all predictions for a given case
    
    for x in current_encoded:
        ind = list(current_encoded).index(x)
        
        
        # the if-else is a checks whether the case length is more than any case length observed in the train data
        if ind >= len(train_data) - 1: # if its in the train data we call the appropriate decision tree
            
            
            tree = predictors[len(train_data) - 1]  # calling the right tree given the position
            p = current_encoded[:(len(train_data))]  # taking the trace
            p = p.reshape(1, -1)
            pred = tree.predict(p)  # making a prediction 
            pred_string = le.inverse_transform(pred)[0]  # transforming the prediction into a string
            predictions.append(pred_string)  # appending the prediction as a string to the log_test data
            
            
            
        else:  # if its not in the train data then we use the last observed decision tree from the train data
        
            tree = predictors[ind]  # calling the right tree given the position
            p = current_encoded[:ind+1]  # taking the trace
            p = p.reshape(1, -1)  # we need to do that, idk why
            pred = tree.predict(p)  # making a prediction
            pred_string = le.inverse_transform(pred)[0]  # transforming the prediction into a string
            predictions.append(pred_string)  # appending the prediction as a string to the log_test data 
        
    log_test[i].append(predictions)  # adding all predictions to the log_test of the current case

100% |########################################################################|


# 7. Evaluation

In [21]:
# making lists for every column we will have in the frame
case_names = []
event_names = []
timestamp = []
p_event = []
current_real = []

for i in log_test.keys():  # appending the right things to every list from the log_test file
    for x in range(len(log_test[i][0])):
        case_names.append(i)
        event_names.append(log_test[i][0][x])
        timestamp.append(log_test[i][1][x])
        p_event.append(log_test[i][4][x])
        current_real.append(log_test[i][2][x])

# dictionary that will be used to make the frame
frame_dict = {'Case_ID': case_names, 'Event_Name': event_names,
              'TimeStamp': timestamp, 'Next_Event': current_real, 'Predicted_Event': p_event}
predicted_df = pd.DataFrame.from_dict(frame_dict)  # making a frame 

event_real = np.array(predicted_df['Next_Event'])  # taking next event col. as an array
event_pred = np.array(predicted_df['Predicted_Event'])  # taking the predictions as an array 

acc = accuracy_score(event_real, event_pred)  # calculates the accuracy based on the both arrays
print('Accuracy for event prediction TEST SET: {}%'.format(round(acc, 2) * 100))

Accuracy for event prediction TEST SET: 69.0%


In [22]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 233.47721219062805 seconds ---


In [23]:
predicted_df

Unnamed: 0,Case_ID,Event_Name,TimeStamp,Next_Event,Predicted_Event
0,A28905,Create Fine,2009-09-25,Send Fine,Send Fine
1,A28905,Send Fine,2010-01-19,Insert Fine Notification,Insert Fine Notification
2,A28905,Insert Fine Notification,2010-08-02,Add penalty,Add penalty
3,A28905,Add penalty,2010-09-04,Send for Credit Collection,Send for Credit Collection
4,A28905,Send for Credit Collection,2012-03-26,New Case,New Case
...,...,...,...,...,...
96156,V19305,Send Appeal to Prefecture,2012-02-24,New Case,New Case
96157,V19308,Create Fine,2011-07-10,Send Fine,Send Fine
96158,V19308,Send Fine,2012-01-30,Insert Fine Notification,Insert Fine Notification
96159,V19308,Insert Fine Notification,2012-02-24,Add penalty,Add penalty
