# Decision Trees for predicting the next event of a case



In [1]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, accuracy_score
from progressbar import ProgressBar
import numpy as np
import time

In [2]:
start_time = time.time()

In [3]:
os.chdir("../")

In [5]:
data_train = pd.read_csv('data/road-train-pre.csv', error_bad_lines=False)
data_test = pd.read_csv('data/road-test-pre.csv', error_bad_lines=False)

In [6]:
data_train['event time:timestamp'] = pd.to_datetime(data_train['event time:timestamp'])
data_train = data_train.sort_values(by=['case concept:name', 'event time:timestamp'])

In [7]:
data_test['event time:timestamp'] = pd.to_datetime(data_test['event time:timestamp'])
data_test = data_test.sort_values(by=['case concept:name', 'event time:timestamp'])

# 1. Train Data

In [8]:
data_train.to_csv("fixed.csv")

file = open('fixed.csv', 'r')
log = dict()
with open('fixed.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]

        task = parts[3]
        timestamp = parts[5]

        if caseid not in log:
            log[caseid] = [[],[]]

        log[caseid][0].append(task)
        log[caseid][1].append(timestamp)
        
file.close()

os.remove('fixed.csv')

In [9]:
for i in log.keys():
    current = log[i][0]  # recording the current case' events
    
    real_next = current[1:]  # next real event
    real_next.append('New Case')  # adding a 'new case' as real next event for every last event
    
    log[i].append(real_next)  # adding the real next events to the log file

# 2. Test Data

In [10]:
data_test.to_csv("fixed_test.csv")

file = open('fixed_test.csv', 'r')
log_test = dict()
with open('fixed_test.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]
        
        task = parts[3]
        timestamp = parts[5]

        if caseid not in log_test:
            log_test[caseid] = [[],[]]

        log_test[caseid][0].append(task)
        log_test[caseid][1].append(timestamp)
        
file.close()

os.remove('fixed_test.csv')

In [11]:
"""Fixing a bug of cases that are in the test data but are incomplete due to the train-test split."""

bugs = []

for i in log_test.keys():
    if len(log_test[i][0]) == 1:
        bugs.append(i)
            
for x in bugs:
    del log_test[x]

In [12]:
for i in log_test.keys():
    current = log_test[i][0]  # current case' events
    
    real_next = current[1:]  # next real event
    real_next.append('New Case')  # adding a 'new case' as real next event for every last event
    log_test[i].append(real_next) 

# 3. Storing the data

## 3.1 TRAIN

In [13]:
event_list = []  # list of lists that contains all event names for every case, where each sublist is a new case.
for case in log.keys():
    event_list.append((log[case][0]))


events_longest_event = max(map(len,event_list))  # longest case

pos_events_train = []  # list of lists which has all event names for every position, where each sublist is a new position
for i in range(events_longest_event):  # creates a list every iteration which is appended to the above list
    pos_events_train.append([])
    for case in event_list:
        if (len(case)-1) >= i:
            pos_events_train[i].append(case[i])
        else:
            pass

In [14]:
events_train = {}

for i in range(len(pos_events_train)-1):
    events_train[i] = []
    
    previous_events = pos_events_train[i-1]
    current_events = pos_events_train[i]
    next_events = pos_events_train[i+1]
    
    events_train[i].append(previous_events)
    events_train[i].append(current_events)
    events_train[i].append(next_events)
events_train[0][0] = ['New Case'] * 120296 

In [15]:
for i in events_train.keys():
    if len(events_train[i][1]) > len(events_train[i][2]):
        diff = len(events_train[i][1]) - len(events_train[i][2])
        for x in range(diff):
            events_train[i][2].append('New Case')

## 3.2 TEST

In [16]:
event_list_test = []  # list of lists that contains all event names for every case, where each sublist is a new case.
for case in log_test.keys():
    event_list_test.append((log_test[case][0]))


events_longest_event_test = max(map(len,event_list_test))  # longest case

pos_events_test = []  # list of lists which has all event names for every position, where each sublist is a new position
for i in range(events_longest_event_test):  # creates a list every iteration which is appended to the above list
    pos_events_test.append([])
    for case in event_list_test:
        if (len(case)-1) >= i:
            pos_events_test[i].append(case[i])
        else:
            pass

In [17]:
events_test = {}

for i in range(len(pos_events_test)-1):
    events_test[i] = []
    previous_events = pos_events_test[i-1]
    current_events = pos_events_test[i]
    next_events = pos_events_test[i+1]
    
    events_test[i].append(previous_events)
    events_test[i].append(current_events)
    events_test[i].append(next_events)
events_test[0][0] =  ['New Case'] * len(events_test[0][0]) 

In [18]:
for i in events_test.keys():
    if len(events_test[i][1]) > len(events_test[i][2]):
        diff = len(events_test[i][1]) - len(events_test[i][2])
        for x in range(diff):
            events_test[i][2].append('New Case')

# 4. Training the decision trees


In [19]:
cases = list(data_train['event concept:name'].unique()) + list(data_test['event concept:name'].unique())  # list of all unique event names
cases.append('New Case')
cases = list(set(cases))
le = preprocessing.LabelEncoder()
le.fit(cases)  # encoding all event names into integers

LabelEncoder()

In [22]:
def decision_tree(pos):

    x_train= le.transform(events_train[pos][1]).reshape(-1, 1)
    x_previous = le.transform(events_train[pos][0]).reshape(-1 ,1)
    x_train_new = np.concatenate((x_train,x_previous), axis=1)
    y_train= le.transform(events_train[pos][2]).reshape(-1, 1)

    x_test= le.transform(events_test[pos][0]).reshape(-1, 1)
    y_test= le.transform(events_test[pos][1]).reshape(-1, 1)
    
    classifier = DecisionTreeClassifier()
    classifier.fit(x_train_new, y_train)
    
    return classifier

In [23]:
predictors = {}

for i in range(len(events_train)):
    predictors[i] = decision_tree(i)

# 5. Making Predictions

In [29]:
pbar = ProgressBar()

for i in pbar(log_test.keys()):
    current = log_test[i][0]
    predictions = []
    
    count = 0
    for j in range(len(current)):
        x = current[j]
        if j == 0:
            prev = 'New Case'
        else:
            prev = current[j-1]
            
        if count in predictors:
            tree = predictors[count]
            transf= le.transform([x]).reshape(-1, 1)
            transf_prev = le.transform([prev]).reshape(-1,1)
            trans = np.concatenate((transf, transf_prev), axis=1)
            prediction = tree.predict(trans)
            pred_string = le.inverse_transform(prediction)[0]
            count += 1
            predictions.append(pred_string)
        else:
            tree = predictors[count-1]
            transf= le.transform([x]).reshape(-1, 1)
            transf_prev = le.transform([prev]).reshape(-1,1)
            trans = np.concatenate((transf, transf_prev), axis=1)
            prediction = tree.predict(trans)
            pred_string = le.inverse_transform(prediction)[0]
            predictions.append(pred_string)
            
    
    log_test[i].append(predictions)

100% |########################################################################|


In [30]:
log_test

{'A28905': [['Create Fine',
   'Send Fine',
   'Insert Fine Notification',
   'Add penalty',
   'Send for Credit Collection'],
  ['2009-09-25', '2010-01-19', '2010-08-02', '2010-09-04', '2012-03-26'],
  ['Send Fine',
   'Insert Fine Notification',
   'Add penalty',
   'Send for Credit Collection',
   'New Case'],
  ['Send Fine',
   'New Case',
   'Add penalty',
   'Send for Credit Collection',
   'New Case']],
 'A28907': [['Create Fine', 'Send Fine'],
  ['2009-09-25', '2010-01-19'],
  ['Send Fine', 'New Case'],
  ['Send Fine', 'New Case']],
 'A29038': [['Send Fine', 'Create Fine'],
  ['2009-07-28', '2009-08-05'],
  ['Create Fine', 'New Case'],
  ['Create Fine', 'New Case']],
 'A29040': [['Send Fine',
   'Insert Fine Notification',
   'Add penalty',
   'Create Fine',
   'Payment',
   'Payment'],
  ['2009-07-28',
   '2009-08-17',
   '2009-10-16',
   '2009-11-05',
   '2010-02-02',
   '2010-02-27'],
  ['Insert Fine Notification',
   'Add penalty',
   'Create Fine',
   'Payment',
   'Paymen

# 6. Evaluating Accuracy

In [31]:
case_names = []
event_names = []
timestamp = []
p_event = []
current_real = []

for i in log_test.keys():
    for x in range(len(log_test[i][0])):
        case_names.append(i)
        event_names.append(log_test[i][0][x])
        timestamp.append(log_test[i][1][x])
        p_event.append(log_test[i][3][x])
        current_real.append(log_test[i][2][x])


frame_dict = {'Case_ID': case_names, 'Event_Name': event_names,
              'TimeStamp': timestamp, 'Current_Event': current_real, 'Predicted_Event': p_event}
predicted_df = pd.DataFrame.from_dict(frame_dict)

event_real = np.array(predicted_df['Current_Event'])
event_pred = np.array(predicted_df['Predicted_Event'])

acc = accuracy_score(event_real, event_pred)
print('Accuracy for event prediction TEST SET: {}%'.format(round(acc, 2) * 100))

Accuracy for event prediction TEST SET: 56.99999999999999%


In [32]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 294.3284430503845 seconds ---


In [33]:
predicted_df

Unnamed: 0,Case_ID,Event_Name,TimeStamp,Current_Event,Predicted_Event
0,A28905,Create Fine,2009-09-25,Send Fine,Send Fine
1,A28905,Send Fine,2010-01-19,Insert Fine Notification,New Case
2,A28905,Insert Fine Notification,2010-08-02,Add penalty,Add penalty
3,A28905,Add penalty,2010-09-04,Send for Credit Collection,Send for Credit Collection
4,A28905,Send for Credit Collection,2012-03-26,New Case,New Case
...,...,...,...,...,...
96156,V19305,Send Appeal to Prefecture,2012-02-24,New Case,New Case
96157,V19308,Create Fine,2011-07-10,Send Fine,Send Fine
96158,V19308,Send Fine,2012-01-30,Insert Fine Notification,New Case
96159,V19308,Insert Fine Notification,2012-02-24,Add penalty,Add penalty
