# Decision Tree predicting the next event of a case - Full Trace

In [None]:
import os
import pandas as pd
import numpy as np
from math import sqrt
import datetime
from datetime import date
from itertools import repeat 
from progressbar import ProgressBar
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LinearRegression 
from sklearn import metrics 
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, accuracy_score
import time

In [None]:
# Wall-clock reference point; the elapsed-time prints further down subtract it.
start_time = time.time()

In [None]:
# Move the working directory one level up; the relative './data/...' paths
# used when loading the CSV files are resolved against it.
os.chdir("../")

In [None]:
# Load the pre-split train and test event logs. Malformed rows are dropped
# with a warning: `on_bad_lines='warn'` replaces the removed
# `error_bad_lines=False` (whose companion `warn_bad_lines` defaulted to True).
data_train = pd.read_csv('./data/road-train-pre.csv', on_bad_lines='warn')
data_test = pd.read_csv('./data/road-test-pre.csv', on_bad_lines='warn')

In [None]:
# Parse the timestamp column, then order the training log by case and
# chronologically within each case.
train_stamps = pd.to_datetime(data_train['event time:timestamp'])
data_train['event time:timestamp'] = train_stamps
data_train = data_train.sort_values(['case concept:name', 'event time:timestamp'])

In [None]:
# Parse the timestamp column, then order the test log by case and
# chronologically within each case.
test_stamps = pd.to_datetime(data_test['event time:timestamp'])
data_test['event time:timestamp'] = test_stamps
data_test = data_test.sort_values(['case concept:name', 'event time:timestamp'])

# 1. Train Data

In [None]:
# Build `log`: case id -> [list of event names, list of timestamps].
# The sorted frame is round-tripped through a temporary CSV and parsed line by
# line; in each line field 0 is the frame index written by to_csv.
# NOTE(review): fields 2, 3 and 5 are presumably the case id, event name and
# timestamp columns -- confirm against the CSV column order.
data_train.to_csv("fixed.csv")

log = dict()  # key: case name; value: [events, timestamps]
# A single handle managed by `with`; the extra bare open() that used to precede
# it was never closed.
with open('fixed.csv', 'r') as file:
    next(file)  # skip the header row
    for line in file:
        line = line.strip()
        if not line:
            continue
        parts = line.split(',')
        caseid = parts[2]
        task = parts[3]
        timestamp = parts[5]

        if caseid not in log:
            log[caseid] = [[], []]

        log[caseid][0].append(task)  # events of the case, in file order
        log[caseid][1].append(timestamp)  # matching timestamps

os.remove('fixed.csv')

In [None]:
# Extend every case entry with its "next event" list: the events shifted left
# by one, with 'New Case' standing in as the successor of the last event.
for case_id in log:
    case_events = log[case_id][0]
    log[case_id].append(case_events[1:] + ['New Case'])

# 2. Test Data

In [None]:
# Build `log_test` for the test data the same way as `log`:
# case id -> [list of event names, list of timestamps].
# NOTE(review): fields 2, 3 and 5 are presumably the case id, event name and
# timestamp columns -- confirm against the CSV column order.
data_test.to_csv("fixed_test.csv")

log_test = dict()
# A single handle managed by `with`; the extra bare open() that used to precede
# it was never closed.
with open('fixed_test.csv', 'r') as file:
    next(file)  # skip the header row
    for line in file:
        line = line.strip()
        if not line:
            continue
        parts = line.split(',')
        caseid = parts[2]
        task = parts[3]
        timestamp = parts[5]

        if caseid not in log_test:
            log_test[caseid] = [[], []]

        log_test[caseid][0].append(task)
        log_test[caseid][1].append(timestamp)

os.remove('fixed_test.csv')

In [None]:
"""Fixing a bug of cases that are in the test data but are incomplete due to the train-test split."""

# Test cases with exactly one recorded event are collected first and removed
# afterwards, so the dictionary is not modified while it is being iterated.
bugs = [case_id for case_id in log_test if len(log_test[case_id][0]) == 1]

for case_id in bugs:
    del log_test[case_id]

In [None]:
# Extend every test case entry with its "next event" list: the events shifted
# left by one, with 'New Case' as the successor of the last event.
for case_id in log_test:
    case_events = log_test[case_id][0]
    log_test[case_id].append(case_events[1:] + ['New Case'])

## Case-length statistics and filtering

In [None]:
# m: largest number of events in any training case (0 when `log` is empty).
m = max([0] + [len(entry[0]) for entry in log.values()])

# Training cases with more than ten events.
tenabove = [case_id for case_id in log if len(log[case_id][0]) > 10]

In [None]:
# Test cases with more than ten events.
tenabove_test = [case_id for case_id in log_test if len(log_test[case_id][0]) > 10]

# Raise m so it also covers the longest test case.
m = max([m] + [len(entry[0]) for entry in log_test.values()])

In [None]:
# Remove test cases that have more events than `m`, both from `log_test` and
# from the test DataFrame.
# NOTE(review): the preceding cell also raises `m` to the longest test case, so
# when that cell has been run this selection is empty.
delete = []
for i in log_test.keys():
    if len(log_test[i][0]) > m:
        print(i)
        print(log_test[i])
        delete.append(i)

for i in delete:
    print(i)

    # `test` is only bound later (as another name for data_test), so the rows
    # are dropped from data_test directly. The ids in `log_test` were parsed
    # from CSV text, hence the comparison on the string form of the column.
    case_rows = data_test.index[data_test['case concept:name'].astype(str) == i]
    data_test.drop(case_rows, inplace=True)

    del log_test[i]

# 3. Storing the data

In [None]:
# train_data maps a position to two parallel lists: the prefixes (traces) that
# end at that position and the event that followed each prefix.
# Example: case [A, B, C] contributes
#   {0: [[A], [B]], 1: [[A, B], [C]], 2: [[A, B, C], ['New Case']]}
train_data = {}

for case_entry in log.values():
    events = case_entry[0]
    final_pos = len(events) - 1
    for pos in range(len(events)):
        bucket = train_data.setdefault(pos, [[], []])  # [traces, next events]
        bucket[0].append(events[:pos + 1])
        # The last event of a case is followed by the 'New Case' marker.
        bucket[1].append(events[pos + 1] if pos < final_pos else 'New Case')

In [None]:
# test_data has the same layout as train_data: position -> [traces, next events],
# built from the test cases.
test_data = {}

for case_entry in log_test.values():
    events = case_entry[0]
    final_pos = len(events) - 1
    for pos in range(len(events)):
        bucket = test_data.setdefault(pos, [[], []])  # [traces, next events]
        bucket[0].append(events[:pos + 1])
        # The last event of a case is followed by the 'New Case' marker.
        bucket[1].append(events[pos + 1] if pos < final_pos else 'New Case')

# 4. Encoding 

In [None]:
# Fit one label encoder on every event name seen in train or test, plus the
# artificial 'New Case' label that is predicted after the last event of a case.
all_event_names = set(data_train['event concept:name'].unique())
all_event_names.update(data_test['event concept:name'].unique())
all_event_names.add('New Case')
cases = list(all_event_names)
le = preprocessing.LabelEncoder()
le.fit(cases)  # event name -> integer code

TRAIN

In [None]:
pbar = ProgressBar()

# Replace the string traces / next events of every position with integer arrays.
# Each position is encoded with one transform call over the flattened traces
# instead of one call per event; this also avoids int() on a 1-element array,
# which newer NumPy versions deprecate.
for i in pbar(train_data.keys()):
    traces = train_data[i][0]
    next_events = train_data[i][1]

    # Every trace stored at position i has i + 1 events, so the flat codes can
    # be folded back into a (number of traces, i + 1) matrix for the tree.
    flat_codes = le.transform(np.asarray(traces).ravel())
    train_data[i][0] = flat_codes.reshape(-1, i + 1)

    # One next event per trace.
    train_data[i][1] = le.transform(next_events)
        

TEST

In [None]:
# Same encoding as for the training data, applied to test_data.
pbar = ProgressBar()

# Each position is encoded with one transform call over the flattened traces
# instead of one call per event; this also avoids int() on a 1-element array,
# which newer NumPy versions deprecate.
for i in pbar(test_data.keys()):
    traces = test_data[i][0]
    next_events = test_data[i][1]

    # Every trace stored at position i has i + 1 events.
    flat_codes = le.transform(np.asarray(traces).ravel())
    test_data[i][0] = flat_codes.reshape(-1, i + 1)

    # One next event per trace.
    test_data[i][1] = le.transform(next_events)

# 5. Training the decision tree

In [None]:
# Function for training decision tree for any given position (as long as the position is in the train data)

def decision_tree(pos, random_state=None):
    """Fit a decision tree on the training traces stored at position *pos*.

    Args:
        pos: Key into ``train_data``. ``train_data[pos][0]`` is used as the
            feature matrix and ``train_data[pos][1]`` as the target labels.
        random_state: Forwarded to ``DecisionTreeClassifier``; pass an int to
            get reproducible trees. ``None`` (default) keeps the previous
            behaviour.

    Returns:
        The fitted ``DecisionTreeClassifier``.

    Raises:
        KeyError: If ``pos`` is not a key of ``train_data``.
    """
    x_train = train_data[pos][0]
    y_train = train_data[pos][1]

    classifier = DecisionTreeClassifier(random_state=random_state)
    classifier.fit(x_train, y_train)

    return classifier

In [None]:
# predictors: position -> fitted decision tree.
# A tree is trained for every position present in the training data, so the
# per-position evaluation on train data can look up any training position even
# when no test case is that long.
last_train_pos = len(train_data) - 1
predictors = {pos: decision_tree(pos) for pos in train_data}

# Test positions beyond the longest training case reuse the tree of the last
# training position instead of retraining an identical model each time.
for i in test_data.keys():
    if i > last_train_pos:
        predictors[i] = predictors[last_train_pos]

# 6. Adding predictions

In [None]:
pbar = ProgressBar()

# Append the integer-encoded event sequence to every test case entry.
# The whole case is encoded in one transform call; calling int() on a
# 1-element array per event is deprecated by newer NumPy versions.
for i in pbar(log_test.keys()):
    current = log_test[i][0]
    encoded = le.transform(current)  # integer array, one code per event
    log_test[i].append(encoded)
    

In [None]:
pbar = ProgressBar()
last_train_pos = len(train_data) - 1  # last position that has its own training data

# For every test case, predict the next event after each of its events and
# append the list of predicted names to the case entry.
for case_id in pbar(log_test.keys()):
    current_encoded = log_test[case_id][3]
    predictions = []

    for count in range(len(current_encoded)):
        # Positions up to the last training position use their own tree and the
        # full prefix; later positions fall back to the last training tree and
        # the prefix cut to the length that tree was fitted on.
        pos = min(count, last_train_pos)
        tree = predictors[pos]
        p = current_encoded[:pos + 1].reshape(1, -1)  # one sample, pos + 1 features
        pred = tree.predict(p)
        predictions.append(le.inverse_transform(pred)[0])  # code -> event name

    log_test[case_id].append(predictions)

# 7. Evaluation

In [None]:
# One list per column of the result frame, filled case by case.
case_names = []
event_names = []
timestamp = []
p_event = []
current_real = []

# Each case entry holds: [events, timestamps, real next events, encoded events, predictions]
for case_id, entry in log_test.items():
    n_events = len(entry[0])
    case_names.extend([case_id] * n_events)
    event_names.extend(entry[0][:n_events])
    timestamp.extend(entry[1][:n_events])
    p_event.extend(entry[4][:n_events])
    current_real.extend(entry[2][:n_events])

# dictionary that will be used to make the frame
frame_dict = {'Case_ID': case_names, 'Event_Name': event_names,
              'TimeStamp': timestamp, 'Next_Event': current_real, 'Predicted_Event': p_event}
predicted_df = pd.DataFrame.from_dict(frame_dict)

event_real = np.array(predicted_df['Next_Event'])  # real next events
event_pred = np.array(predicted_df['Predicted_Event'])  # predicted next events

acc = accuracy_score(event_real, event_pred)
# Scale to percent before rounding; rounding first and multiplying afterwards
# can print float artefacts such as 56.99999999999999.
print('Accuracy for event prediction TEST SET: {}%'.format(round(acc * 100, 2)))

In [None]:
# Seconds elapsed since start_time was recorded.
print("--- %s seconds ---" % (time.time() - start_time))

# Accuracy per position

In [None]:
print('-- Accuracy per position on train data\n')
for i in train_data.keys():

    x_train = train_data[i][0]
    y_train = train_data[i][1]

    # `predictors` may lack an entry for a training position; train the tree
    # on demand in that case instead of failing with a KeyError.
    tree = predictors[i] if i in predictors else decision_tree(i)

    # Training accuracy: the tree is scored on the data it was fitted on.
    x_test = train_data[i][0]
    y_test = train_data[i][1]

    y_pred = tree.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    # Scale to percent before rounding to avoid float artefacts in the output.
    print('-- Position:', i, '-- Acc:', '{}%'.format(round(acc * 100, 2)), '-- Cases:', len(x_test),
          '-- Cases to train on:', len(x_train))

In [None]:
print('-- Accuracy per position on test data\n')
for i in test_data.keys():

    x_test = test_data[i][0]
    y_test = test_data[i][1]

    if i <= len(train_data) - 1:
        # Position has its own training data and tree.
        tree = predictors[i]

        y_pred = tree.predict(x_test)
        acc = accuracy_score(y_test, y_pred)
        # Scale to percent before rounding to avoid float artefacts in the output.
        print('-- Position:', i, '-- Acc:', '{}%'.format(round(acc * 100, 2)), '-- Cases:', len(x_test),
             '-- Cases to train on:', len(train_data[i][0]) )

    else:
        # Position is longer than any training case: cut the traces to the
        # number of features of the last training tree and use that tree.
        new_trace = [x[:len(train_data)] for x in x_test]
        tree = predictors[len(train_data) - 1]
        y_pred = tree.predict(new_trace)
        acc = accuracy_score(y_test, y_pred)
        print('-- Position:', i, '-- Acc:', '{}%'.format(round(acc * 100, 2)), '-- Cases:', len(x_test),
             '-- Cases to train on: 0')

# Linear regression for timestamp prediction

In [None]:
# `train` / `test` are additional names for the same DataFrame objects (no copy
# is made), so columns added through them also appear on data_train / data_test.
train = data_train
test = data_test

In [None]:
# Extra training columns: 1-based position of the event within its case and
# the day of the week of the event timestamp.
train['position_event'] = train.groupby('case concept:name').cumcount() + 1
train['week_day'] = train['event time:timestamp'].dt.dayofweek

In [None]:
# Label encoder for the regression features: the event names that occur in the
# training data, plus the artificial 'New Case' label.
cases = ['New Case'] + train['event concept:name'].unique().tolist()
le_case = preprocessing.LabelEncoder()
le_case.fit(cases)

In [None]:
# Label encoder for the lifecycle transition values found in the training data.
life = pd.unique(train['event lifecycle:transition']).tolist()
le_life = preprocessing.LabelEncoder()
le_life.fit(life)

In [None]:
# Build the raw training feature columns; each one is reshaped to (n_rows, 1)
# so they can be concatenated side by side afterwards.

# Position of the event within its case
x_train_position = train['position_event'].to_numpy().reshape(-1, 1)

# Event of the row itself (the "previous" event relative to the one predicted)
x_train_prev = le_case.transform(list(train['event concept:name'])).reshape(-1, 1)

# Following row's event: append 'New Case' at the end, then drop the first entry
shifted_events = list(train['event concept:name']) + ['New Case']
x_train_event = le_case.transform(shifted_events).reshape(-1, 1)[1:]

# Day of the week of the row's event
x_train_week = train['week_day'].to_numpy().reshape(-1, 1)

# Timestamp of the following row: append None at the end, then drop the first entry
train[['event time:timestamp']] = train[['event time:timestamp']].astype(str)
shifted_dates = list(train['event time:timestamp']) + [None]
x_train_date = np.array(shifted_dates).reshape(-1, 1)[1:]

# Timestamp of the row itself
x_train_date_prev = np.array(list(train['event time:timestamp'])).reshape(-1, 1)

# Lifecycle transition of the row
x_train_life = le_life.transform(list(train['event lifecycle:transition'])).reshape(-1, 1)

In [None]:
# Per-row case length for the train set: the event count of each case, repeated
# once for every event of that case (in group order).
cases = train.groupby(['case concept:name'])
per_case = pd.DataFrame({'no of events': cases['eventID '].count()})
lst_per_case = per_case["no of events"].tolist()
case_length = [length for length in lst_per_case for _ in range(length)]
x_train_length_case = np.array(case_length).reshape(-1, 1)

In [None]:
# Place the single-column training features side by side (one row per event).
train_feature_columns = (
    x_train_position,
    x_train_prev,
    x_train_event,
    x_train_week,
    x_train_date,
    x_train_date_prev,
    x_train_length_case,
    x_train_life,
)
x_train_new = np.concatenate(train_feature_columns, axis=1)

In [None]:
# Assemble the training frame and derive the regression target `in_between`
# (whole days between an event and the following one).
# The code of the 'New Case' label is looked up from the encoder rather than
# hard-coded, so it stays correct if the set of event names changes.
new_case_code = int(le_case.transform(['New Case'])[0])

df_train = pd.DataFrame(data=x_train_new, columns=['position_event', 'prev_event', 'event', 'week_day_prev', 'date', 'date_prev', 'case_length', 'lifecycle'])
# The last event of a case is followed by 'New Case', not by the next row's event.
df_train.loc[df_train['position_event'] == df_train['case_length'], 'event'] = new_case_code
df_train[['date','date_prev']] = df_train[['date','date_prev']].apply(pd.to_datetime)
# There is no following timestamp at the end of a case; the target is set to 0 there.
df_train.loc[df_train['event'] == new_case_code, 'date'] = None
df_train['in_between'] = (df_train['date'] - df_train['date_prev']).dt.days
df_train.loc[df_train['event'] == new_case_code, 'in_between'] = 0

## Train Dummies

In [None]:
# One-hot encode the categorical training features and drop the raw timestamps.
df_train=pd.get_dummies(df_train, columns=['event', 'prev_event', 'week_day_prev', 'position_event', 'lifecycle'])
# `axis` is passed by keyword: the positional form is no longer accepted by
# recent pandas versions.
df_train = df_train.drop(['date', 'date_prev'], axis=1)

## Test Data Preprocessing

In [None]:
# Extra test columns: 1-based position of the event within its case and the
# day of the week of the event timestamp.
test['position_event'] = test.groupby('case concept:name').cumcount() + 1
test['week_day'] = test['event time:timestamp'].dt.dayofweek

In [None]:
# Attach the decision-tree predictions to the test frame as input for the
# regression features. `predicted_df` only has rows for cases still present in
# `log_test`, so rows of cases that were removed from `log_test` are dropped
# from `test` first; otherwise the column assignment fails on a length mismatch.
# Case ids are compared as strings because the `log_test` keys were parsed from
# CSV text.
# NOTE(review): assumes the rows of `predicted_df` follow the same order as the
# remaining rows of `test` -- confirm.
kept_rows = test['case concept:name'].astype(str).isin(set(log_test))
if not kept_rows.all():
    test = test[kept_rows].copy()
predicted_events = predicted_df['Predicted_Event'].tolist()
test['pred_event'] = predicted_events

In [None]:
# Build the raw test feature columns; each one is reshaped to (n_rows, 1) so
# they can be concatenated side by side afterwards.

# Position of the event within its case
x_test_position = test['position_event'].to_numpy().reshape(-1, 1)

# Event of the row itself (the "previous" event relative to the one predicted)
x_test_prev = le_case.transform(test['event concept:name'].tolist()).reshape(-1, 1)

# Next event as predicted by the decision trees
x_test_event = le_case.transform(test['pred_event'].tolist()).reshape(-1, 1)

# Day of the week of the row's event
x_test_week = np.array(test['week_day'].tolist()).reshape(-1, 1)

# Timestamp of the following row: append None at the end, then drop the first entry
test[['event time:timestamp']] = test[['event time:timestamp']].astype(str)
shifted_test_dates = list(test['event time:timestamp']) + [None]
x_test_date = np.array(shifted_test_dates).reshape(-1, 1)[1:]

# Timestamp of the row itself
x_test_date_prev = np.array(list(test['event time:timestamp'])).reshape(-1, 1)

# Lifecycle transition of the row
x_test_life = le_life.transform(test['event lifecycle:transition'].tolist()).reshape(-1, 1)

In [None]:
# Per-row case length for the test set: the event count of each case, repeated
# once for every event of that case (in group order).
test_cases = test.groupby(['case concept:name'])
per_case_test = pd.DataFrame({'no of events': test_cases['eventID '].count()})
lst_per_case_test = per_case_test["no of events"].tolist()
case_length_test = [length for length in lst_per_case_test for _ in range(length)]
x_test_length_case = np.array(case_length_test).reshape(-1, 1)

In [None]:
# Place the single-column test features side by side (one row per event).
test_feature_columns = (
    x_test_position,
    x_test_prev,
    x_test_event,
    x_test_week,
    x_test_date,
    x_test_date_prev,
    x_test_length_case,
    x_test_life,
)
x_test_new = np.concatenate(test_feature_columns, axis=1)

In [None]:
# Assemble the test frame and derive the regression target `in_between`
# (whole days between an event and the following one).
test_column_names = ['position_event', 'prev_event', 'event', 'week_day_prev',
                     'date', 'date_prev', 'case_length', 'lifecycle']
df_test = pd.DataFrame(data=x_test_new, columns=test_column_names)

# Rows holding the last event of a case have no following timestamp.
is_case_end = df_test['position_event'] == df_test['case_length']
df_test.loc[is_case_end, 'date'] = None
df_test[['date','date_prev']] = df_test[['date','date_prev']].apply(pd.to_datetime)
df_test['in_between'] = (df_test['date'] - df_test['date_prev']).dt.days
# The target is fixed to 0 at the end of a case.
df_test.loc[is_case_end, 'in_between'] = 0

In [None]:
# Keep only test rows whose case is no longer than the longest training case.
longest_train_case = max(df_train['case_length'])
fits_train_range = df_test['case_length'] <= longest_train_case
df_test = df_test[fits_train_range]

## Test Dummies

In [None]:
# One-hot encode the categorical test features and drop the raw timestamps.
df_test=pd.get_dummies(df_test, columns=['event', 'prev_event', 'week_day_prev', 'position_event', 'lifecycle'])
# `axis` is passed by keyword: the positional form is no longer accepted by
# recent pandas versions.
df_test = df_test.drop(['date', 'date_prev'], axis=1)

## Feature Selection and Model training

In [None]:
# Use only the columns present in both frames: after one-hot encoding, train
# and test can end up with different dummy columns.
col_train = df_train.columns  # was referenced below without being defined
col_test = df_test.columns
# A sorted list gives a deterministic column order and is a valid DataFrame
# indexer (a set is not accepted by recent pandas versions). The target column
# is excluded from the features.
features = sorted(set(col_train).intersection(col_test) - {'in_between'})
X_train = df_train[features]  # Features
y_train = df_train['in_between']  # Target variable
X_test = df_test[features]  # Features
y_test = df_test['in_between']  # Target variable

# Training the algorithm
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Evaluation

In [None]:
# Predict the days until the next event for the test rows and keep the actual
# and predicted values side by side.
y_pred = regressor.predict(X_test)
df_predictions = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)

In [None]:
# Root mean squared error of the regression on the test rows
# (same unit as `in_between`, i.e. days).
rmse = sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Seconds elapsed since start_time was recorded, now including the regression part.
print("--- %s seconds ---" % (time.time() - start_time))