# Permutation algorithm, using combinations of two events

In [1]:
import pandas as pd
import numpy as np
import os
import numpy
import itertools
from itertools import tee, combinations, permutations
from progressbar import ProgressBar
import datetime
from sklearn.metrics import mean_squared_error, accuracy_score
import time
import calendar

In [2]:
start_time = time.time()

In [3]:
os.chdir("../")

# Training

In [4]:
data_train = pd.read_csv('./data/road-train-pre.csv', error_bad_lines=False)
data_test = pd.read_csv('./data/road-test-pre.csv', error_bad_lines=False)

In [5]:
data_train['event time:timestamp'] = pd.to_datetime(data_train['event time:timestamp'])

In [6]:
data_train = data_train.sort_values(by=['case concept:name', 'event time:timestamp'])

In [7]:
data_train['day_of_week'] = data_train['event time:timestamp'].dt.dayofweek

In [8]:
data_train.to_csv("fixed.csv")

In [9]:
file = open('fixed.csv', 'r')
log = dict()
with open('fixed.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]

        task = parts[3]
        timestamp = parts[5]
        day = parts[6]

        if caseid not in log:
            log[caseid] = [[],[],[]]

        log[caseid][0].append(task)
        log[caseid][1].append(timestamp)
        log[caseid][2].append(day)

file.close()
os.remove('fixed.csv')

In [10]:
def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

In [11]:
event_names = list(data_train['event concept:name'].unique())
event_names.append('New Event')
combs = []

for p in itertools.product(event_names, repeat=2):
    combs.append(p)

In [12]:
pbar = ProgressBar()

for i in pbar(log.keys()):
    ID = []
    stamps = []
    for pairID, stamp in zip(pairwise(log[i][0]), pairwise(log[i][1])):
        ID.append(pairID)
        stamps.append(stamp)
            
    log[i].append(ID)
    log[i].append(stamps)

100% |########################################################################|


In [13]:
pbar = ProgressBar()
count=0

for i in pbar(log.keys()):
    count=0
    for perm in log[i][3]:
        index = log[i][3].index(perm)
        if index == 0:
            log[i][3].insert(0, ('New Event', log[i][3][index][0]))
            
    for count in range(len(log[i][4])):
        stamp = log[i][4][count]
        if count == 0:
            log[i][4].insert(0, (stamp[0], stamp[0]))
            
        count += 1

100% |########################################################################|


In [14]:
def timeDiff(tupl):
    
    datetimeFormat = '%Y-%m-%d'
    diff = datetime.datetime.strptime(tupl[0], datetimeFormat)\
    - datetime.datetime.strptime(tupl[1], datetimeFormat)
    
    return abs(diff.days)

In [15]:
def listCount(lst: list): 
    cases = list(data_train['event concept:name'].unique())  # list of all unique event names
    cases.append('New Event')
    best = 0
    for x in cases:
        current = lst.count(x)
        if current >= best:
            best = current
            str_best = x
    return str_best

In [16]:
pbar = ProgressBar() 

comb_times = {}

for comb in pbar(combs):
    day_list = [[], [], [], [], [], [], []]

    for case in log.keys():
        if comb in log[case][3]:     
            index = log[case][3].index(comb)
            day = int(log[case][2][index])

            if index < (len(log[case][3]) - 1):
                nxt_event = log[case][3][index+1][1]  # we need the second item of the tuple, bc item 1 is repeated
                day_list[day].append(nxt_event)

            elif index == (len(log[case][3]) - 1):
                nxt_event = 'New Event'
                day_list[day].append(nxt_event)

        else:
            pass

    comb_times[comb] = day_list

    for i in range(len(day_list)):
        comb_times[comb][i] = listCount(comb_times[comb][i])

100% |########################################################################|


# Setting up Test Data

In [17]:
data_test['event time:timestamp'] = pd.to_datetime(data_test['event time:timestamp'])

data_test['event time:timestamp'] = pd.to_datetime(data_test['event time:timestamp'])

data_test = data_test.sort_values(by=['case concept:name', 'event time:timestamp'])

data_test['day_of_week'] = data_test['event time:timestamp'].dt.dayofweek

data_test.to_csv("fixed_test.csv")

In [18]:
t_log = dict()


with open('fixed_test.csv', 'r') as file_test:
    next(file_test)
    for line in file_test:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')

        caseid = parts[2]

        task = parts[3]
        timestamp = parts[5]
        day = parts[6]

        if caseid not in t_log:
            t_log[caseid] = [[],[],[]]

        t_log[caseid][0].append(task)
        t_log[caseid][1].append(timestamp)
        t_log[caseid][2].append(day)
file.close()
os.remove('fixed_test.csv')

In [19]:
"""Fixing a bug of cases that are in the test data but are incomplete due to the train-test split."""

bugs = []

for i in t_log.keys():
    if len(t_log[i][0]) == 1:
        bugs.append(i)
            
for x in bugs:
    del t_log[x]

# Full Trace

In [28]:
#  new dictionary that will contain for every position(key) the observed traces and next events for each trace(values)
#  so case [A, B, C] would be saved as {0:[[A],[B]], 1: [[A,B], [C]], 2: [[A, B, C], [New Case]]} 
train_data = {} 

for i in log.keys():
    count = 0
    for x in log[i][0]:
        case = log[i][0]
        #ind = log[i][0].index(x)
        
        if count not in train_data:  # making the two lists in the dictionary
            train_data[count] = [[],[]]  # list 1 is all for all traces of the position, list 2 is for all next events
        
        
        train_data[count][0].append(case[:count+1])  # appending the trace
        
        if count < len(case)-1:
            train_data[count][1].append(case[count+1])  # appending the next event of the trace
            
        elif count == len(case)-1:
            train_data[count][1].append('New Case')
            
        count += 1

#### In the train_data you have all positions observed in the training set, they are the keys! 

That means, train_data[0] is for position 0, train_data[1] is for 1 and so on. 

Each position contains a list which has two sublists.
- sublist 1: all observed traces up to the position. 
- sublist 2: all real next events for the above mentioned traces. Indexes match!

So, train_data[2][0] has all traces observed for position 3 and train_data[2][1] has all next events for these traces.

#### What you want to do is to take the most frequent event in the train_data[position][1]. There is a function in this notebook listCount() that takes list as an argument and its going to do it for you.

#### After that you need to use that as a prediction together with the permutations. You can do it by checking which one performs better for which position and then picking the one with the higher acc for that position (check it on the training data).

#### You can see the adding predictions section for the DT and how all events of a trace are split (look at the double for loop, namely - current_encoded[:(len(train_data))]. You will need that to match the traces up to current event with the ones from train_data. 

###### Once you are all done, please delete the explanation cells.

# Test prediction

In [20]:
pbar = ProgressBar()

for i in pbar(t_log.keys()):
    ID = []
    stamps = []
    for pairID, stamp in zip(pairwise(t_log[i][0]), pairwise(t_log[i][1])):
        ID.append(pairID)
        stamps.append(stamp)

    t_log[i].append(ID)
    t_log[i].append(stamps)

pbar = ProgressBar()
count = 0

100% |########################################################################|


In [21]:
for i in pbar(t_log.keys()):
    count = 0
    for perm in t_log[i][3]:
        index = t_log[i][3].index(perm)
        if index == 0:
            t_log[i][3].insert(0, ('New Event', t_log[i][3][index][0]))

    for count in range(len(t_log[i][4])):
        stamp = t_log[i][4][count]
        if count == 0:
            t_log[i][4].insert(0, (stamp[0], stamp[0]))

        count += 1

100% |########################################################################|


In [22]:
pbar = ProgressBar()

for i in pbar(t_log.keys()):
    # Add the real time differences
    real_diff = []
    for t in t_log[i][4]:
        real_diff.append(timeDiff(t))
    t_log[i].extend([real_diff])

100% |########################################################################|


In [23]:
"""Adding predictions based on the combination with respect to the week. """

for i in t_log.keys():
    current = t_log[i][3]
    prediction = []

    for perm in current:
        index = current.index(perm)
        day = int(t_log[i][2][index])
        current_prediction = comb_times[perm][day]

        if current_prediction != 0:
            prediction.append(current_prediction)
        else:
            merged_list = list(itertools.chain.from_iterable(comb_times[perm]))
            pred = listCount(merged_list)
            prediction.append(pred)

    t_log[i].extend([prediction])

    current_real = []

    for x in t_log[i][0]:
        if t_log[i][0].index(x) == 0:
            current_real.append('New Event')
        else:
            current_real.append(x)
    t_log[i].extend([current_real])

In [24]:
"""Storing all time differences for every combination."""

pbar = ProgressBar()

times = {}
for comb in pbar(combs):
    for case in log.keys():
        if comb in log[case][3]:
            count = log[case][3].index(comb)
            diff = timeDiff(log[case][4][count])
            if comb not in times:
                times[comb] = []
                times[comb].append(diff)
            else:
                times[comb].append(diff)
        else:
            pass
    if comb in times.keys():
        times[comb] = int(np.ceil(np.mean(times[comb])))

100% |########################################################################|


In [25]:
"""Prediction for time difference, we check whether event is last and then predict 0 for it!"""
for i in t_log.keys():
    time_pred = []
    for ev, pred, day in zip(t_log[i][0], t_log[i][6], t_log[i][2]):
        last = len(t_log[i][0]) - 1

        if t_log[i][0].index(ev) == last:
            time_pred.append(0)
        elif pred == 'New Event':
            time_pred.append(0)
        elif (ev, pred) in times:
            if int(day) + times[(ev, pred)] % 7 == 5:
                time_pred.append(times[(ev, pred)] + 2)
            elif int(day) + times[(ev, pred)] % 7 == 6:
                time_pred.append(times[(ev, pred)] + 1)
            else:
                time_pred.append(times[(ev, pred)])

    t_log[i].extend([time_pred])

# Making a dataframe with the predictions

In [26]:
case_names = []
event_names = []
timestamp = []
p_event = []
current_real = []

real_diff = []
pred_diff = []

for i in t_log.keys():
    for x in range(len(t_log[i][0])):
        case_names.append(i)
        event_names.append(t_log[i][0][x])
        timestamp.append(t_log[i][1][x])
        p_event.append(t_log[i][6][x])
        current_real.append(t_log[i][7][x])

        real_diff.append(t_log[i][5][x])
        pred_diff.append(t_log[i][8][x])

real_diff.append(0)

frame_dict = {'Case_ID': case_names, 'Event_Name': event_names,
              'TimeStamp': timestamp, 'Current_Event': current_real, 'Predicted_Event': p_event,
              'Real_Diff': real_diff[1:], 'Predicted_Diff': pred_diff}
predicted_df = pd.DataFrame.from_dict(frame_dict)

event_real = np.array(predicted_df['Current_Event'])
event_pred = np.array(predicted_df['Predicted_Event'])
event_real = event_real[1:]
event_pred = event_pred[:-1]

acc = accuracy_score(event_real, event_pred)
print('Accuracy for event prediction TEST SET: {}%'.format(round(acc, 2) * 100))

time_real = np.array(predicted_df['Real_Diff'])
time_pred = np.array(predicted_df['Predicted_Diff'])

time_pred = time_pred
time_real = time_real

rms = np.sqrt(mean_squared_error(time_real, time_pred))
print('Root mean squared error for time difference prediction TEST SET: {}'.format(round(rms, 2)))

Accuracy for event prediction TEST SET: 60.0%
Root mean squared error for time difference prediction TEST SET: 156.63


In [27]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 112.49065804481506 seconds ---
