In [None]:
#To execute on Google Colab
# !pip install pm4py
# !wget https://data.4tu.nl/ndownloader/files/24025820
# !mv 24025820 BPI_Challenge_2018.xes.gz
# !gzip -d BPI_Challenge_2018.xes.gz


In [None]:
from pm4py.objects.log.importer.xes import importer as xes_importer
import pm4py
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.utils import plot_model 
import tensorflow.keras.utils as ku 
import keras.backend as K
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
from pm4py.algo.filtering.log.timestamp import timestamp_filter


In [None]:
# Load the XES event log used by the rest of the notebook into `log`.
# The commented-out lines are other log files that were imported the same way;
# keep exactly one import active.
# log = xes_importer.apply('filtered_log.xes')
# log = xes_importer.apply('financial_log-filtered-5.xes')
# log = xes_importer.apply('SuncorpSkin.xes')
log = xes_importer.apply('BPI_Challenge_2018.xes')


In [None]:
# Histogram of trace lengths: key = number of events in a trace,
# value = how many traces in the log have exactly that many events.
sizes_dict = {}
for trace in log:
    trace_len = len(trace)
    sizes_dict[trace_len] = sizes_dict.get(trace_len, 0) + 1
print(sizes_dict)

In [None]:
# The keys of `sizes_dict` are trace lengths, so its size is the number of
# distinct trace lengths seen in the log (not the number of activity names).
print('Number of distinct trace lengths: ', len(sizes_dict))
# One example per trace in the log.
print('Number of examples: ', len(log))


In [None]:
# Assign every distinct activity name an integer code, in order of first
# appearance and starting at 1 (0 is never assigned to an activity).
vocab_index = {}
for trace in log:
    for event in trace:
        # setdefault only stores the new code when the name is not known yet.
        vocab_index.setdefault(event['concept:name'], len(vocab_index) + 1)
# One more than the number of activities, so that code 0 is also covered.
total_words = len(vocab_index) + 1


In [None]:
def get_data_with_filter(filtered_log, time_s, time_e):
    """Turn the events of a log that fall inside a time window into prefix sequences.

    Args:
        filtered_log: iterable of traces; each trace is an iterable of events
            exposing 'concept:name' and a datetime under "time:timestamp".
        time_s: window start, formatted as '%Y-%m-%d %H:%M:%S' (inclusive).
        time_e: window end, formatted as '%Y-%m-%d %H:%M:%S' (inclusive).

    Returns:
        A list of integer lists. For every trace, the in-window events are
        ordered by timestamp, mapped through the module-level `vocab_index`,
        and every prefix of length >= 2 is emitted (shortest first).

    Raises:
        KeyError: if an event name is missing from `vocab_index`, even when
            that event lies outside the window.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    window_start = datetime.timestamp(datetime.strptime(time_s, time_format))
    window_end = datetime.timestamp(datetime.strptime(time_e, time_format))

    prefixes = []
    for trace in filtered_log:
        stamped = []
        for event in trace:
            # The vocabulary lookup happens before the window check on purpose:
            # it keeps the failure behaviour for unknown names unchanged.
            code = vocab_index[event['concept:name']]
            when = datetime.timestamp(event["time:timestamp"])
            if window_start <= when <= window_end:
                stamped.append((code, when))
        if not stamped:
            continue
        # Stable chronological ordering, then drop the timestamps.
        ordered_codes = [code for code, _ in sorted(stamped, key=lambda pair: pair[1])]
        # A single remaining event yields no prefix (there is nothing to predict).
        prefixes.extend(ordered_codes[:end] for end in range(2, len(ordered_codes) + 1))
    return prefixes


# Chronological train / validation / test split.
# get_data_with_filter treats BOTH window bounds as inclusive, so consecutive
# windows must not share a boundary instant: each window starts one second
# after the previous one ends. (The test window previously started at the very
# second the validation window ended, so events stamped exactly then were
# placed in both splits.)
# Caveat: events with a sub-second timestamp strictly between an end bound and
# the next start bound fall into neither split.
input_sequences_train = get_data_with_filter(log, "2015-12-09 00:00:00", "2017-01-1 00:00:00")
input_sequences_val = get_data_with_filter(log, "2017-01-1 00:00:01", "2017-03-1 00:00:00")
input_sequences_test = get_data_with_filter(log, "2017-03-1 00:00:01", "2018-01-19 23:59:59")

print('Training size = ', len(input_sequences_train))
print('Val size = ', len(input_sequences_val))
print('Test size = ', len(input_sequences_test))



In [None]:
# Length of every training prefix, for a quick look at the distribution.
size_dist = list(map(len, input_sequences_train))

sns.histplot(size_dist)
plt.xlabel('Size')
plt.ylabel('#of examples')
# plt.xlim(0,200)


In [None]:
# Fixed width of every padded sequence. The last position is later split off as
# the label, so the model itself receives max_sequence_len - 1 tokens.
max_sequence_len = 50

In [None]:
# Index of the training prefix used as a running example in the next cells.
sample = 20
# Inverse vocabulary: integer code -> activity name.
reverse_vocab_index = {code: name for name, code in vocab_index.items()}
print("The entry ",sample," in 'input_sequences' is: ")
print(input_sequences_train[sample])
print(" and it corresponds to:")
for code in input_sequences_train[sample]:
    print(reverse_vocab_index[code], end=' ')
                        

Next, we pad every sequence in the training, validation and test sets to the same maximum length so that they can be processed in batches.

In [None]:
# Bring every prefix to exactly `max_sequence_len` tokens. padding='pre' adds
# zeros on the left, so the most recent activity always sits in the last column.
# (Alternative width: max_sequence_len = max([len(x) for x in input_sequences]))
_pad_options = {'maxlen': max_sequence_len, 'padding': 'pre'}
input_sequences_train = np.array(pad_sequences(input_sequences_train, **_pad_options))
input_sequences_val = np.array(pad_sequences(input_sequences_val, **_pad_options))
input_sequences_test = np.array(pad_sequences(input_sequences_test, **_pad_options))

Run the following cell to see the content of the padded `input_sequences_train` array.

In [None]:
print("The entry ",sample," in 'input_sequences' is: ")
print(input_sequences_train[sample])
print(" and it corresponds to:")
print("[", end=' ')
for code in input_sequences_train[sample]:
    # Codes without a vocabulary entry (the 0 used for padding) are shown as "__".
    print(reverse_vocab_index.get(code, "__"), end=' ')
print("]")


Given a sequence like **"A B A C"**, we want a model that receives the prefix **"A B A"** and predicts the next action -- in this case, the action **"C"**.

Therefore, the next cell splits each padded sequence into the model input (all positions but the last) and the expected output (the last position).

In [None]:
# Every padded row is [context ..., next activity]: the last column is the
# label and all preceding columns are the model input.
input_to_model_train = input_sequences_train[:, :-1]
label_train = input_sequences_train[:, -1]

input_to_model_val = input_sequences_val[:, :-1]
label_val = input_sequences_val[:, -1]

input_to_model_test = input_sequences_test[:, :-1]
label_test = input_sequences_test[:, -1]


In [None]:
# Show the running example next to the input/label pair derived from it.
padded_example = input_sequences_train[sample]
example_input = input_to_model_train[sample]
example_label = label_train[sample]
print("The entry ",sample," in 'input_sequences' is: ")
print(padded_example)
print(", it corresponds to the following input to our model:")
print(example_input)
print(" and the following output: ", example_label)


Here is the architecture of the model we will use:

<img src="https://github.com/amaaradji/ChangePoint_Dectection/blob/main/imgs/text_generation.png?raw=1" style="width:600;height:400px;">




 
**Exercise**: Implement `create_model()`. You will need to carry out 5 steps:

1. Create a sequential model using the `Sequential` class
2. Add an embedding layer to the model using the `Embedding` class of size 64
3. Add a bidirectional LSTM layer to the model using the `Bidirectional` and `LSTM` classes of size 128, followed by a `Dropout` layer
4. Add a Dense layer to the model using the `Dense` class with a `softmax` activation
5. Set a `sparse_categorical_crossentropy` loss function on the model (the labels are integer activity indices) and track `accuracy` as the metric.


In [None]:
def create_model(dropout_value, embedding_dim=64, lstm_units=128, learning_rate=0.01):
    """Build and compile the next-activity classifier.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dropout -> Dense(softmax).
    The vocabulary size and the input width come from the notebook-level
    `total_words` and `max_sequence_len`.

    Args:
        dropout_value: rate used both for the LSTM's recurrent dropout and for
            the Dropout layer placed before the output layer.
        embedding_dim: size of the activity embedding vectors (default 64).
        lstm_units: hidden units per LSTM direction (default 128).
        learning_rate: learning rate handed to the Adam optimizer (default 0.01).

    Returns:
        A compiled Keras `Sequential` model with sparse categorical
        cross-entropy loss (labels are integer activity codes) and the
        'accuracy' metric.
    """
    model = Sequential()
    # Inputs are the padded prefixes without their last column (the label).
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len-1))
    # return_sequences=False: only the final state feeds the classifier.
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=False, recurrent_dropout=dropout_value)))
    model.add(Dropout(dropout_value))
    # One output probability per vocabulary code.
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model
    
# Build an instance with dropout 0.2 to print its layer summary and draw its
# graph. The training cell below rebinds `model` to a new instance.
model = create_model(0.2)
model.summary()
plot_model(model)


In [None]:
# Use a TPU when one is attached (e.g. a Colab TPU runtime); otherwise fall
# back to TensorFlow's default strategy so the notebook still runs on CPU/GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Stable API; tf.distribute.experimental.TPUStrategy is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # TPUClusterResolver raises ValueError when no TPU can be located.
    strategy = tf.distribute.get_strategy()

# Create the model inside the strategy scope so that its variables are placed
# according to the selected strategy.
with strategy.scope():
    model = create_model(0.5)


# Train on the training prefixes and monitor on the validation prefixes.
history = model.fit(input_to_model_train, label_train, 
                    validation_data=(input_to_model_val,label_val), 
                    epochs=10, 
                    batch_size=256, 
                    verbose=1)


In [None]:
# Per-epoch curves recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))

panels = [
    (ax1, loss, val_loss,
     'Training loss', 'Validation loss', 'Training and validation loss', 'Loss'),
    (ax2, acc, val_acc,
     'Training accuracy', 'Validation accuracy', 'Training and validation accuracy', 'Accuracy'),
]
for ax, train_curve, val_curve, train_label, val_label, title, ylabel in panels:
    ax.plot(epochs, train_curve, label=train_label)
    ax.plot(epochs, val_curve, label=val_label)
    # Shade the gap between the training and validation curves.
    ax.fill_between(epochs, train_curve, val_curve, color='g', alpha=.1)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.legend()
plt.show()

In [None]:
# Number of consecutive test examples evaluated together as one window.
step = 10000

# Cut the test split into consecutive windows of exactly `step` examples.
# A trailing remainder shorter than `step` is left out.
test_set = [
    (input_to_model_test[start:start + step], label_test[start:start + step])
    for start in range(0, len(input_to_model_test) - step + 1, step)
]

# Accuracy of the trained model on each window. evaluate() returns the loss
# followed by the compiled metrics, so index 1 is the accuracy.
accuracy_list = []
for x_test, y_test in tqdm(test_set):
    accuracy_list.append(model.evaluate(x_test, y_test, batch_size=256, verbose=0)[1])


In [None]:
# Trailing moving average of the per-window accuracies over N windows,
# computed from running prefix sums.
N = 10

# cumsum[k] is the sum of the first k accuracies (cumsum[0] == 0).
cumsum = [0]
for window_accuracy in accuracy_list:
    cumsum.append(cumsum[-1] + window_accuracy)

# One average per position that has N accuracies up to and including it.
moving_aves = [
    (cumsum[end] - cumsum[end - N]) / N
    for end in range(N, len(cumsum))
]

In [None]:
# Per-window test accuracy together with its trailing moving average.
window_ids = list(range(len(accuracy_list)))
# Each moving-average value summarises the windows up to and including the one
# it ends on, so the first value belongs to the first window that has a full
# history behind it. Drawing the curve from x=0 would shift it to the left of
# the accuracies it is computed from.
ma_start = len(accuracy_list) - len(moving_aves)

plt.plot(window_ids, accuracy_list, label='Accuracy')
plt.plot(window_ids[ma_start:], moving_aves, label='Accuracy MA')
plt.xlabel('Window')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
# The previous version printed entries of `input_sequences_unsorted`, a name
# that is never defined in this notebook, so the cell failed with NameError on
# a fresh kernel.
# NOTE(review): presumably the intent was to show the first and last event
# timestamps (to choose the split dates) - confirm. This version derives them
# directly from the log.
event_times = [event["time:timestamp"] for trace in log for event in trace]
if event_times:
    print(min(event_times))
    print(max(event_times))