# ProcessGAN - AI-Enabled Business Process Improvement
## 1 Packages

In [None]:
# Import necessary packages
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F

from random import sample
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from datetime import datetime, timedelta

import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer

## 2 Input and Data Preprocessing

In [None]:
# Location of the event log to load (read by the XES or CSV import cell below)
input_path = "placeholder"
# Destination of the CSV file written at the end of the notebook
output_path = "placeholder"

In [None]:
# Option 1: Import and convert XES data
# Reads the file at input_path with the pm4py XES importer ...
log = xes_importer.apply(input_path)
# ... and converts the imported log into a pandas DataFrame
log = log_converter.apply(log, variant = log_converter.Variants.TO_DATA_FRAME)

In [None]:
# Option 2: Import CSV data
# Assigns to the same `log` variable as Option 1, so only the most recently run import cell takes effect
log = pd.read_csv(input_path)

In [None]:
# Select and rename columns from dataset (to be adapted to each dataset)
# The four selected columns are renamed positionally, so the order of both lists must match
org_df = log[["concept:name","lifecycle:transition", "time:timestamp","case:concept:name"]]
org_df.columns = ["activity", "status", "timestamp", "id"]

In [None]:
# Function to convert columns to specified types
def convert_datatypes(df, datatypes, timestamp_col, activity_col, id_col):
    """Turn an event log of start/complete row pairs into one row per activity.

    The rows of ``df`` are expected to alternate: each even-positioned row is
    the start event of an activity and the following row is its completion.

    Args:
        df: Event log containing at least ``timestamp_col``, ``activity_col``
            and ``id_col``. It is not modified.
        datatypes: Mapping of column name to dtype, passed to ``DataFrame.astype``.
        timestamp_col: Name of the timestamp column (parsed as UTC datetimes).
        activity_col: Name of the activity column.
        id_col: Name of the trace (case) identifier column.

    Returns:
        A new DataFrame with a fresh RangeIndex and the columns
        ``activity_col``, ``id_col``, ``duration`` (seconds between start and
        completion) and ``time_since_start`` (seconds between the first event
        of the trace and the start of the activity).

    Raises:
        ValueError: If ``df`` has an odd number of rows, i.e. the rows cannot
            be paired into start/complete events.
    """
    if len(df) % 2 != 0:
        raise ValueError(
            "Expected an even number of rows (one start and one complete event per activity)"
        )

    # Work on a copy so the caller's frame (often a slice of the raw log) stays untouched
    df = df.copy()
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], utc=True)
    df = df.astype(datatypes)

    timestamps = df[timestamp_col]

    # Activity duration: completion timestamp (odd rows) minus start timestamp (even rows)
    start_times = timestamps.iloc[0::2].reset_index(drop=True)
    complete_times = timestamps.iloc[1::2].reset_index(drop=True)
    duration = (complete_times - start_times).dt.total_seconds()

    # First timestamp of each trace, broadcast back to every row of that trace.
    # transform() keeps the result aligned with the rows even if traces interleave.
    trace_start = df.groupby(id_col, sort=False, observed=True)[timestamp_col].transform("first")
    time_since_start = (timestamps - trace_start).dt.total_seconds()

    # Keep only the start row of every pair and attach the derived columns
    result = df.iloc[0::2][[activity_col, id_col]].reset_index(drop=True)
    result["duration"] = duration
    result["time_since_start"] = time_since_start.iloc[0::2].reset_index(drop=True)

    return result

In [None]:
# Convert the dataframe
# activity, status and id are cast to pandas categoricals; the timestamp column is parsed inside convert_datatypes
datatypes = {"activity" : "category", "status" : "category", "id" : "category"}
df = convert_datatypes(org_df, datatypes, "timestamp", "activity", "id")

In [None]:
# Identify the maximum trace length
# mode()[0] is the id that occurs most often, so its row count is the length of the longest trace
modeID = df.id.mode()[0]
max_trace_len = df.id[df.id == modeID].count()

In [None]:
# Describe the dataframe 
# Prints summary figures only; nothing is modified here
print("Length of dataset:", len(df), "rows")
print("Different unique activities:", len(df.activity.unique()))
print("Different unique IDs:", len(df.id.unique()))
# The mean duration is truncated to whole seconds for display
print("Average duration of a single activity:", int(df.duration.mean()), "seconds")
print("Maximum length of a trace:", max_trace_len)

In [None]:
# Function to create a new compressed dataset
def create_compressed_dataset(df):
    """Collapse an activity-level log into one row per trace.

    Args:
        df: DataFrame with the columns ``id``, ``activity`` and ``duration``.

    Returns:
        DataFrame with one row per trace id (in order of first appearance) and
        the columns ``id``, ``activity`` (list of the trace's activities in row
        order) and ``duration`` (sum of the trace's durations, each truncated
        to whole seconds before summing).
    """
    # Accumulate per trace in a single pass; dict insertion order keeps the
    # order in which the ids first appear in df.
    traces = {}
    for trace_id, activity, duration in zip(df["id"], df["activity"], df["duration"]):
        entry = traces.setdefault(trace_id, [[], 0])
        entry[0].append(activity)
        entry[1] += int(duration)

    combined_list = [
        [trace_id, activities, total_time]
        for trace_id, (activities, total_time) in traces.items()
    ]

    return pd.DataFrame(combined_list, columns=["id", "activity", "duration"])


# Function to identify p% (but at least one) standard variants from a compressed dataset
def identify_standard_variant_ids(df, p):
    """Return the ids of all traces that follow the most frequent variants.

    A variant is the stringified activity list of a trace. The share ``p`` of
    all distinct variants (at least one when that share rounds down to zero)
    is selected, most frequent first.

    Args:
        df: Activity-level log accepted by ``create_compressed_dataset``.
        p: Fraction of the distinct variants to treat as standard.

    Returns:
        List of trace ids belonging to the selected variants.
    """
    remaining = create_compressed_dataset(df)
    # Lists are not comparable as column values, so compare their string form
    remaining["activity"] = remaining["activity"].astype(str)

    n_variants = int(len(remaining.activity.unique()) * p) or 1

    ids_standard = []
    for _ in range(n_variants):
        top_variant = str(remaining["activity"].mode()[0])
        follows_top = remaining["activity"] == top_variant
        ids_standard.extend(remaining[follows_top].id.unique().tolist())
        # Remove every trace collected so far before looking for the next variant
        remaining = remaining[~remaining["id"].isin(ids_standard)]

    return ids_standard


# Function to add labels to standard variants with label zero as undesirable class
def add_class_labels(df, p):
    """Return a labelled copy of ``df``.

    Traces whose id is reported by ``identify_standard_variant_ids`` get the
    label 0 (undesirable / standard); all other rows get the label 1.

    Args:
        df: Activity-level log with an ``id`` column.
        p: Fraction of distinct variants treated as standard.

    Returns:
        A copy of ``df`` with an additional integer ``label`` column. The
        input frame is left unchanged and may carry any index.
    """
    ids_undesirable = identify_standard_variant_ids(df, p)

    labelled_df = df.copy()
    # isin() is vectorised and independent of the index labels
    labelled_df["label"] = (~labelled_df["id"].isin(ids_undesirable)).astype(int)

    return labelled_df


# Function to build subdatasets and restore original dataset
def build_subdatasets(df, p):
    """Split the log into undesirable and positively deviant traces.

    Args:
        df: Activity-level log with the columns ``id``, ``activity``,
            ``duration`` and ``time_since_start``.
        p: Fraction of distinct variants treated as standard (label 0).

    Returns:
        Tuple ``(df, undesirable, positively_deviant)``; each element is
        restricted to the four feature columns.
    """
    feature_columns = ["id", "activity", "duration", "time_since_start"]

    labelled = add_class_labels(df, p)

    by_label = {}
    for label in (0, 1):
        subset = labelled[labelled.label == label]
        by_label[label] = subset[feature_columns]

    return df[feature_columns], by_label[0], by_label[1]

In [None]:
# Build subdatasets
# 0.2 is the fraction p of distinct variants that is labelled as standard (undesirable)
df, undesirable, positively_deviant = build_subdatasets(df, 0.2)

In [None]:
# Function to normalize trace lengths
def normalize_trace_length(df, max_trace_len, max_time_since_start=None):
    """Pad every trace with ``"none"`` activities up to ``max_trace_len`` rows.

    Args:
        df: Activity-level log with the columns ``id``, ``activity``,
            ``duration`` and ``time_since_start``. It is not modified.
        max_trace_len: Number of rows every trace should have afterwards.
            Traces that are already at least this long are left as they are.
        max_time_since_start: Value written into ``time_since_start`` of the
            padding rows. Defaults to the largest ``time_since_start`` in ``df``.

    Returns:
        New DataFrame sorted by trace id (original row order within a trace,
        padding rows last) with a fresh RangeIndex.
    """
    if max_time_since_start is None:
        max_time_since_start = df["time_since_start"].max()

    # Count rows per trace once instead of re-filtering the frame for every id
    trace_lengths = df["id"].value_counts()

    padding_rows = []
    # unique() only yields ids that actually occur (unused categories are skipped)
    for trace_id in df["id"].unique():
        n_missing = max_trace_len - trace_lengths.loc[trace_id]
        padding_rows.extend(
            {
                "id": trace_id,
                "activity": "none",
                "duration": 0.0,
                "time_since_start": max_time_since_start,
            }
            for _ in range(n_missing)
        )

    if padding_rows:
        # One concat instead of a per-row DataFrame.append (removed in pandas 2.0)
        df = pd.concat([df, pd.DataFrame(padding_rows)], ignore_index=True)

    # Padding rows sit at the end and therefore carry the largest index values,
    # so sorting by (id, index) places them after the real rows of their trace.
    df = df.rename_axis("index").sort_values(by=["id", "index"])
    df = df.reset_index(drop=True)

    return df

In [None]:
# Function to fit one hot encoder to activity column
def fit_activity_encoder(df):
    """Fit a dense one-hot encoder on the ``activity`` column of ``df``.

    Args:
        df: DataFrame with an ``activity`` column.

    Returns:
        The fitted ``OneHotEncoder``; its transform returns dense arrays.
    """
    try:
        # scikit-learn >= 1.2 spells the option ``sparse_output``
        encoder_activity = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the ``sparse`` keyword
        encoder_activity = OneHotEncoder(sparse=False)
    encoder_activity.fit(df.activity.to_numpy().reshape(-1, 1))
    return encoder_activity


# Function to fit MinMaxScaler on numerical values in duration column to [0,1] 
def fit_time_scalers(df):
    """Fit one MinMaxScaler per time column of ``df``.

    Args:
        df: DataFrame with the columns ``duration`` and ``time_since_start``.

    Returns:
        Tuple ``(scaler_duration, scaler_time_since_start)`` of fitted scalers.
    """
    def _fit_column_scaler(column):
        # The scaler expects a 2-D array with a single feature column
        scaler = MinMaxScaler()
        scaler.fit(column.to_numpy().reshape(-1, 1))
        return scaler

    return _fit_column_scaler(df.duration), _fit_column_scaler(df.time_since_start)


# Function to encode variables on original dataset with encoders fitted to the original dataset
def encode_variables(df):
    """Encode ``df`` with encoders fitted on ``df`` itself.

    Args:
        df: DataFrame with the columns ``id``, ``activity``, ``duration`` and
            ``time_since_start``.

    Returns:
        2-D array whose columns are: the raw id, the one-hot encoded activity,
        the scaled duration and the scaled time since start (in that order).
    """
    encoder_activity = fit_activity_encoder(df)
    scaler_duration, scaler_time_since_start = fit_time_scalers(df)
    id_column = df.id.to_numpy().reshape(-1, 1)

    activity_transformed = encoder_activity.transform(df.activity.to_numpy().reshape(-1, 1))
    duration_transformed = scaler_duration.transform(df.duration.to_numpy().reshape(-1, 1))
    # The time-since-start scaler must be applied to its own column, not to duration
    time_since_start_transformed = scaler_time_since_start.transform(
        df.time_since_start.to_numpy().reshape(-1, 1)
    )
    transformed = np.hstack(
        (id_column, activity_transformed, duration_transformed, time_since_start_transformed)
    )

    return transformed


# Function to encode variables on subdatasets with encoders fitted to original dataset
def encode_variables_sub(subdf, df):
    """Encode ``subdf`` with encoders fitted on ``df``.

    Args:
        subdf: Subset to encode; needs the columns ``id``, ``activity``,
            ``duration`` and ``time_since_start``.
        df: Reference dataset on which the encoder and scalers are fitted.

    Returns:
        2-D array whose columns are: the raw id, the one-hot encoded activity,
        the scaled duration and the scaled time since start (in that order).
    """
    encoder_activity = fit_activity_encoder(df)
    scaler_duration, scaler_time_since_start = fit_time_scalers(df)
    id_column = subdf.id.to_numpy().reshape(-1, 1)

    activity_transformed = encoder_activity.transform(subdf.activity.to_numpy().reshape(-1, 1))
    duration_transformed = scaler_duration.transform(subdf.duration.to_numpy().reshape(-1, 1))
    # The time-since-start scaler must be applied to its own column, not to duration
    time_since_start_transformed = scaler_time_since_start.transform(
        subdf.time_since_start.to_numpy().reshape(-1, 1)
    )
    transformed = np.hstack(
        (id_column, activity_transformed, duration_transformed, time_since_start_transformed)
    )

    return transformed


# Function to split data into sequences according to a given trace length
def split_data_into_traces(df, trace_len):
    """Cut an encoded 2-D array into consecutive traces of equal length.

    Args:
        df: 2-D array whose first column holds the trace id (dropped here) and
            whose remaining columns hold the encoded features.
        trace_len: Number of consecutive rows that form one trace.

    Returns:
        New array of shape ``(n_traces, trace_len, n_features)`` where
        ``n_traces = rows // trace_len``; surplus rows that do not fill a
        complete trace are ignored.
    """
    # Only complete traces are kept; floor division avoids a short or empty last slice
    n_traces = df.shape[0] // trace_len
    n_features = df.shape[1] - 1

    features = np.array(df[: n_traces * trace_len, 1:])

    return features.reshape(n_traces, trace_len, n_features)

In [None]:
# Normalizing trace lengths in all datasets
norm_df = normalize_trace_length(df, max_trace_len)
norm_undesirable = normalize_trace_length(undesirable, max_trace_len)
norm_positively_deviant = normalize_trace_length(positively_deviant, max_trace_len)

# Encoding the data
# The subdatasets are encoded with encoders fitted on norm_df so all three arrays share one encoding
data_transformed = encode_variables(norm_df)
undesirable_data_transformed = encode_variables_sub(norm_undesirable, norm_df)
positively_deviant_data_transformed = encode_variables_sub(norm_positively_deviant, norm_df)

# Splitting the data into traces   
# Each result is indexed as (trace, position in trace, feature); the id column is dropped by the split
sequences = split_data_into_traces(data_transformed, max_trace_len)
undesirable_sequences = split_data_into_traces(undesirable_data_transformed, max_trace_len)
positively_deviant_sequences = split_data_into_traces(positively_deviant_data_transformed, max_trace_len)

## 3 Automated process improvement

In [None]:
# Function to get random noise in the correct data format
def get_noise(n_samples, data, device):
    """Draw standard-normal noise shaped like ``n_samples`` sequences of ``data``.

    Args:
        n_samples: Size of the first dimension of the noise tensor.
        data: Array-like whose ``shape[1]`` and ``shape[2]`` give the remaining
            two dimensions.
        device: Device on which the tensor is created.

    Returns:
        Tensor of shape ``(n_samples, data.shape[1], data.shape[2])``.
    """
    noise_shape = (n_samples, data.shape[1], data.shape[2])
    return torch.randn(*noise_shape, device=device)

In [None]:
# Generator class
class Generator(nn.Module):
    """LSTM-based generator that maps a noise batch to a batch of synthetic sequences.

    The layer sizes depend on ``batch_size``, so ``forward`` only accepts
    inputs with exactly ``batch_size`` sequences of the length and feature
    count of ``data``. The LSTM state of the previous call is reused
    (detached) as the initial state of the next call.
    """

    def __init__(self, data, batch_size, hidden_size, num_layers, num_directions):
        """Build the network.

        Args:
            data: Array-like; ``shape[1]`` is the sequence length and
                ``shape[2]`` the number of features per step.
            batch_size: Number of sequences per forward call.
            hidden_size: Hidden size of the LSTM.
            num_layers: Number of stacked LSTM layers.
            num_directions: Only used to size the initial LSTM state; the LSTM
                itself is created unidirectional, so this should be 1.
        """
        super().__init__()

        seq_len = data.shape[1]
        input_size = data.shape[2]

        self.input_size = input_size
        # Initial LSTM state; replaced by the detached final state after every forward pass
        self.h = torch.randn(num_layers * num_directions, batch_size, hidden_size)
        self.c = torch.randn(num_layers * num_directions, batch_size, hidden_size)
        latent_vector_size = 50 * batch_size

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=0.25, batch_first=True, bidirectional=False)

        self.linear1 = nn.Linear(batch_size * seq_len * hidden_size, latent_vector_size)
        self.linearHC = nn.Linear(num_layers * hidden_size * batch_size, latent_vector_size)
        self.linearHCO = nn.Linear(3 * latent_vector_size, batch_size * seq_len * input_size)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax()

    def forward(self, x):
        """Generate sequences from ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, input_size)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # self.h / self.c are plain tensors and are not moved by Module.to(), so follow the input's device
        h0 = self.h.to(x.device)
        c0 = self.c.to(x.device)

        output, (h, c) = self.lstm(x, (h0, c0))
        # Carry the state over to the next call without keeping its autograd history
        self.h = h.detach()
        self.c = c.detach()

        # The whole batch is flattened into one vector and processed jointly
        u = self.relu(self.linear1(output.reshape(-1)))

        uH = F.leaky_relu(self.linearHC(h.reshape(-1)))
        uC = F.leaky_relu(self.linearHC(c.reshape(-1)))
        uHCO = self.linearHCO(torch.cat((uH, uC, u)))

        return uHCO.view((output.size()[0], output.size()[1], self.input_size))

In [None]:
# Discriminator class
class Discriminator(nn.Module):
    """LSTM-based discriminator with two output heads.

    The first head scores real vs. fake, the second head scores standard vs.
    deviant. The layer sizes depend on ``batch_size``, so ``forward`` only
    accepts inputs with exactly ``batch_size`` sequences. The LSTM state of
    the previous call is reused (detached) as the initial state of the next call.
    """

    def __init__(self, data, batch_size, hidden_size, num_layers, num_directions):
        """Build the network.

        Args:
            data: Array-like; ``shape[1]`` is the sequence length and
                ``shape[2]`` the number of features per step.
            batch_size: Number of sequences per forward call.
            hidden_size: Hidden size of the LSTM.
            num_layers: Number of stacked LSTM layers.
            num_directions: Only used to size the initial LSTM state; the LSTM
                itself is created unidirectional, so this should be 1.
        """
        super().__init__()

        self.batch_size = batch_size
        seq_len = data.shape[1]
        input_size = data.shape[2]

        # Initial LSTM state; replaced by the detached final state after every forward pass
        self.h = torch.randn(num_layers * num_directions, batch_size, hidden_size)
        self.c = torch.randn(num_layers * num_directions, batch_size, hidden_size)
        latent_vector_size = 50 * batch_size

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout = 0.25, batch_first = True, bidirectional = False)

        self.linear1 = nn.Linear(batch_size * seq_len * hidden_size, latent_vector_size)
        self.linearHC = nn.Linear(num_layers * hidden_size * batch_size, latent_vector_size)
        self.linearHCO = nn.Linear(3 * latent_vector_size, batch_size * seq_len * input_size)

        # Head 1: real vs. fake
        self.linear2 = nn.Linear(batch_size * seq_len * input_size, 100)
        self.linear3 = nn.Linear(100, 50)
        self.linear4 = nn.Linear(50, batch_size)

        # Head 2: standard vs. deviant
        self.linear5 = nn.Linear(batch_size * seq_len * input_size, 100)
        self.linear6 = nn.Linear(100, 50)
        self.linear7 = nn.Linear(50, batch_size)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax()

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, input_size)``.

        Returns:
            Tuple ``(standard_output, class_output)`` of raw (pre-sigmoid)
            scores, each of shape ``(batch_size, 1)``: the real-vs-fake score
            and the standard-vs-deviant score.
        """
        # self.h / self.c are plain tensors and are not moved by Module.to(), so follow the input's device
        h0 = self.h.to(x.device)
        c0 = self.c.to(x.device)

        output, (h, c) = self.lstm(x, (h0, c0))
        # Carry the state over to the next call without keeping its autograd history
        self.h = h.detach()
        self.c = c.detach()

        # The whole batch is flattened into one vector and processed jointly
        u = self.relu(self.linear1(output.reshape(-1)))

        uH = F.leaky_relu(self.linearHC(h.reshape(-1)))
        uC = F.leaky_relu(self.linearHC(c.reshape(-1)))
        uHCO = self.linearHCO(torch.cat((uH, uC, u)))

        # Classification to determine real vs. fake
        u = F.relu(self.linear2(uHCO))
        u = F.relu(self.linear3(u))
        u = self.linear4(u)
        standard_output = u.reshape((self.batch_size, -1))

        # Additional classification to determine standard/deviant
        cl = F.relu(self.linear5(uHCO))
        cl = F.relu(self.linear6(cl))
        cl = self.linear7(cl)
        # Must be built from the second head, not from the real-vs-fake scores
        class_output = cl.reshape((self.batch_size, -1))

        return standard_output, class_output

In [None]:
# Helper to build the per-epoch index pool of a class-specific dataset
def _build_epoch_indices(n_available, target_size):
    """Return ``target_size`` indices into a dataset with ``n_available`` items.

    The full index range is repeated as often as it fits; the remainder is
    filled with a random sample (without replacement) of the index range.

    Raises:
        ValueError: If indices are requested from an empty dataset.
    """
    base_indices = list(range(n_available))
    if n_available == target_size:
        return base_indices
    if n_available == 0:
        raise ValueError("Cannot draw batches from an empty dataset")

    full_repeats, remainder = divmod(target_size, n_available)
    return base_indices * full_repeats + sample(base_indices, remainder)


# Helper to draw one batch and remove its indices from the pool
def _draw_batch(array, index_pool, batch_size, device):
    """Sample ``batch_size`` indices from ``index_pool`` (removing them) and
    return the matching rows of ``array`` as a float32 tensor on ``device``."""
    batch_indices = sample(index_pool, batch_size)
    for i in batch_indices:
        index_pool.remove(i)
    batch = array[batch_indices].astype("float64")
    return torch.from_numpy(batch).to(device).float()


# Function to train generator and discriminator
def train(disc, gen, disc_optimizer, gen_optimizer, data, standard_data, deviant_data, epochs, batch_size, device):
    """Train the generator and the two-headed discriminator.

    Per batch, the discriminator is updated on (a) real vs. generated
    sequences and (b) standard vs. deviant sequences; afterwards the generator
    is updated with a weighted sum of a similarity loss and an innovation loss.

    The loss function ``criterion`` and the logging interval ``display_step``
    are read from module scope.

    Args:
        disc: Discriminator returning ``(real_fake_scores, class_scores)``.
        gen: Generator mapping noise to synthetic sequences.
        disc_optimizer: Optimizer of the discriminator parameters.
        gen_optimizer: Optimizer of the generator parameters.
        data: Array of all sequences, indexed as (trace, step, feature).
        standard_data: Array of the standard (undesirable) sequences; the
            class head is trained towards 0 on these.
        deviant_data: Array of the positively deviant sequences; the class
            head is trained towards 1 on these.
        epochs: Number of passes over ``data``.
        batch_size: Number of sequences per batch.
        device: Device on which batches and noise are placed.

    Raises:
        ValueError: If ``standard_data`` or ``deviant_data`` is empty while
            batches have to be drawn from it.
    """
    batches_per_epoch = int(data.shape[0] / batch_size)
    target_nr_indices = batch_size * batches_per_epoch

    disc = disc.float()

    for current_step in range(epochs):
        indices_list = list(range(0, data.shape[0]))
        # Both class datasets are resampled to the same number of indices as the full dataset provides
        undesirable_indices_list = _build_epoch_indices(standard_data.shape[0], target_nr_indices)
        positively_deviant_indices_list = _build_epoch_indices(deviant_data.shape[0], target_nr_indices)

        total_gen_loss_of_epoch = []
        total_disc_loss_of_epoch = []

        for batch in range(batches_per_epoch):

            # Update Discriminator
            disc_optimizer.zero_grad()

            ## Loss for real vs. fake classification
            ### Produce synthetic sequences
            z = get_noise(batch_size, data, device)
            fake = gen(z)

            ### Get real data batch
            real = _draw_batch(data, indices_list, batch_size, device)

            ### Get prediction results and calculate classification loss
            disc_fake_pred, non_necessary_output = disc(fake.detach())
            disc_fake_loss = criterion(torch.sigmoid(disc_fake_pred), torch.zeros_like(disc_fake_pred))

            disc_real_pred, non_necessary_output = disc(real)
            disc_real_loss = criterion(torch.sigmoid(disc_real_pred), torch.ones_like(disc_real_pred))

            real_fake_disc_loss = (disc_fake_loss + disc_real_loss)/2

            ### Update gradients
            real_fake_disc_loss.backward(retain_graph = True)

            ## Loss for undesirable vs. positively deviant classification
            ### Get real undesirable batch
            undesirable = _draw_batch(standard_data, undesirable_indices_list, batch_size, device)

            ### Get real positively deviant batch
            positively_deviant = _draw_batch(deviant_data, positively_deviant_indices_list, batch_size, device)

            ### Get prediction results and calculate classification loss
            non_necessary_output, disc_undesirable_pred = disc(undesirable)
            disc_undesirable_loss = criterion(torch.sigmoid(disc_undesirable_pred), torch.zeros_like(disc_undesirable_pred))

            non_necessary_output, disc_positively_deviant_pred = disc(positively_deviant)
            disc_positively_deviant_loss = criterion(torch.sigmoid(disc_positively_deviant_pred), torch.ones_like(disc_positively_deviant_pred))

            undesirable_positively_deviant_loss = (disc_undesirable_loss + disc_positively_deviant_loss)/2

            ### Update gradients
            undesirable_positively_deviant_loss.backward(retain_graph = True)

            ## Update optimizer
            disc_optimizer.step()

            ## Calculate total discriminator loss and keep track of average loss
            disc_loss = (real_fake_disc_loss + undesirable_positively_deviant_loss)/2
            # Store plain floats so the autograd graphs of past batches can be freed
            total_disc_loss_of_epoch.append(disc_loss.item())

            # Update Generator
            gen_optimizer.zero_grad()

            ## Get prediction results
            disc_fake_pred, disc_class_pred = disc(fake)

            ## Loss for similarity (real vs. fake classification)
            sim_gen_loss = criterion(torch.sigmoid(disc_fake_pred), torch.ones_like(disc_fake_pred))

            ## Loss for innovation (undesirable vs. positively deviant)
            undesirable_gen_loss = criterion(torch.sigmoid(disc_class_pred), torch.zeros_like(disc_class_pred))
            # Keep the value inside [0, 1] so that 1 - loss is a valid innovation loss
            if undesirable_gen_loss < 0:
                undesirable_gen_loss = 0
                print(f"Epoch {current_step}: inno_loss had to be adjusted to 1") 
            elif undesirable_gen_loss >1:
                undesirable_gen_loss = 1
                print(f"Epoch {current_step}: inno_loss had to be adjusted to 0")

            inno_gen_loss = 1-undesirable_gen_loss

            ## Calculate total generator loss and keep track of average loss
            gen_loss = (3*sim_gen_loss + inno_gen_loss)/4
            total_gen_loss_of_epoch.append(gen_loss.item())

            ## Update gradients and optimizer
            gen_loss.backward()
            gen_optimizer.step()

        # Visualization (skipped when the epoch had no batches, to avoid dividing by zero)
        if current_step % display_step == 0 and current_step > 0 and total_gen_loss_of_epoch:
            mean_gen_loss = sum(total_gen_loss_of_epoch)/len(total_gen_loss_of_epoch)
            mean_disc_loss = sum(total_disc_loss_of_epoch)/len(total_disc_loss_of_epoch)
            print(f"Epoch {current_step}: Generator loss: {mean_gen_loss}, Discriminator loss: {mean_disc_loss}")       

In [None]:
# Define the hyperparameters
beta1 = 0.5
beta2 = 0.999
betas = ((beta1, beta2))
lr = 0.0002 
device = "cpu"
criterion = nn.BCELoss() 
batch_size = 10                                    
epochs = 50 
display_step = 5

# Define the input data
data = sequences             
# Standard variants are the undesirable class (label 0 in add_class_labels); the rest are positively deviant
standard_data = undesirable_sequences
deviant_data = positively_deviant_sequences
seq_len = data.shape[1]
input_size = data.shape[2]

In [None]:
# Initialize generator, discriminator and optimizers
# Both networks use an LSTM hidden size of twice the number of features per step
gen = Generator(data = data, 
                batch_size = batch_size, 
                hidden_size = 2 * data.shape[2], 
                num_layers = 2, 
                num_directions = 1).to(device)

disc = Discriminator(data = data, 
                     batch_size = batch_size, 
                     hidden_size = 2 * data.shape[2], 
                     num_layers = 2, 
                     num_directions = 1).to(device)
 

gen_optimizer = torch.optim.Adam(gen.parameters(), lr=lr, betas = betas)
disc_optimizer = torch.optim.Adam(disc.parameters(), lr=lr, betas = betas)

In [None]:
# Train
# Runs the training loop with the models, optimizers, datasets and hyperparameters defined in the cells above
train(disc = disc, 
      gen = gen, 
      disc_optimizer = disc_optimizer, 
      gen_optimizer = gen_optimizer, 
      data = data, 
      standard_data = standard_data, 
      deviant_data = deviant_data, 
      epochs = epochs, 
      batch_size = batch_size, 
      device = device)

## 4 Output and data postprocessing

In [None]:
# Function to create a new synthetic batch
def create_synthetic_batch(batch_size, data, gen, device):
    """Generate one batch of synthetic sequences as a NumPy array.

    Args:
        batch_size: Number of sequences to generate.
        data: Array-like passed to ``get_noise`` to determine the noise shape.
        gen: Generator called with the noise batch.
        device: Device on which the noise is created.

    Returns:
        NumPy array holding the generator output.
    """
    z = get_noise(batch_size, data, device)
    synthetic_data = gen(z)
    # Tensors on an accelerator must be copied to host memory before conversion
    synthetic_data = synthetic_data.detach().cpu().numpy()

    return synthetic_data
    
    
# Function to decode results
def decode_results(synthetic_data, df):
    """Translate generated feature vectors back into activities and times.

    The encoder and scalers are refitted on ``df``. Each feature vector is
    read as: one-hot activity block (as many entries as ``df`` has distinct
    activities), then the scaled duration, then the scaled time since start
    as the last entry.

    Args:
        synthetic_data: Array indexed as (trace, step, feature).
        df: DataFrame with the columns ``activity``, ``duration`` and
            ``time_since_start`` used to fit the encoder and scalers.

    Returns:
        DataFrame with one row per generated step and the columns
        ``activity``, ``duration`` and ``time_since_start``.
    """
    encoder_activity = fit_activity_encoder(df)
    scaler_duration, scaler_time_since_start = fit_time_scalers(df)
    n_activities = len(df.activity.unique())

    rows = []
    for trace in synthetic_data:
        for step in trace:
            # Every part is reshaped to a single-row 2-D array as expected by inverse_transform
            one_hot = step[0:n_activities].reshape(1, -1)
            scaled_duration = step[n_activities:-1].reshape(1, -1)
            scaled_since_start = step[-1].reshape(1, -1)

            rows.append([
                encoder_activity.inverse_transform(one_hot)[0][0],
                scaler_duration.inverse_transform(scaled_duration)[0][0],
                scaler_time_since_start.inverse_transform(scaled_since_start)[0][0],
            ])

    return pd.DataFrame(rows, columns=["activity", "duration", "time_since_start"])


# Function to correct negative durations
def correct_negative_durations(decoded_data):
    """Replace negative generated times by their absolute value.

    Args:
        decoded_data: DataFrame with the numeric columns ``duration`` and
            ``time_since_start``. It is modified in place.

    Returns:
        The same DataFrame object with both columns made non-negative.
    """
    # Operate on the generated values themselves, not on any other dataset
    for column in ("duration", "time_since_start"):
        decoded_data[column] = decoded_data[column].astype(float).abs()

    return decoded_data
    

# Function to add new IDs
def add_IDs(decoded_data, trace_len):
    """Number the generated traces consecutively, starting at 0.

    Args:
        decoded_data: DataFrame of generated steps; modified in place.
        trace_len: Number of consecutive rows that belong to one trace.

    Returns:
        The same DataFrame object with an additional ``ID`` column.
    """
    n_traces = int(len(decoded_data)/trace_len)
    # Every trace number is repeated once per row of the trace
    decoded_data["ID"] = [
        trace_number
        for trace_number in range(n_traces)
        for _ in range(trace_len)
    ]

    return decoded_data
    

# Function to add new timestamps (for processes without idle time, GAN-generated and sequential timestamps)
def add_timestamps(data_with_id, trace_len, start_time):
    """Attach two kinds of start/end timestamps to every generated step.

    * ``calculated_*``: start is ``start_time`` plus the step's
      ``time_since_start``; end is that start plus the step's ``duration``.
    * ``sequential_*``: the first step of a trace starts at ``start_time`` and
      every following step starts when the previous one ends.

    Args:
        data_with_id: DataFrame with the columns ``duration`` and
            ``time_since_start`` (in seconds), addressed by the row labels
            0..len-1; modified in place.
        trace_len: Number of consecutive rows that belong to one trace.
        start_time: Datetime at which every trace begins.

    Returns:
        The same DataFrame object with four additional timestamp columns.
    """
    n_traces = int(len(data_with_id)/trace_len)

    calc_starts, calc_ends = [], []
    seq_starts, seq_ends = [], []

    for trace_number in range(n_traces):
        first_row = trace_number * trace_len
        # Running clock of the sequential schedule; restarts for every trace
        clock = start_time

        for offset in range(trace_len):
            row = first_row + offset

            seq_starts.append(clock)
            clock = clock + timedelta(seconds = float(data_with_id.duration[row]))
            seq_ends.append(clock)

            calc_start = start_time + timedelta(seconds = float(data_with_id.time_since_start[row]))
            calc_starts.append(calc_start)
            calc_ends.append(calc_start + timedelta(seconds = float(data_with_id.duration[row])))

    data_with_id["calculated_start_time"] = calc_starts
    data_with_id["calculated_end_time"] = calc_ends

    data_with_id["sequential_start_time"] = seq_starts
    data_with_id["sequential_end_time"] = seq_ends

    return data_with_id
    
    
# Function to generate and decode new synthetic data based on original dataset and transformed data
def generate_new_data(df, trace_len, batch_size, data, gen, device, start_time):
    """Run the full post-processing chain for one generated batch.

    Steps: generate a synthetic batch, decode it with encoders fitted on
    ``df``, correct negative times, add trace IDs and add timestamps.

    Args:
        df: DataFrame on which the decoders are fitted.
        trace_len: Number of consecutive rows that belong to one trace.
        batch_size: Number of traces to generate.
        data: Encoded training data; determines the noise shape.
        gen: Trained generator.
        device: Device on which the noise is created.
        start_time: Datetime at which every generated trace begins.

    Returns:
        DataFrame of decoded steps including IDs and timestamps.
    """
    generated = create_synthetic_batch(batch_size, data, gen, device)
    decoded = decode_results(generated, df)
    non_negative = correct_negative_durations(decoded)
    identified = add_IDs(non_negative, trace_len)

    return add_timestamps(identified, trace_len, start_time)

In [None]:
# Generate and decode new data
trace_len = max_trace_len 
# Placeholder: datetime() needs integer components, e.g. datetime(2021, 1, 1), before this cell can run
start_time = datetime("Placeholder")

# Decode with norm_df: decode_results refits its encoders on the frame passed here, and the training
# data was encoded with encoders fitted on norm_df (which includes the "none" padding activity)
results = generate_new_data(norm_df, 
                            trace_len = trace_len, 
                            batch_size = batch_size, 
                            data=data, gen=gen, 
                            device=device, 
                            start_time = start_time)

In [None]:
# Export results as csv file
# Writes the generated traces to output_path
results.to_csv(output_path)