# CCC 19 Model to process tree

In [None]:
from pm4py.objects.petri_net.importer import importer as pnml_importer
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.objects.conversion.wf_net import converter as wf_net_converter
from pm4py.objects.process_tree.exporter import exporter as ptml_exporter

# Load the CCC19 Petri net (net plus the two markings the importer returns).
net, im, fm = pnml_importer.apply("ccc19_model.pnml")

# Render the imported net for visual inspection.
gviz = pn_visualizer.apply(net, im, fm)
pn_visualizer.view(gviz)

# Convert the net into a process tree and print its textual form.
tree = wf_net_converter.apply(net, im, fm)
print(tree)

# Persist the resulting process tree as PTML.
ptml_exporter.apply(tree, "ccc19_process_tree.ptml")

# Generate parallel event logs

In [None]:
from pm4py.algo.simulation.tree_generator import simulator as tree_gen


def create_trees_with_parallel_probabilities(
    relative_parallel_probabilities, n_trees_per_probability=10
):
    """Generate random process trees for several parallel-operator probabilities.

    For every probability in ``relative_parallel_probabilities`` the tree
    generator is called with that value as the "parallel" weight; the
    remaining probability mass is split evenly across "choice", "sequence"
    and "loop".

    Args:
        relative_parallel_probabilities: Iterable of probabilities to use for
            the parallel operator.
        n_trees_per_probability: Value passed to the generator as "no_models".

    Returns:
        Dict mapping each probability to whatever ``tree_gen.apply`` returned
        for it.
    """
    trees = {}

    # Iterate the argument itself, not a notebook-level global.
    for parallel_probability in relative_parallel_probabilities:
        # Remaining mass shared equally by the three other operators.
        other_probability = (1 - parallel_probability) / 3
        parameters = {
            "mode": 10,
            "min": 10,
            "max": 10,
            "choice": other_probability,
            "sequence": other_probability,
            "parallel": parallel_probability,
            "loop": other_probability,
            "no_models": n_trees_per_probability,
        }
        trees[parallel_probability] = tree_gen.apply(parameters=parameters)

    return trees

In [None]:
from pm4py.objects.log.obj import Trace, Event, EventLog
import random


def get_distinct_activities_from_log(log):
    """Return the set of "concept:name" values over all events of all traces."""
    return {event["concept:name"] for trace in log for event in trace}


def add_deviations_to_log(log, trace_probability, event_probability, action_dist):
    """Build a new event log in which randomly chosen events are perturbed.

    A trace is included in the result only when its random draw does not
    exceed ``trace_probability``. Inside an included trace, each event whose
    draw does not exceed ``event_probability`` is perturbed with a new event
    whose activity is picked at random from the activities present in ``log``:

    * with probability ``action_dist["before"]`` the new event is inserted
      before the original event,
    * with probability ``action_dist["after"]`` it is inserted after it,
    * otherwise the original event is replaced by the new event.

    NOTE(review): traces that are not drawn are left out of the returned log
    instead of being copied unchanged -- confirm this is intended.

    Args:
        log: Iterable of traces, each an iterable of events.
        trace_probability: Threshold for including (and perturbing) a trace.
        event_probability: Threshold for perturbing a single event.
        action_dist: Mapping with "before" and "after" probabilities; the
            values must sum to at most 1.

    Returns:
        A new ``EventLog`` holding the perturbed traces.
    """
    assert sum(action_dist.values()) <= 1

    activity_pool = list(get_distinct_activities_from_log(log))
    deviating_log = EventLog()

    for original_trace in log:
        if random.random() > trace_probability:
            continue

        noisy_trace = Trace()

        for original_event in original_trace:
            # Events that are not drawn are carried over untouched.
            if random.random() > event_probability:
                noisy_trace.append(original_event)
                continue

            # Decide the kind of deviation before drawing the random activity.
            draw = random.random()
            if draw <= action_dist["before"]:
                action = "before"
            elif draw <= action_dist["before"] + action_dist["after"]:
                action = "after"
            else:
                action = "replace"

            random_event = Event()
            random_event["concept:name"] = random.choices(activity_pool)[0]

            if action == "before":
                sequence = (random_event, original_event)
            elif action == "after":
                sequence = (original_event, random_event)
            else:
                sequence = (random_event,)

            for item in sequence:
                noisy_trace.append(item)

        deviating_log.append(noisy_trace)

    return deviating_log

In [None]:
from pm4py.objects.process_tree.utils.generic import parse
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.objects.process_tree import semantics
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.process_tree.exporter import exporter as ptml_exporter


def generate_log_for_pt(tree, log_name, no_traces=1000, plot_tree=True):
    """Simulate a log from ``tree``, add deviations, and export log and tree.

    Args:
        tree: Process tree to simulate and export.
        log_name: Output path without extension; ".xes" and ".ptml" are
            appended for the log and the tree respectively.
        no_traces: Number of traces requested from the simulation.
        plot_tree: When true, the tree is rendered as PNG and shown first.
    """
    if plot_tree:
        png_format = {
            pt_visualizer.Variants.WO_DECORATION.value.Parameters.FORMAT: "png"
        }
        pt_visualizer.view(pt_visualizer.apply(tree, parameters=png_format))

    # Fixed deviation settings: trace threshold 0.3, event threshold 0.2.
    deviation_mix = {"before": 0.25, "after": 0.25}
    simulated_log = semantics.generate_log(tree, no_traces=no_traces)
    noisy_log = add_deviations_to_log(simulated_log, 0.3, 0.2, deviation_mix)

    xes_exporter.apply(noisy_log, log_name + ".xes")
    ptml_exporter.apply(tree, log_name + ".ptml")

In [None]:
# Parallel-operator probabilities 0.0, 0.1, ..., 1.0 to generate trees for.
relative_parallel_nodes = [i / 10 for i in range(11)]
trees_dict = create_trees_with_parallel_probabilities(relative_parallel_nodes)

# Simulate, perturb and export one log (plus its tree) per generated tree.
# NOTE(review): the "logs_parallel" directory is not created here --
# presumably it has to exist beforehand; confirm.
for prop, trees in trees_dict.items():
    for i, tree in enumerate(trees):
        generate_log_for_pt(tree, f"logs_parallel/log_parallel_{prop}_{i}")

# Results

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

# Global plot styling for every figure in this notebook.
sns.set_theme(style="whitegrid", font_scale=0.8)

# Algorithm identifiers compared against the "Algorithm" column of the
# result frames elsewhere in this notebook.
ALGORITHM_BASELINE_DIJKSTRA = "BASELINE_DIJKSTRA"
ALGORITHM_BASELINE_A_STAR = "BASELINE_A_STAR"
ALGORITHM_TP_DIJKSTRA_NAIVE = "TP_DIJKSTRA_NAIVE"
ALGORITHM_TP_DIJKSTRA_NOT_NAIVE = "TP_DIJKSTRA_NOT_NAIVE"
ALGORITHM_TP_DIJKSTRA_NOT_NAIVE_ENFORCE_FIRST_TAU = (
    "TP_DIJKSTRA_NOT_NAIVE_ENFORCE_FIRST_TAU_MOVE"
)
ALGORITHM_TP_A_STAR_NAIVE = "TP_A_STAR_NAIVE"
ALGORITHM_TP_A_STAR_NOT_NAIVE = "TP_A_STAR_NOT_NAIVE"
ALGORITHM_TP_A_STAR_NOT_NAIVE_ENFORCE_FIRST_TAU = (
    "TP_A_STAR_NOT_NAIVE_ENFORCE_FIRST_TAU_MOVE"
)

# All algorithms; the order is relied upon when columns are suffixed by index.
ALGORITHMS = [
    ALGORITHM_BASELINE_DIJKSTRA,
    ALGORITHM_BASELINE_A_STAR,
    ALGORITHM_TP_DIJKSTRA_NAIVE,
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE,
    ALGORITHM_TP_A_STAR_NAIVE,
    ALGORITHM_TP_A_STAR_NOT_NAIVE,
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE_ENFORCE_FIRST_TAU,
    ALGORITHM_TP_A_STAR_NOT_NAIVE_ENFORCE_FIRST_TAU,
]
# Subsets selected by the search strategy named in the identifier.
ALGORITHMS_A_STAR = [alg for alg in ALGORITHMS if "A_STAR" in alg]
ALGORITHMS_DIJKSTRA = [alg for alg in ALGORITHMS if "DIJKSTRA" in alg]

In [None]:
import os

# Directory that holds the result CSVs, models and generated plots.
RESULTS = "./results_09"
# RESULTS = ''
# Other dataset names, kept commented out for switching the analysed dataset.
# FILENAME = 'BPI_CH_2020_PrepaidTravelCost'
# FILENAME = 'BPI_Challenge_2012'
# FILENAME = 'BPI_Challenge_2019'
# FILENAME = 'ccc19'
# FILENAME = 'hospital_billing'
# FILENAME = 'receipt'
# FILENAME = 'RoadTrafficFineManagement'
FILENAME = "sepsis_cases"

from pathlib import Path

# Make sure the per-dataset plot directory exists before anything is saved.
Path(os.path.join(RESULTS, "plots", FILENAME)).mkdir(parents=True, exist_ok=True)

# Load the infix results for the selected dataset and display them.
results_df = pd.read_csv(os.path.join(RESULTS, FILENAME + "_infix_results.csv"))
results_df

In [None]:
from pm4py.objects.process_tree.importer import importer as ptml_importer
from pm4py.visualization.process_tree import visualizer as pt_visualizer

# Load the process tree stored next to the results of the selected dataset.
tree = ptml_importer.apply(os.path.join(RESULTS, FILENAME + ".ptml"))

# Render the tree as PNG and show it.
gviz = pt_visualizer.apply(
    tree,
    parameters={pt_visualizer.Variants.WO_DECORATION.value.Parameters.FORMAT: "png"},
)
pt_visualizer.view(gviz)

In [None]:
# Remove the longest 2% of infixes (rounded up to a whole number of infixes).

# Maximum recorded length per distinct infix.
long_df = results_df[["Infix", "Infix Length"]].groupby(by=["Infix"]).max()
# Round only once, after taking 2%; rounding 1% up and doubling overshoots.
n_infixes_to_remove = math.ceil(2 * len(long_df) / 100)
# nlargest keeps the first occurrences on ties and tolerates a count larger
# than the number of infixes, unlike repeated idxmax/drop.
infixes_to_remove = set(
    long_df["Infix Length"].nlargest(n_infixes_to_remove).index
)
# Keep long_df reduced to the remaining infixes, as the loop version did.
long_df = long_df.drop(index=list(infixes_to_remove))

# Explicit copy so that columns can be added to results_df afterwards without
# pandas treating it as a slice of the unfiltered frame.
results_df = results_df[~results_df["Infix"].isin(infixes_to_remove)].copy()

In [None]:
import re

# Number of distinct single-quoted tokens found in the "Infix" string.
# NOTE(review): presumably the infix is a stringified list of activity names -- confirm.
results_df["Distinct Activities"] = results_df.apply(
    lambda row: len(set(re.findall("'(.*?)'", row["Infix"]))), axis=1
)
# Product of the recorded state space size and (infix length + 1).
results_df["SPN State Space Size"] = results_df["State Space Size"] * (
    results_df["Infix Length"] + 1
)

In [None]:
from matplotlib.pyplot import figure

# Human-readable legend labels used in the plots.
BASELINE_LABEL = "Baseline approach"
NAIVE_LABEL = "Naive approach"
ADVANCED_LABEL = "Advanced approach"
ADVANCED_FIRST_MODEL_LABEL = "Advanced approach + pruning + forcing model move"

# Algorithm identifier -> legend label, per search strategy.
DIJKSTRA_RENAMINGS = {
    ALGORITHM_BASELINE_DIJKSTRA: BASELINE_LABEL,
    ALGORITHM_TP_DIJKSTRA_NAIVE: NAIVE_LABEL,
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE: ADVANCED_LABEL,
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE_ENFORCE_FIRST_TAU: ADVANCED_FIRST_MODEL_LABEL,
}

A_STAR_RENAMINGS = {
    ALGORITHM_BASELINE_A_STAR: BASELINE_LABEL,
    ALGORITHM_TP_A_STAR_NAIVE: NAIVE_LABEL,
    ALGORITHM_TP_A_STAR_NOT_NAIVE: ADVANCED_LABEL,
    ALGORITHM_TP_A_STAR_NOT_NAIVE_ENFORCE_FIRST_TAU: ADVANCED_FIRST_MODEL_LABEL,
}

# A* subset without the naive and the enforce-first-tau variants.
ALGORITHMS_A_STAR_BASELINE_AND_ADVANCED = ALGORITHMS_A_STAR.copy()
ALGORITHMS_A_STAR_BASELINE_AND_ADVANCED.remove(
    ALGORITHM_TP_A_STAR_NOT_NAIVE_ENFORCE_FIRST_TAU
)
ALGORITHMS_A_STAR_BASELINE_AND_ADVANCED.remove(ALGORITHM_TP_A_STAR_NAIVE)

# Dijkstra subset without the naive and the enforce-first-tau variants.
ALGORITHMS_DIJKSTRA_BASELINE_AND_ADVANCED = ALGORITHMS_DIJKSTRA.copy()
ALGORITHMS_DIJKSTRA_BASELINE_AND_ADVANCED.remove(
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE_ENFORCE_FIRST_TAU
)
ALGORITHMS_DIJKSTRA_BASELINE_AND_ADVANCED.remove(ALGORITHM_TP_DIJKSTRA_NAIVE)

# One entry per plot family:
# (output sub-directory, algorithms to include, renaming map, legend columns).
plot_configs = [
    ("DIJKSTRA", ALGORITHMS_DIJKSTRA, DIJKSTRA_RENAMINGS, 2),
    (
        "DIJKSTRA_BASELINE_VS_ADVANCED",
        ALGORITHMS_DIJKSTRA_BASELINE_AND_ADVANCED,
        DIJKSTRA_RENAMINGS,
        2,
    ),
    ("A_STAR", ALGORITHMS_A_STAR, A_STAR_RENAMINGS, 2),
    (
        "A_STAR_BASELINE_VS_ADVANCED",
        ALGORITHMS_A_STAR_BASELINE_AND_ADVANCED,
        A_STAR_RENAMINGS,
        2,
    ),
]

# Fixed colour per label so an approach keeps its colour in every plot.
sns_palette = sns.color_palette()
palette = {
    BASELINE_LABEL: sns_palette[0],
    NAIVE_LABEL: sns_palette[2],
    ADVANCED_LABEL: sns_palette[1],
    ADVANCED_FIRST_MODEL_LABEL: sns_palette[3],
}

# Box plots per infix-length bin, once with and once without outliers, for
# every plot configuration.

# (results column, y-axis label, whether the figure is saved as PDF).
attributes_with_title = [
    ("Consumed Time", "Consumed Time (in seconds)", True),
    ("Preprocessing Duration", "Preprocessing Duration (in seconds)", True),
    ("Alignment Duration", "Alignment Duration (in seconds)", True),
    ("Visited States", "Visited States", True),
    ("Queued States", "Queued States", True),
    ("Added Tau Transitions", "Added Tau Transitions", False),
    ("SPN State Space Size", "SPN State Space Size", False),
]

for name, algorithms, renaming, n_cols in plot_configs:
    plot_dir = os.path.join(RESULTS, "plots", FILENAME, name)
    Path(plot_dir).mkdir(parents=True, exist_ok=True)

    # Copy the filtered rows so the helper columns below are written to an
    # independent frame rather than to a slice of results_df. The data does
    # not depend on the outlier setting, so it is prepared once per config.
    d = results_df[results_df["Algorithm"].isin(algorithms)].copy()

    d["Infix Length Bin"] = pd.cut(d["Infix Length"], bins=6)
    # Format each bin as "<lowest>-<highest>" integer length.
    d["Infix Length Bin Formatted"] = d["Infix Length Bin"].apply(
        lambda interval: str(math.floor(interval.left) + 1)
        + "-"
        + str(math.floor(interval.right))
    )
    for curr_name, renamed_name in renaming.items():
        d.loc[d["Algorithm"] == curr_name, "Algorithm"] = renamed_name

    for with_outliers in [True, False]:
        for attribute, y_label, should_save in attributes_with_title:
            figure(figsize=(6.3, 4))
            sns.boxplot(
                data=d,
                x="Infix Length Bin Formatted",
                y=attribute,
                hue="Algorithm",
                palette=palette,
                showfliers=with_outliers,
            )
            # A two-row legend needs more room above the axes.
            y_legend = 1.15 if n_cols == 2 and len(algorithms) > 2 else 1.09
            plt.legend(
                bbox_to_anchor=(0, y_legend), loc=2, borderaxespad=0.0, ncol=n_cols
            )
            plt.xlabel("Infix Length")
            plt.ylabel(y_label)
            if should_save:
                outlier_extension = ""
                if not with_outliers:
                    outlier_extension = " without outliers"
                plt.savefig(
                    os.path.join(plot_dir, attribute + outlier_extension + ".pdf"),
                    bbox_inches="tight",
                )
            plt.show()

In [None]:
# Count result rows below / at-or-above a cost of 10000.
# NOTE(review): presumably 10000 is the cost of one deviating move -- confirm.
# These are row counts, so an infix is counted once per algorithm row.
print("Non-deviating infixes:", len(results_df[results_df["Cost"] < 10000]))
print("Deviating infixes:", len(results_df[results_df["Cost"] >= 10000]))

In [None]:
# Histogram of infix lengths over all result rows.
results_df["Infix Length"].hist()

In [None]:
# Histogram of the number of distinct activities per result row.
results_df["Distinct Activities"].hist()

In [None]:
# Mean of every numeric result column per (infix length, algorithm), computed
# separately for the A* and the Dijkstra variants. numeric_only=True keeps the
# aggregation from failing on the string columns of results_df.
mean_per_length_df_a_star = (
    results_df[results_df["Algorithm"].isin(ALGORITHMS_A_STAR)]
    .groupby(by=["Infix Length", "Algorithm"])
    .mean(numeric_only=True)
)
mean_per_length_df_dijkstra = (
    results_df[results_df["Algorithm"].isin(ALGORITHMS_DIJKSTRA)]
    .groupby(by=["Infix Length", "Algorithm"])
    .mean(numeric_only=True)
)
# (title suffix, aggregated frame) pairs consumed by the line-plot cell.
mean_dfs = [
    ("A Star", mean_per_length_df_a_star),
    ("Dijkstra", mean_per_length_df_dijkstra),
]

In [None]:
# (aggregated column, plot title) pairs for the mean-per-length line plots.
attributes_with_title = [
    ("Consumed Time", "Mean consumed time per infix length"),
    ("Preprocessing Duration", "Mean preprocessing duration per infix length"),
    ("Alignment Duration", "Mean alignment duration per infix length"),
    ("Visited States", "Mean visited states per infix length"),
    ("Queued States", "Mean queued states per infix length"),
    # ('LP Solved', 'Mean solved linear programs'),
    ("Added Tau Transitions", "Mean added tau transitions per infix length"),
]

# One line plot per attribute and search strategy, one line per algorithm.
for attribute, title in attributes_with_title:
    for search_alg, mean_df in mean_dfs:
        # sns.lineplot(data=mean_df.loc[pd.IndexSlice[0:60,:],:], x='Infix Length', y=attribute, hue='Algorithm', marker='.')
        sns.lineplot(
            data=mean_df, x="Infix Length", y=attribute, hue="Algorithm", marker="."
        )
        # Place the legend to the right of the axes.
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
        plt.title(title + " (" + search_alg + ")")
        plt.show()

In [None]:
# Display all result rows of the naive A* variant.
results_df[results_df["Algorithm"] == ALGORITHM_TP_A_STAR_NAIVE]

In [None]:
# Per Dijkstra variant: stacked mean preprocessing and alignment duration over
# the infix length.
for alg in ALGORITHMS_DIJKSTRA:
    # numeric_only=True keeps the mean from failing on string columns.
    filtered_df = (
        results_df[results_df["Algorithm"] == alg]
        .groupby(by=["Infix Length"])
        .mean(numeric_only=True)
    )
    x = filtered_df.index
    y = [filtered_df["Preprocessing Duration"], filtered_df["Alignment Duration"]]

    plt.stackplot(x, y, labels=["Preprocessing Duration", "Alignment Duration"])
    plt.legend(loc="upper left")
    plt.xlabel("Infix Length")
    plt.ylabel("Duration")
    plt.title("Mean duration: " + alg)
    plt.show()

In [None]:
# Metrics to compare across algorithms, joined per infix into one wide frame.
columns_of_interest = [
    "Infix Length",
    "Consumed Time",
    "Preprocessing Duration",
    "Alignment Duration",
    "Visited States",
]
# One frame per algorithm, in the order of ALGORITHMS.
datasets = [
    results_df[results_df["Algorithm"] == alg][["Infix"] + columns_of_interest]
    for alg in ALGORITHMS
]
# Suffix every metric column with its algorithm name so the merged frame has
# unique column names.
for i in range(len(datasets)):
    for column in columns_of_interest:
        datasets[i] = datasets[i].rename(columns={column: column + "_" + ALGORITHMS[i]})

# Join all per-algorithm frames on the infix.
merged_df = datasets[0]
for i in range(1, len(datasets)):
    merged_df = merged_df.merge(datasets[i], on="Infix")

In [None]:
# For each metric, plot the mean difference "baseline minus TP variant"
# (Dijkstra only) over the infix length.
for variable in [
    "Consumed Time",
    "Preprocessing Duration",
    "Alignment Duration",
    "Visited States",
]:
    df = merged_df.copy()
    df[variable + " Dijkstra: Baseline - TP_Naive"] = (
        df[variable + "_" + ALGORITHM_BASELINE_DIJKSTRA]
        - df[variable + "_" + ALGORITHM_TP_DIJKSTRA_NAIVE]
    )
    df[variable + " Dijkstra: Baseline - TP_Not_Naive"] = (
        df[variable + "_" + ALGORITHM_BASELINE_DIJKSTRA]
        - df[variable + "_" + ALGORITHM_TP_DIJKSTRA_NOT_NAIVE]
    )
    # The merged frame only has suffixed length columns; take the baseline's.
    df["Infix Length"] = df["Infix Length_" + ALGORITHM_BASELINE_DIJKSTRA]

    # Long format: one row per (infix, difference kind).
    mean_per_length_df = pd.melt(
        df,
        id_vars=["Infix", "Infix Length"],
        value_vars=[
            variable + " Dijkstra: Baseline - TP_Naive",
            variable + " Dijkstra: Baseline - TP_Not_Naive",
        ],
        value_name=variable,
        var_name="alg",
    )
    # numeric_only=True skips the string "Infix" column, which cannot be
    # averaged.
    mean_per_length_df = mean_per_length_df.groupby(by=["Infix Length", "alg"]).mean(
        numeric_only=True
    )
    sns.lineplot(
        data=mean_per_length_df, x="Infix Length", y=variable, hue="alg", marker="."
    )
    plt.ylabel(variable + " difference")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
    plt.title(variable + ": Baseline - Algorithm (mean)")
    plt.show()

In [None]:
# Show every row that shares its infix with the row at position 8057.
# The position is hard-coded and depends on the currently loaded results.
results_df[results_df["Infix"] == results_df.iloc[8057]["Infix"]]

In [None]:
# Show rows whose preprocessing duration exceeds 0.5.
results_df[results_df["Preprocessing Duration"] > 0.5]

In [None]:
# Show the infix of the row at position 2750 (hard-coded, data dependent).
results_df.iloc[2750]["Infix"]

In [None]:
from pm4py.visualization.transition_system import visualizer as ts_visualizer
from pm4py.objects.petri_net.utils import reachability_graph
from pm4py.objects.conversion.process_tree import converter as pt_converter

# Convert the current process tree into a Petri net and build its
# reachability graph starting from the marking `im`.
net, im, fm = pt_converter.apply(tree)
ts = reachability_graph.construct_reachability_graph(net, im)
# Render the transition system as PNG and show it.
gviz = ts_visualizer.apply(
    ts, parameters={ts_visualizer.Variants.VIEW_BASED.value.Parameters.FORMAT: "png"}
)
ts_visualizer.view(gviz)
print("States in reachability graph:", len(ts.states))

In [None]:
def plot_for_each_algorithm(plotting_func):
    """Draw one titled figure per algorithm using ``plotting_func``.

    ``plotting_func`` is called with the rows of ``results_df`` that belong to
    one algorithm; afterwards the algorithm name is set as the title and the
    figure is shown.
    """
    for algorithm_name in ALGORITHMS:
        algorithm_rows = results_df[results_df["Algorithm"] == algorithm_name]
        plotting_func(algorithm_rows)
        plt.title(algorithm_name)
        plt.show()

In [None]:
# Density estimate of the consumed time, one curve per algorithm.
sns.kdeplot(data=results_df, x="Consumed Time", hue="Algorithm")

In [None]:
# Histogram of the consumed time, drawn separately for every algorithm.
plotting_func = lambda d: sns.histplot(data=d, x="Consumed Time")
plot_for_each_algorithm(plotting_func)

## Results for synthetic parallel trees

In [None]:
# Load the result files of all synthetic logs (11 parallelism levels x 10 logs)
# into one frame, tagged with the parallelism level and the log number.
results = dict()
amount_parallel_nodes = [i / 10 for i in range(11)]

# Columns kept from every per-log results file.
parallel_columns = [
    "Infix Length",
    "Preprocessing Duration",
    "Algorithm",
    "Alignment Duration",
]
parallel_frames = []

for amount_parallel in amount_parallel_nodes:
    for i in range(10):
        df = pd.read_csv(
            os.path.join(
                "./results_parallel",
                f"log_parallel_{amount_parallel}_{i}_infix_results.csv",
            )
        )
        # Copy the column subset so the tag columns are added to an
        # independent frame rather than to a slice of the loaded one.
        df = df[parallel_columns].copy()
        df["Amount Parallelism"] = amount_parallel
        df["Original Log"] = i
        parallel_frames.append(df)

# Concatenate once instead of re-copying the accumulated frame for every file.
parallel_df = pd.concat(parallel_frames)

In [None]:
# Box plot of the preprocessing duration per parallelism level for the
# Dijkstra variants.
d = parallel_df[parallel_df["Algorithm"].isin(ALGORITHMS_DIJKSTRA)]
sns.boxplot(data=d, x="Amount Parallelism", y="Preprocessing Duration", hue="Algorithm")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.title("Preprocessing duration")
plt.show()

In [None]:
# Fixed colour per label, same mapping as for the box plots.
sns_palette = sns.color_palette()
palette = {
    BASELINE_LABEL: sns_palette[0],
    NAIVE_LABEL: sns_palette[2],
    ADVANCED_LABEL: sns_palette[1],
    ADVANCED_FIRST_MODEL_LABEL: sns_palette[3],
}

# Baseline vs. advanced A* on the synthetic logs: mean per
# (parallelism level, algorithm) after renaming to the legend labels.
d = parallel_df[
    (parallel_df["Algorithm"].isin(ALGORITHMS_A_STAR_BASELINE_AND_ADVANCED))
].copy()
d.loc[d["Algorithm"] == ALGORITHM_BASELINE_A_STAR, "Algorithm"] = BASELINE_LABEL
d.loc[d["Algorithm"] == ALGORITHM_TP_A_STAR_NOT_NAIVE, "Algorithm"] = ADVANCED_LABEL
d = d.groupby(by=["Amount Parallelism", "Algorithm"]).mean()

sns.lineplot(
    data=d,
    x="Amount Parallelism",
    y="Preprocessing Duration",
    hue="Algorithm",
    palette=palette,
)
plt.xlabel("Amount of Parallelism")
plt.ylabel("Preprocessing Duration\n(in seconds, log scale)")
plt.yscale("log")
# Reorder the legend so the baseline entry comes first.
handles, labels = plt.gca().get_legend_handles_labels()
order = [labels.index(BASELINE_LABEL), labels.index(ADVANCED_LABEL)]
plt.legend(
    [handles[idx] for idx in order],
    [labels[idx] for idx in order],
    bbox_to_anchor=(0, 1.09),
    loc=2,
    borderaxespad=0.0,
    ncol=3,
)
# Save the figure below the results directory, creating the folder if needed.
Path(os.path.join(RESULTS, "plots", "parallel_synthetic")).mkdir(
    parents=True, exist_ok=True
)
plt.savefig(
    os.path.join(
        RESULTS,
        "plots",
        "parallel_synthetic",
        "synthetic_parallel_logs_preprocessing_duration.pdf",
    ),
    bbox_inches="tight",
)
plt.show()

In [None]:
# Histogram of the baseline A* preprocessing duration for infixes shorter than
# 20 at parallelism level 0.8.
parallel_df[
    (parallel_df["Infix Length"] < 20)
    & (parallel_df["Amount Parallelism"] == 0.8)
    & (parallel_df["Algorithm"] == ALGORITHM_BASELINE_A_STAR)
]["Preprocessing Duration"].hist()

In [None]:
# Dijkstra variants to plot and the label used in their output directory name.
algs = [
    ALGORITHM_BASELINE_DIJKSTRA,
    ALGORITHM_TP_DIJKSTRA_NAIVE,
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE,
]
alg_to_name = {
    ALGORITHM_BASELINE_DIJKSTRA: BASELINE_LABEL,
    ALGORITHM_TP_DIJKSTRA_NAIVE: NAIVE_LABEL,
    ALGORITHM_TP_DIJKSTRA_NOT_NAIVE: ADVANCED_LABEL,
}

# For every algorithm, parallelism level and synthetic log: stacked median
# preprocessing and alignment duration over the infix length, saved as PDF.
for alg in algs:
    alg_f = alg_to_name[alg]
    for prop in [j / 10 for j in range(11)]:
        plot_dir = os.path.join(
            RESULTS,
            "plots",
            "parallel_synthetic",
            "algorithm_" + alg_f,
            "amout_parallel_" + str(prop),
        )
        Path(plot_dir).mkdir(parents=True, exist_ok=True)
        for i in range(10):
            # Filter by the loop's algorithm and parallelism level so each
            # output directory receives its own data instead of always the
            # baseline at level 0.8. numeric_only=True skips the string
            # "Algorithm" column, which has no median.
            filtered_df = (
                parallel_df[
                    (parallel_df["Algorithm"] == alg)
                    & (parallel_df["Amount Parallelism"] == prop)
                    & (parallel_df["Original Log"] == i)
                ]
                .groupby(by=["Infix Length"])
                .median(numeric_only=True)
            )
            x = filtered_df.index
            y = [
                filtered_df["Preprocessing Duration"],
                filtered_df["Alignment Duration"],
            ]

            plt.stackplot(x, y, labels=["Preprocessing Duration", "Alignment Duration"])
            plt.legend(bbox_to_anchor=(0, 1.1), loc=2, borderaxespad=0.0, ncol=3)
            plt.xlabel("Infix Length")
            plt.ylabel("Duration (in seconds)")
            plt.savefig(
                os.path.join(plot_dir, "log_" + str(i) + ".pdf"),
                bbox_inches="tight",
            )
            plt.show()

## Compare naive and not naive runtimes

In [None]:
# Histogram of the consumed time for the naive, advanced and baseline
# Dijkstra variants, coloured by algorithm.
sns.histplot(
    data=results_df[
        results_df["Algorithm"].isin(
            [
                ALGORITHM_TP_DIJKSTRA_NAIVE,
                ALGORITHM_TP_DIJKSTRA_NOT_NAIVE,
                ALGORITHM_BASELINE_DIJKSTRA,
            ]
        )
    ],
    x="Consumed Time",
    hue="Algorithm",
)

## Get overview of different models for different noise thresholds

In [None]:
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.inductive import algorithm as inductive_miner

# Discover a process tree with the inductive miner (IM_CLEAN variant) at the
# given noise threshold and show it.
NOISE_THRESHOLD = 0.99
# NOTE: this rebinds the notebook-wide FILENAME and `tree` used by other cells.
FILENAME = "BPI_Challenge_2012"
# NOTE(review): the log is read from "./results", not from RESULTS -- confirm
# that this is intended.
log = xes_importer.apply(os.path.join("./results", FILENAME + ".xes"))
tree = inductive_miner.apply_tree(
    log,
    variant=inductive_miner.Variants.IM_CLEAN,
    parameters={
        inductive_miner.Variants.IM_CLEAN.value.Parameters.NOISE_THRESHOLD: NOISE_THRESHOLD
    },
)

# Render the discovered tree as PNG and show it.
gviz = pt_visualizer.apply(
    tree,
    parameters={pt_visualizer.Variants.WO_DECORATION.value.Parameters.FORMAT: "png"},
)
pt_visualizer.view(gviz)

# Check for infixes with different costs

This should never happen. All approaches should determine the same minimal cost.

In [None]:
# One frame per algorithm with the infix, its cost and its alignment.
datasets = [
    results_df[results_df["Algorithm"] == alg][["Infix", "Cost", "Alignment"]]
    for alg in ALGORITHMS
]
# Start with the first algorithm; suffix its columns with the algorithm name.
merged_df = datasets[0].rename(
    columns={"Cost": "Cost_" + ALGORITHMS[0], "Alignment": "Alignment_" + ALGORITHMS[0]}
)

# Join the remaining algorithms on the infix, suffixing their columns as well.
for i in range(1, len(ALGORITHMS)):
    merged_df = merged_df.merge(
        datasets[i].rename(
            columns={
                "Cost": "Cost_" + ALGORITHMS[i],
                "Alignment": "Alignment_" + ALGORITHMS[i],
            }
        ),
        on="Infix",
    )

# Sanity check: the merged frame has as many rows as results_df has rows per
# algorithm.
assert len(merged_df) == len(results_df) / len(ALGORITHMS)
merged_df

In [None]:
# Every pair of neighbouring cost columns must be equal, hence all of them are.
cost_column_names = ["Cost_" + alg for alg in ALGORITHMS]
for i in range(0, len(cost_column_names) - 1):
    assert merged_df[cost_column_names[i]].equals(merged_df[cost_column_names[i + 1]])

In [None]:
# Same cost check restricted to rows without any missing value.
# NOTE(review): presumably missing values stem from timed-out runs -- confirm.
merged_df_without_timeouts = merged_df.dropna()
print(len(merged_df_without_timeouts))
cost_column_names = ["Cost_" + alg for alg in ALGORITHMS]
for i in range(0, len(cost_column_names) - 1):
    assert merged_df_without_timeouts[cost_column_names[i]].equals(
        merged_df_without_timeouts[cost_column_names[i + 1]]
    )

In [None]:
# Show the infixes for which the two selected algorithms report different
# costs, together with both alignments and costs.
compare_algorithms = [ALGORITHM_BASELINE_DIJKSTRA, ALGORITHM_TP_DIJKSTRA_NOT_NAIVE]
compare_algorithms_c = ["Cost_" + alg for alg in compare_algorithms]
compare_algorithms_a = ["Alignment_" + alg for alg in compare_algorithms]
filter_columns = ["Infix"] + compare_algorithms_a + compare_algorithms_c
merged_df[merged_df[compare_algorithms_c[0]] != merged_df[compare_algorithms_c[1]]][
    filter_columns
]

In [None]:
# Number of missing cost values per algorithm.
for alg in ALGORITHMS:
    print(alg, merged_df["Cost_" + alg].isna().sum())

In [None]:
# Show the rows selected by the "Timeout" column used as a boolean mask.
results_df[results_df["Timeout"]]

In [None]:
# Alignment of the first compared algorithm for the row at position 726
# (hard-coded, data dependent).
merged_df.iloc[726][compare_algorithms_a[0]]

In [None]:
# Alignment of the second compared algorithm for the same row (position 726).
merged_df.iloc[726][compare_algorithms_a[1]]