In [1]:

import pandas as pd
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter


In [24]:
# Load the raw event log and order its rows by case, then by event time.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
df = (
    pd.read_pickle("./data/1_logs_org_a2.pkl")
    .sort_values(["case_seq_num", "timestamp"])
)


In [25]:

# Count the distinct cases, keep only those whose last event (after ordering by
# case and timestamp) has concept_name "Completed", then report how many went away.
cases_before = df["case_seq_num"].nunique()
df = (
    df.sort_values(["case_seq_num", "timestamp"])
    .groupby("case_seq_num")
    .filter(lambda trace: trace["concept_name"].iloc[-1] == "Completed")
    .reset_index(drop=True)
)
cases_after = df["case_seq_num"].nunique()
dropped = cases_before - cases_after
# Displayed as (number of dropped cases, number of cases before filtering).
dropped, cases_before


(453, 1740)

In [11]:
from pm4py.objects.log.util import dataframe_utils

def clean_df(df):
    """
    Return a cleaned copy of an event log; the input frame is left untouched.

    Steps, in order:
      1. Map the raw ``lifecycle_transition`` values onto a coarser vocabulary
         (assignment / work / wait / close / cancelled) to reduce noise.
         Values that are not keys of the mapping become NaN.
      2. Order events by ``case_seq_num`` then ``timestamp`` and keep only the
         cases whose last event maps to ``close``. The number of dropped cases
         is printed.
      3. Within each case, drop every event whose mapped transition equals the
         one of the event directly before it (consecutive duplicates).
      4. Keep only the case id, transition and timestamp columns and rename them
         to ``case:concept:name``, ``concept:name`` and ``time:timestamp``.
      5. Sort by case and timestamp so every trace is in chronological order,
         then pass the frame through pm4py's
         ``dataframe_utils.convert_timestamp_columns_in_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with at least the columns ``case_seq_num``,
        ``lifecycle_transition`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        New frame with the three renamed columns, one row per retained event.
    """
    mapping = {
        "Assigned": "assignment",
        "Awaiting Assignment": "assignment",

        "In Progress": "work",

        "Wait - Implementation": "wait",
        "Wait": "wait",
        "Wait - User": "wait",
        "Wait - Vendor": "wait",

        "Resolved": "close",
        "Closed": "close",
        "In Call": "close",

        "Cancelled": "cancelled",
    }

    # assign() builds a new frame. Writing into df[...] instead would overwrite
    # the caller's raw column, so a second call on the same frame would map
    # every (already mapped) value to NaN and drop all cases.
    events = df.assign(lifecycle_transition=df["lifecycle_transition"].map(mapping))

    cases_before = events["case_seq_num"].nunique()
    events = events.sort_values(["case_seq_num", "timestamp"])
    events = (
        events.groupby("case_seq_num")
        .filter(lambda trace: trace["lifecycle_transition"].iloc[-1] == "close")
        .reset_index(drop=True)
    )
    cases_after = events["case_seq_num"].nunique()
    dropped = cases_before - cases_after
    print(f"Dropped {dropped} cases out of {cases_before}  for not having CLOSE as end LIFECYCLE ")

    # shift() within each case yields the previous event's transition (NaN for
    # the first event of a case), so ne() keeps the first event of every run.
    # NaN never equals NaN, hence consecutive unmapped events are all kept.
    previous_transition = events.groupby("case_seq_num")["lifecycle_transition"].shift()
    events = events[events["lifecycle_transition"].ne(previous_transition)].reset_index(drop=True)

    events = events[["case_seq_num", "lifecycle_transition", "timestamp"]]
    events = events.rename(columns={
        "case_seq_num": "case:concept:name",
        "lifecycle_transition": "concept:name",
        "timestamp": "time:timestamp"
    })
    # Chronological order inside each case. Sorting by the activity name ahead
    # of the timestamp would scramble the event sequence of every trace.
    events = events.sort_values(["case:concept:name", "time:timestamp"])
    events = dataframe_utils.convert_timestamp_columns_in_df(events)

    return events


In [15]:


def get_most_common_trace(df: pd.DataFrame, top_n: int = 10) -> pd.Series:
    """
    Count trace variants and return the most frequent ones.

    A trace variant is the tuple of ``concept:name`` values of one case, taken
    in ``time:timestamp`` order.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns ``case:concept:name``, ``concept:name`` and
        ``time:timestamp``.
    top_n : int, default 10
        Number of variants to return; it is handed to ``Series.head`` unchanged.

    Returns
    -------
    pandas.Series
        Indexed by variant tuple, holding the number of cases that follow the
        variant, most frequent first, at most ``top_n`` entries.
    """
    # Order events inside every case before collapsing each case into a tuple.
    ordered = df.sort_values(["case:concept:name", "time:timestamp"])
    trace_variants = ordered.groupby("case:concept:name")["concept:name"].apply(tuple)
    # value_counts() already sorts by descending frequency.
    return trace_variants.value_counts().head(top_n)


In [17]:
# Load both raw logs (the a2 file first, then the c file).
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
df_org_a2, df_org_c = (
    pd.read_pickle("./data/1_logs_org_a2.pkl"),
    pd.read_pickle("./data/1_logs_org_c.pkl"),
)

# Cleaning: clean_df prints one "Dropped ..." line per log, a2 first.
cleaned_a2, cleaned_c = clean_df(df_org_a2), clean_df(df_org_c)

# Most frequent trace variants of the a2 log; the bare name below displays them.
a2_common_trace = get_most_common_trace(cleaned_a2)
a2_common_trace

Dropped 453 cases out of 1740  for not having CLOSE as end LIFECYCLE 
Dropped 969 cases out of 6080  for not having CLOSE as end LIFECYCLE 


concept:name
(assignment, work, close)                            293
(work, close)                                        167
(assignment, work, wait, close)                      132
(work, wait, close)                                   76
(work, assignment, work, assignment, work, close)     62
(work, assignment, work, close)                       52
(assignment, work, assignment, work, close)           52
(assignment, work, assignment, work, wait, close)     44
(work, assignment, work, wait, close)                 30
(assignment, work, wait, work, close)                 21
Name: count, dtype: int64