# Basic Analysis

## Imports & read file

In [85]:
import pm4py
from pm4py.objects.conversion.log import converter as log_converter

# Parse the BPI Challenge 2017 XES export, then convert the result into a
# pandas DataFrame (one row per event) for the column-based statistics below.
log = pm4py.read_xes("./BPI Challenge 2017_1_all/BPI Challenge 2017.xes")
df = log_converter.apply(
    log,
    variant=log_converter.Variants.TO_DATA_FRAME,
)

## Events, Cases, Activities

In [86]:
# Headline size of the log: number of event rows, distinct case identifiers,
# and distinct activity names.
n_events = df.shape[0]
n_cases = df["case:concept:name"].nunique()
n_activities = df["concept:name"].nunique()

print("#events:", n_events)
print("#cases:", n_cases)
print("#activities:", n_activities)


#events: 1202267
#cases: 31509
#activities: 26



## Variants

In [87]:
# Count the distinct process variants reported by pm4py.
# The count is stored as `n_variants` rather than `variant_counts`: the pandas
# cell further down binds `variant_counts` to a per-variant Series, and using
# one name for both an int and a Series makes the notebook's state depend on
# which of the two cells was executed last.
variants_dict = pm4py.stats.get_variants_as_tuples(log)
n_variants = len(variants_dict)
print("#process variants: ", n_variants)

#process variants:  15930


## Variants with pandas

In [88]:
case_col = "case:concept:name"
act_col = "concept:name"

# Rebuild each case's trace: order events by case and timestamp, then collect
# the activity names per case.
# Traces are stored as tuples, not lists: value_counts() is hash-based and
# lists are unhashable, so counting list-valued cells relies on pandas
# internals, whereas tuples are always valid keys.
seq_per_case = (
    df.sort_values([case_col, "time:timestamp"])
    .groupby(case_col)[act_col]
    .apply(tuple)
)

# One row per distinct trace, most frequent first.
variant_counts = seq_per_case.value_counts()
print("#process variants with pandas:", len(variant_counts))

# Show the most common variants as readable activity chains.
for seq, cnt in variant_counts.head().items():
    print(cnt, "cases |", " → ".join(seq))

#process variants with pandas: 15930
1056 cases | A_Create Application → A_Submitted → W_Handle leads → W_Handle leads → W_Complete application → A_Concept → W_Complete application → A_Accepted → O_Create Offer → O_Created → O_Sent (mail and online) → W_Complete application → W_Call after offers → W_Call after offers → A_Complete → W_Call after offers → W_Call after offers → W_Call after offers → A_Cancelled → O_Cancelled → W_Call after offers
1021 cases | A_Create Application → W_Complete application → W_Complete application → A_Concept → A_Accepted → O_Create Offer → O_Created → O_Sent (mail and online) → W_Complete application → W_Call after offers → W_Call after offers → A_Complete → W_Call after offers → W_Call after offers → W_Call after offers → A_Cancelled → O_Cancelled → W_Call after offers
734 cases | A_Create Application → A_Submitted → W_Handle leads → W_Handle leads → W_Complete application → A_Concept → W_Complete application → W_Complete application → A_Accepted → O_Crea

## Case labels

In [95]:
# Case-level attributes are the DataFrame columns carrying the "case:" prefix.
case_labels = [
    column_name
    for column_name in df.columns
    if column_name.startswith("case:")
]
print("#case labels:", len(case_labels))
print("Case labels:", case_labels)

#case labels: 4
Case labels: ['case:ApplicationType', 'case:RequestedAmount', 'case:LoanGoal', 'case:concept:name']


## Event labels

In [96]:
# Event-level attributes are all remaining columns, i.e. those without the
# "case:" prefix.
event_labels = [
    column_name
    for column_name in df.columns
    if not column_name.startswith("case:")
]
print("#event labels:", len(event_labels))
print("Event labels:", event_labels)

#event labels: 15
Event labels: ['Accepted', 'lifecycle:transition', 'Action', 'FirstWithdrawalAmount', 'MonthlyCost', 'concept:name', 'Selected', 'EventID', 'OfferID', 'EventOrigin', 'NumberOfTerms', 'time:timestamp', 'CreditScore', 'org:resource', 'OfferedAmount']


## Case length

In [98]:
# Case length = number of event rows recorded for each case.
case_length = df.groupby(case_col).size()

# Summary statistics, rounded to two decimals for display.
print("Case length: mean", round(case_length.mean(), 2))
print("Case length: median", round(case_length.median(), 2))
print("Case length: standard deviation", round(case_length.std(), 2))


Case length: mean 38.16
Case length: median 35.0
Case length: standard deviation 16.72


## Case duration

In [92]:
def format_timedelta(td):
    """Render a timedelta-like value as "D days, H hours, M minutes, S seconds".

    Only the ``days`` and ``seconds`` attributes of ``td`` are read, so any
    sub-second part is not shown.

    Args:
        td: Object exposing ``days`` and ``seconds`` (e.g. ``datetime.timedelta``).

    Returns:
        The formatted duration string.
    """
    # Split the within-day seconds into hours, then minutes and seconds.
    whole_hours, leftover = divmod(td.seconds, 3600)
    whole_minutes, whole_seconds = divmod(leftover, 60)
    return (
        f"{td.days} days, {whole_hours} hours, "
        f"{whole_minutes} minutes, {whole_seconds} seconds"
    )

# Case duration = latest event timestamp minus earliest event timestamp,
# computed per case. The grouped timestamp column is built once and reused for
# both extremes so the event table is only grouped a single time.
timestamps_by_case = df.groupby(case_col)["time:timestamp"]
case_duration = timestamps_by_case.max() - timestamps_by_case.min()

print(f"Case duration: mean {format_timedelta(case_duration.mean())}")
print(f"Case duration: median {format_timedelta(case_duration.median())}")
print(f"Case duration: standard deviation {format_timedelta(case_duration.std())}")

Case duration: mean 21 days, 21 hours, 35 minutes, 25 seconds
Case duration: median 19 days, 2 hours, 6 minutes, 20 seconds
Case duration: standard deviation 13 days, 4 hours, 3 minutes, 41 seconds


## Categorical event attributes

In [93]:
# Event attributes stored with the generic "object" dtype are the ones treated
# as categorical here.
categorical_event_attributes = [
    attribute
    for attribute in event_labels
    if df.dtypes[attribute] == "object"
]

print("Categorical event attributes:", categorical_event_attributes)
print("#categorical event attributes:", len(categorical_event_attributes))


Categorical event attributes: ['Accepted', 'lifecycle:transition', 'Action', 'concept:name', 'Selected', 'EventID', 'OfferID', 'EventOrigin', 'org:resource']
#categorical event attributes: 9


## Originators (employees or systems of the company)

In [94]:
# Distinct, non-missing values of the resource column.
unique_originators = df["org:resource"].dropna().unique()
print(f"#Originators: {len(unique_originators)}")


def _originator_sort_key(name):
    """Order originator IDs by text prefix, then by numeric suffix.

    A plain string sort would place "User_99" after "User_149"; splitting off a
    trailing "_<digits>" part keeps numbered IDs in numeric order. IDs without
    such a suffix are ordered by their full text.
    """
    text = str(name)
    prefix, _, suffix = text.rpartition("_")
    if prefix and suffix.isdecimal():
        return (prefix, int(suffix))
    return (text, -1)


# Report the range from the data itself instead of a hard-coded sentence, so
# the statement cannot drift out of sync with the log that was loaded.
ordered_originators = sorted(unique_originators, key=_originator_sort_key)
if ordered_originators:  # nothing to report when the column holds no values
    print(
        f"Originators range from {ordered_originators[0]} to {ordered_originators[-1]}"
    )

#Originators: 149
Originators range from User_1 to User_149
