# Advanced Analysis: Comparison of Variant Calculation between pm4py and Celonis

In [16]:
import pm4py
from pm4py.objects.conversion.log import converter as log_converter

In [11]:
# Load the BPI Challenge 2017 event log from its XES file and convert it to a DataFrame
log = pm4py.read_xes("./BPI Challenge 2017_1_all/BPI Challenge 2017.xes")
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

# Columns used as the case identifier and the activity label
case_col, act_col = "case:concept:name", "concept:name"

# Order events by case and timestamp, then collect each case's activities into one list
events_by_case_time = df.sort_values(by=[case_col, "time:timestamp"])
seq_per_case = events_by_case_time.groupby(case_col)[act_col].apply(list)

## Raw variants

In [13]:
# Ask pm4py for the variants of the unmodified log and report how many entries came back.
# NOTE(review): presumably the result is keyed by the activity sequence of each variant
# (as a tuple) — confirm against the installed pm4py version.
variants_dict = pm4py.stats.get_variants_as_tuples(log)
print(f"pm4py (raw): {len(variants_dict):,} variants")

pm4py (raw): 15,930 variants


## Full deduplication 

**Note:** Code generated with Claude (Anthropic).

In [14]:
def deduplicate_trace(trace):
    """Collapse every run of consecutive equal activities into a single occurrence.

    Only directly adjacent repeats are removed; an activity that reappears
    later in the trace, after a different one, is kept.

    Args:
        trace: Sequence of activities for one case.

    Returns:
        A new list with adjacent duplicates removed. An empty ``trace`` is
        returned unchanged (the same object).
    """
    if len(trace) == 0:
        return trace

    activities = iter(trace)
    last_kept = next(activities)
    collapsed = [last_kept]
    for activity in activities:
        # Keep an activity only when it differs from the most recently kept one.
        if activity != last_kept:
            collapsed.append(activity)
            last_kept = activity
    return collapsed

# Collapse adjacent repeats in every case's activity sequence.
seq_per_case_deduped = seq_per_case.apply(deduplicate_trace)

# Lists are unhashable, so count on tuple copies: hash-based counting is then
# well-defined and the resulting index can be used for lookups.
variant_counts_deduped = seq_per_case_deduped.map(tuple).value_counts()

print(f"pm4py (full deduplication): {len(variant_counts_deduped):,} variants")

pm4py (full deduplication): 4,290 variants


## Shortened variants (max_cycle_length=2, Celonis default)

**Note:** Code generated with Claude (Anthropic).

In [15]:
def shorten_trace(trace, max_cycle_length=2):
    """Cap every run of consecutive equal activities at ``max_cycle_length`` items.

    The first activity of a run is always kept; further repeats are kept only
    while fewer than ``max_cycle_length`` copies of the run are in the result.

    Args:
        trace: Sequence of activities for one case.
        max_cycle_length: Maximum number of adjacent repeats of the same
            activity to retain (default 2).

    Returns:
        A new list with over-long runs truncated. An empty ``trace`` is
        returned unchanged (the same object).
    """
    if len(trace) == 0:
        return trace

    kept = [trace[0]]
    run_kept = 1  # copies of the current run already present in `kept`
    for activity in trace[1:]:
        is_repeat = activity == kept[-1]
        if not is_repeat:
            # A different activity starts a new run.
            kept.append(activity)
            run_kept = 1
        elif run_kept < max_cycle_length:
            # Still below the cap for this run, so keep the repeat.
            kept.append(activity)
            run_kept += 1
    return kept

# Truncate runs of repeated activities in every case's activity sequence.
seq_per_case_shortened = seq_per_case.apply(lambda x: shorten_trace(x, max_cycle_length=2))

# Lists are unhashable, so count on tuple copies: hash-based counting is then
# well-defined and the resulting index can be used for lookups.
variant_counts_shortened = seq_per_case_shortened.map(tuple).value_counts()

print(f"pm4py (shortened, max_cycle=2): {len(variant_counts_shortened):,} variants")

pm4py (shortened, max_cycle=2): 7,084 variants
