# Process Mining — Event Log Analysis (PM4Py)

Neutral, portfolio-oriented notebook for **Process Discovery, Conformance (optional), and KPIs**.

**Datasets:** Start with **BPI 2017**; optionally include BPI 2012 / Helpdesk / Road Traffic / BPI 2020 to add originality.


In [ ]:
# If running on Colab, install dependencies:
try:
    import pm4py  # noqa
except Exception:
    !pip -q install pm4py==2.7.5 pandas matplotlib
import warnings; warnings.filterwarnings('ignore')


In [ ]:
from pathlib import Path
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.statistics.traces.generic.log import case_statistics
from pm4py.util import xes_constants
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments
from pm4py.objects.log.util import sampling
import pm4py


## Provide log path
Upload or mount a log file (e.g., `BPI2017.xes.gz`) and set its path below. Keep the filename generic so it doesn't look tailored to any org.


In [ ]:
# Folder that receives every artifact produced by this notebook.
# mkdir is idempotent here: parents are created and an existing folder is fine.
OUTDIR = Path("/content/pm_outputs")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Path of the event log to analyse.
LOG_PATH = "/content/event_log.xes"  # e.g., /content/BPI2017.xes.gz

print('Using:', LOG_PATH)


In [ ]:
# Import the event log from LOG_PATH and derive the headline counts that the
# later cells (sampling thresholds, summary file) reuse.
log = xes_importer.apply(LOG_PATH)
variants = case_statistics.get_variant_statistics(log)

# len(log) is the number of cases; each case contributes len(trace) events.
num_cases, num_variants = len(log), len(variants)
num_events = sum(map(len, log))

print(f"Cases: {num_cases} | Events: {num_events} | Variants: {num_variants}")


## Process Discovery — Inductive Miner


In [ ]:
# Discover a process tree with the Inductive Miner, convert it to a Petri net,
# and save both visualisations as PNG files under OUTDIR.

# For large logs (more than 50000 events) mine on a random sample of up to
# 1000 cases. `sampling.sample_log` is the case-level sampler of this module;
# the previously used `sample_log_random` is not provided by it.
sampled_log = sampling.sample_log(log, 1000) if num_events > 50000 else log

# Use the top-level pm4py API to obtain the process tree: the
# `inductive_miner.apply_tree` entry point is not available in the pinned
# pm4py 2.7.x release.
process_tree = pm4py.discover_process_tree_inductive(sampled_log)

gviz_tree = pt_visualizer.apply(process_tree)
tree_png = OUTDIR / "process_tree.png"
pt_visualizer.save(gviz_tree, str(tree_png))

# net / im / fm (Petri net, initial marking, final marking) are reused by the
# optional conformance cell.
net, im, fm = pt_converter.apply(process_tree)
gviz_pn = pn_visualizer.apply(net, im, fm)
pn_png = OUTDIR / "petri_net.png"
pn_visualizer.save(gviz_pn, str(pn_png))

print('Saved:', tree_png, pn_png)


## (Optional) Conformance — Alignments on a small sample


In [ ]:
# Compute alignments of a small random sample against the discovered Petri net
# (net, im, fm) and report the mean of the per-trace 'fitness' values.

# Cap the work at 200 cases. `sampling.sample_log` is the case-level sampler of
# this module; the previously used `sample_log_random` is not provided by it.
sample_for_align = sampling.sample_log(log, 200) if num_cases > 200 else log

aligned_traces = alignments.apply_log(sample_for_align, net, im, fm)
avg_fitness = sum(a['fitness'] for a in aligned_traces) / len(aligned_traces)
print(f"Average fitness (sample): {avg_fitness:.3f}")


## KPIs — Case durations (median)


In [ ]:
# KPI: median case duration over the full log, in seconds.
from statistics import median

# Use the top-level pm4py API and pass the timestamp attribute explicitly.
# The earlier parameters dict was keyed by the literal attribute name
# ("time:timestamp") rather than a pm4py parameter key, so it had no effect.
durations = pm4py.get_all_case_durations(
    log,
    timestamp_key=xes_constants.DEFAULT_TIMESTAMP_KEY,
)

# Standard-library median; raises StatisticsError if the log has no cases.
median_duration = median(durations)
print(f"Median case duration (seconds): {median_duration:.2f}")


## Save short text summary


In [ ]:
# Write a short Markdown summary of the analysis next to the generated figures.
summary = OUTDIR / "analysis_summary.md"

# The conformance cell is optional, so avg_fitness may not exist in this session.
try:
    fitness_line = f"- Average alignment fitness (sample): **{avg_fitness:.3f}**\n"
except NameError:
    fitness_line = "- Average alignment fitness (sample): _not computed_\n"

# Build the whole text before opening the file, so a missing variable raises
# here instead of leaving a half-written summary on disk.
summary_lines = [
    "# Mini Event‑Log Analysis — Summary\n\n",
    f"- Cases: **{num_cases}**\n",
    f"- Events: **{num_events}**\n",
    f"- Variants: **{num_variants}**\n",
    fitness_line,
    f"- Median case duration (seconds): **{median_duration:.2f}**\n",
    "\n## Figures\n",
    "- Process tree: `process_tree.png`\n",
    "- Petri net: `petri_net.png`\n",
]

# Explicit UTF-8: the heading contains non-ASCII punctuation, which the
# platform default encoding is not guaranteed to be able to encode.
with open(summary, 'w', encoding='utf-8') as f:
    f.writelines(summary_lines)
print('Saved:', summary)
