In [None]:
import json
import os

import rustxes

# Location of the XES file imported in this cell (relative path)
xes_path = "../../pm4py-core/tests/input_data/roadtraffic100traces.xes"

# rustxes.import_xes returns a pair:
#   log            -> polars DataFrame containing event data (i.e., event and trace attributes)
#   other_data_str -> JSON-encoded other Log data (like log-level attributes or extensions)
# NOTE(review): the two trailing None arguments are presumably optional settings left at
# their defaults (a later cell calls import_xes with the path only) - confirm against rustxes.
log, other_data_str = rustxes.import_xes(xes_path, None, None)

# Decode the JSON string into regular Python objects
other_data = json.loads(other_data_str)

# Bare last expression: the notebook displays the imported DataFrame
log

In [None]:
import pandas as pd
from pandas import DataFrame

# Rough comparison checks of two DataFrames
# Ignores certain differences (e.g., different datatypes) on purpose
def _values_roughly_equal(v1, v2):
  """Decide whether two cell values count as equal for the rough comparison.

  Two values match if they compare equal, if both are null according to
  ``pd.isnull``, or if their string representations are identical.
  """
  return v1 == v2 or (pd.isnull(v1) and pd.isnull(v2)) or str(v1) == str(v2)


def compare_log_dfs(df1: DataFrame,df2: DataFrame) -> bool:
  """Roughly compare two event-log DataFrames and print a report of the outcome.

  The check is intentionally lenient: differences that only concern the
  datatype are ignored, because values are also accepted as equal when both
  are null or when their string representations match.

  Args:
    df1: First DataFrame.
    df2: Second DataFrame.

  Returns:
    True if both frames have the same set of column names, the same number
    of rows and (roughly) equal values in every column; False otherwise.
    A message describing each detected difference is printed.
  """
  col_diff = set(df1.columns).symmetric_difference(set(df2.columns))
  if len(col_diff) > 0:
    print(f"❌ Column are different: {col_diff}")
    return False

  # The value-wise fallback below pairs values with zip(), which stops at the
  # shorter column. Without this check, frames with different row counts but a
  # matching prefix would wrongly be reported as equal.
  if len(df1) != len(df2):
    print(f"❌ Number of rows is different: {len(df1)} vs. {len(df2)}")
    return False

  all_equal = True
  # Column sets are identical at this point; walk them in df1's order so the
  # mismatch messages are printed in a stable order.
  for col in df1.columns:
    # Fast path: exact equality as defined by pandas
    if df1[col].equals(df2[col]):
      continue
    # Fallback to handle some weird datatype stuff: if all values are
    # (roughly) equal, the column is also considered equal
    if all(_values_roughly_equal(v1, v2) for (v1, v2) in zip(df1[col], df2[col])):
      continue
    all_equal = False
    print(f"❌ For column {col} not all entries are equal!")

  if all_equal:
    print("✅✅✅ All values in all column are equal!")
    return True
  else:
    print("❌❌❌ NOT all values in all column are equal!")
    return False

In [None]:
# Collected parse durations, filled by the benchmark cell:
# maps a log file name to a dict of {parser name: duration in seconds}
log_import_res = {}

In [None]:
import time
import pm4py
import numpy as np
import glob
import pandas as pd

# Benchmark every XES file in the input directory with all three parsers and
# cross-check the Rust result against the PM4Py iterparse result.
for log_name in glob.glob("../../pm4py-core/tests/input_data/*.xes"):
  print(log_name)
  # Use the platform's path rules instead of assuming "/" as separator
  log_name_postfix = os.path.basename(log_name)

  # Measure execution times.
  # perf_counter() is a monotonic high-resolution clock intended for measuring
  # intervals; time.time() can jump when the system clock is adjusted.
  start = time.perf_counter()
  (log_rs,other_data_str) = rustxes.import_xes(log_name)
  # The conversion to pandas is deliberately inside the timed section.
  # A (0,0)-shaped result is replaced by an empty pandas DataFrame.
  log_rs = log_rs.to_pandas() if log_rs.shape != (0,0) else pd.DataFrame()
  # NOTE(review): presumably normalises null markers to NaN for the comparison below - confirm
  log_rs = log_rs.fillna(np.nan)
  rs_dur = time.perf_counter() - start

  start = time.perf_counter()
  log_py_lbl = pm4py.read_xes(log_name,variant="line_by_line")
  py_lbl_dur = time.perf_counter() - start

  # Reset on every iteration: if the parse below fails, the name must not be
  # undefined (first iteration) or still refer to the previous log's DataFrame.
  log_py_iter = None
  start = time.perf_counter()
  try:
    log_py_iter = pm4py.read_xes(log_name,variant="iterparse")
    py_iter_dur = time.perf_counter() - start
  except Exception as e:
    print(f"WARNING: iterparse import failed for {log_name}: {e}")
    # A failed parse has no meaningful duration; NaN keeps it out of the results
    py_iter_dur = np.nan

  # Print & save execution times in log_import_res for plotting
  print(rs_dur,py_lbl_dur,py_iter_dur)
  log_import_res[log_name_postfix] = {"Rust": rs_dur, "PM4Py (line_by_line)": py_lbl_dur, "PM4Py (iterparse)": py_iter_dur }
  # Check if DataFrames are _roughly_ equal (only possible if iterparse succeeded)
  if log_py_iter is None:
    print("WARNING: Skipping comparison (iterparse import failed) for " + log_name)
  elif not compare_log_dfs(log_rs,log_py_iter):
    print("WARNING: Logs do not match for " + log_name)

In [None]:
import pandas as pd
import plotly.express as px

# Overview chart: after transposing, each row is one event log and each column
# one parser, so the bars are grouped per event log.
df = pd.DataFrame(log_import_res)
fig = px.bar(
  df.transpose(),
  barmode="group",
  labels={
    "index": "Event Log",
    "value": "Parse Duration [s]",
    "variable": "Parsing Implementation",
  },
)
fig.show()

# Detail charts: one separate bar chart per event log
for log_name, durations in log_import_res.items():
  x = ["Rust", "PM4Py (line_by_line)", "PM4Py (iterparse)"]
  fig = px.bar(
    x=x,
    y=[durations[n] for n in x],
    color=x,
    title=f"XES Parsing Performance on <i>{log_name}</i>",
    labels={"x": "XES Parser", "y": "Parse Duration [s]"},
  )
  fig.show()