## imports


In [1]:
# Standard library imports
import os
import sys
import time
import contextlib
import concurrent.futures
from datetime import datetime, timedelta
import io
import logging
import concurrent.futures
import threading
import json

# Third-party imports
import pandas as pd
import pytz

# pm4py imports
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay

from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristic_miner
from pm4py.algo.discovery.ilp import algorithm as ilp_miner
from pm4py.algo.evaluation import algorithm as evaluation
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
import calcEventLogPs



In [None]:
#log_folder = "/home/jupyter-benjamin.andrick-3cf07/test/logs/inductive_logs"


## Basic Functions


In [None]:
import logging
import os
from datetime import datetime


# Directory for logs
log_dir = "/home/jupyter-benjamin.andrick-3cf07/test/logfiles"
# logging.basicConfig(filename=...) opens the log file right away and fails
# with FileNotFoundError when the directory is missing, so create it first.
os.makedirs(log_dir, exist_ok=True)
# Setup logging with the new path; the timestamp gives every run its own file.
log_filename = os.path.join(log_dir, f"process_mining_{datetime.now().strftime('%Y%m%d_%H%M%S')}conformance_tag.log")
logging.basicConfig(
    filename=log_filename,
    level=logging.INFO,  # Use a valid logging level here
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.info('NEW RUN\n\n\n------------------------------------------------------------------------\n\n')

In [4]:
def discover_process_models(log):
    """Discover one Petri net per configured miner for ``log``.

    The miners run in this order: ILP miner, heuristic miner (CLASSIC
    variant), heuristic miner (PLUSPLUS variant) and alpha miner (classic
    variant). Both heuristic variants use a dependency threshold of 0.75.

    Args:
        log: Event log handed unchanged to every miner's ``apply``.

    Returns:
        dict: Maps a miner label to its ``(net, initial_marking,
        final_marking)`` tuple, in the order the miners were run.
    """
    def mine_heuristic(variant):
        # Each heuristic variant exposes its own Parameters enum; the same
        # threshold is used for both.
        settings = {variant.value.Parameters.DEPENDENCY_THRESH: 0.75}
        return heuristic_miner.apply(log, parameters=settings, variant=variant)

    models = {}

    # ILP miner with its default variant and parameters.
    net, start_marking, end_marking = ilp_miner.apply(log)
    models['ILP Miner'] = (net, start_marking, end_marking)

    # Heuristic miner, classic variant.
    net, start_marking, end_marking = mine_heuristic(heuristic_miner.Variants.CLASSIC)
    models['Heuristic Miner - Classic'] = (net, start_marking, end_marking)

    # Heuristic miner, plusplus variant (stored under the "Plus" label).
    net, start_marking, end_marking = mine_heuristic(heuristic_miner.Variants.PLUSPLUS)
    models['Heuristic Miner - Plus'] = (net, start_marking, end_marking)

    # Alpha miner, classic variant. The alpha-plus variant is intentionally
    # not run here (it was disabled in the original notebook).
    net, start_marking, end_marking = alpha_miner.apply(
        log, variant=alpha_miner.Variants.ALPHA_VERSION_CLASSIC
    )
    models['Alpha Miner - Classic'] = (net, start_marking, end_marking)

    return models

In [5]:
def perform_conformance_checking(log, models):
    """Evaluate every discovered model against ``log``.

    Args:
        log: Event log passed to ``evaluation.apply``.
        models: Mapping of algorithm name to
            ``(net, initial_marking, final_marking)``.

    Returns:
        dict: For each algorithm name either a flat dict of metrics (the four
        fitness values, precision, generalization, simplicity and the
        optional ``metricsAverageWeight`` / ``fscore``) or
        ``{"error": <message>}`` when the evaluation raised an exception.
    """
    fitness_fields = (
        'perc_fit_traces',
        'average_trace_fitness',
        'log_fitness',
        'percentage_of_fitting_traces',
    )
    mandatory_fields = ('precision', 'generalization', 'simplicity')
    optional_fields = ('metricsAverageWeight', 'fscore')

    report = {}
    for algorithm_name, (net, initial_marking, final_marking) in models.items():
        try:
            metrics = evaluation.apply(log, net, initial_marking, final_marking)

            # Lift the nested fitness values to the top level, then add the
            # remaining metrics in a fixed order.
            summary = {field: metrics['fitness'][field] for field in fitness_fields}
            for field in mandatory_fields:
                summary[field] = metrics[field]
            for field in optional_fields:
                summary[field] = metrics.get(field, None)

            report[algorithm_name] = summary
        except Exception as exc:
            # A failing model must not abort the others; keep the message so
            # the failure is visible in the stored results.
            report[algorithm_name] = {"error": str(exc)}

    return report

In [6]:
def process_xes_file_with_timeout(file_path):
    """Import one XES file, mine models from it and evaluate them.

    The log is converted to a DataFrame and back; if the DataFrame has no
    ``time:timestamp`` column, artificial timestamps (one per second starting
    at 2024-01-01) are added in between.

    Args:
        file_path: Path of the XES file.

    Returns:
        dict: ``{<file base name>: {"conformance": <per-miner metrics>}}``.
    """
    imported_log = xes_importer.apply(file_path)
    log_name = os.path.basename(file_path)

    events = log_converter.apply(imported_log, variant=log_converter.Variants.TO_DATA_FRAME)

    # Add artificial timestamps if the log carries none.
    timestamp_column = 'time:timestamp'
    if timestamp_column not in events.columns:
        events[timestamp_column] = pd.date_range(start='2024-01-01', periods=len(events), freq='s')

    # Convert back to an event log for mining and evaluation.
    event_log = log_converter.apply(events, variant=log_converter.Variants.TO_EVENT_LOG)

    discovered_models = discover_process_models(event_log)
    conformance = perform_conformance_checking(event_log, discovered_models)

    return {log_name: {"conformance": conformance}}


 


def run_with_timeout(file_path, timeout=60 * 12):
    """Run ``process_xes_file_with_timeout`` in a worker thread with a time limit.

    Args:
        file_path: Path of the XES file to process.
        timeout: Maximum number of seconds to wait for the worker. Defaults to
            12 minutes, the value that used to be hard-coded.

    Returns:
        The worker's result, or ``None`` when the time limit was exceeded. On
        a timeout the file's base name is appended to ``timeout_fails.txt`` in
        the same folder as the file.

    Raises:
        Exception: Any exception raised by the worker is re-raised here.
    """
    def thread_function(future, file_path):
        try:
            result = process_xes_file_with_timeout(file_path)
            future.set_result(result)
        except Exception as e:
            future.set_exception(e)

    future = concurrent.futures.Future()
    thread = threading.Thread(target=thread_function, args=(future, file_path))
    # Set thread as daemon so it will be terminated when main thread exits
    thread.daemon = True
    thread.start()

    try:
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        logging.info(f"Processing timed out for {os.path.basename(file_path)}")
        # Add the file to timeout_fails.txt
        folder_path = os.path.dirname(file_path)
        timeout_file = os.path.join(folder_path, 'timeout_fails.txt')
        with open(timeout_file, 'a') as f:
            f.write(os.path.basename(file_path) + '\n')
        return None
    finally:
        # Runs on success, timeout and error alike: try to stop a worker that
        # is still alive.
        try:
            thread.join(timeout=1)  # Give the thread 1 second to finish
            if thread.is_alive():
                logging.info(f"Thread for {os.path.basename(file_path)} did not terminate gracefully")
                # Ask the interpreter to raise SystemExit inside the worker.
                # NOTE(review): this is presumably only delivered once the
                # worker executes Python bytecode again - confirm for long
                # native calls.
                import ctypes
                thread_id = thread.ident
                if thread_id is not None:
                    # The C API takes the thread id as `unsigned long`.
                    target_id = ctypes.c_ulong(thread_id)
                    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                        target_id,
                        ctypes.py_object(SystemExit)
                    )
                    if res > 1:
                        # More than one thread state was modified: revert.
                        ctypes.pythonapi.PyThreadState_SetAsyncExc(
                            target_id,
                            None
                        )
                # Give it a moment to terminate
                thread.join(0.1)
        except Exception as e:
            logging.error(f"Error while terminating thread: {str(e)}")
      
      

## main

### main multiple


In [None]:
# ... existing imports and setup ...

def _load_existing_results(results_file):
    """Load stored results and the set of file names they already cover.

    Returns ``([], set())`` when the file is missing, is not valid JSON or
    does not hold a JSON list. Entries that are not non-empty dicts are kept
    in the list but ignored when collecting processed file names.
    """
    if not os.path.exists(results_file):
        return [], set()

    with open(results_file, 'r') as f:
        try:
            loaded = json.load(f)
        except json.JSONDecodeError:
            return [], set()

    if not isinstance(loaded, list):
        # Same recovery as for unparseable JSON: start over for this folder.
        logging.warning(f"Ignoring {results_file}: expected a JSON list")
        return [], set()

    processed = {
        next(iter(entry)) for entry in loaded
        if isinstance(entry, dict) and entry
    }
    return loaded, processed


def process_folders(base_folder):
    """Run the conformance pipeline on every XES file below ``base_folder``.

    Each non-hidden sub-folder is scanned for ``.xes`` files. Files already
    present in that folder's ``conformance_results.json`` are skipped; new
    results are appended to that file right after each file finishes.

    Args:
        base_folder: Directory whose direct sub-folders contain the XES files.

    Returns:
        list: The results produced during this call (one dict per newly
        processed file), in processing order.
    """
    fitness_results = []
    total_folders = [f for f in os.listdir(base_folder)
                     if os.path.isdir(os.path.join(base_folder, f)) and not f.startswith('.')]
    total_count = len(total_folders)

    for folder_count, folder in enumerate(total_folders, start=1):
        folder_path = os.path.join(base_folder, folder)
        print(f"Processing folder {folder_count} of {total_count}: {folder}")

        # Initialize or load existing JSON file for this folder
        results_file = os.path.join(folder_path, 'conformance_results.json')
        existing_results, processed_files = _load_existing_results(results_file)

        # Process XES files in the folder
        files = [f for f in os.listdir(folder_path)
                 if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.xes')]
        for file_count, file in enumerate(files, start=1):
            if file in processed_files:
                continue
            # 1-based position, consistent with the folder counter above.
            print(f"Processing file {file}, {file_count} of {len(files)} in folder {folder}")
            file_path = os.path.join(folder_path, file)
            try:
                fitness = run_with_timeout(file_path)

                if fitness is not None:
                    # Collect for the return value as well as for the file.
                    fitness_results.append(fitness)
                    existing_results.append(fitness)
                    # Write results immediately after each file
                    with open(results_file, 'w') as f:
                        json.dump(existing_results, f, indent=4)

            except Exception as e:
                logging.error(f"Error processing {file}: {str(e)}")

        print(f"Folder progress: {(folder_count / total_count) * 100:.2f}%")

    return fitness_results

# Usage: process every non-hidden sub-folder of base_folder.
base_folder = "/home/jupyter-benjamin.andrick-3cf07/test/logs"  # Replace with your folder path
# `results` holds whatever process_folders returns for this run.
results = process_folders(base_folder)

Processing folder 1 of 5: standard
Folder progress: 20.00%
Processing folder 2 of 5: inductive_logs
Folder progress: 40.00%
Processing folder 3 of 5: variants_coverage_filtered
Folder progress: 60.00%
Processing folder 4 of 5: variants_top_k_filtered
Processing file pdc_2020_0111101.xes_variants_top_k.xes, 0 of 543 in folder variants_top_k_filtered


parsing log, completed traces :: 100%|██████████| 800/800 [00:00<00:00, 6243.59it/s]
discovering Petri net using ILP miner, completed causal relations :: 100%|██████████| 82/82 [00:10<00:00,  7.48it/s]
replaying log with TBR, completed traces :: 100%|██████████| 662/662 [00:01<00:00, 491.02it/s]
replaying log with TBR, completed traces :: 100%|██████████| 8820/8820 [00:32<00:00, 273.29it/s]
replaying log with TBR, completed traces :: 100%|██████████| 662/662 [00:01<00:00, 538.27it/s]
replaying log with TBR, completed traces :: 100%|██████████| 8820/8820 [00:05<00:00, 1687.42it/s]
replaying log with TBR, completed traces :: 100%|██████████| 662/662 [00:02<00:00, 326.65it/s]
replaying log with TBR, completed traces :: 100%|██████████| 8820/8820 [00:05<00:00, 1574.70it/s]
replaying log with TBR, completed traces :: 100%|██████████| 662/662 [00:00<00:00, 1336.15it/s]
replaying log with TBR, completed traces :: 100%|██████████| 8820/8820 [00:01<00:00, 7769.17it/s]


Processing file activitylog_uci_detailed_labour.xes_variants_top_k.xes, 1 of 543 in folder variants_top_k_filtered


parsing log, completed traces :: 100%|██████████| 20/20 [00:00<00:00, 1273.95it/s]
discovering Petri net using ILP miner, completed causal relations :: 100%|██████████| 32/32 [00:00<00:00, 48.68it/s]
replaying log with TBR, completed traces :: 100%|██████████| 20/20 [00:00<00:00, 294.18it/s]
replaying log with TBR, completed traces :: 100%|██████████| 987/987 [00:02<00:00, 382.92it/s]
replaying log with TBR, completed traces :: 100%|██████████| 20/20 [00:00<00:00, 225.79it/s]
replaying log with TBR, completed traces :: 100%|██████████| 987/987 [00:00<00:00, 1251.84it/s]
replaying log with TBR, completed traces :: 100%|██████████| 20/20 [00:00<00:00, 486.22it/s]
replaying log with TBR, completed traces :: 100%|██████████| 987/987 [00:00<00:00, 10036.39it/s]
replaying log with TBR, completed traces :: 100%|██████████| 20/20 [00:00<00:00, 1057.05it/s]
replaying log with TBR, completed traces :: 100%|██████████| 987/987 [00:00<00:00, 6216.53it/s]


Processing file pdc2021_021000.xes_variants_top_k.xes, 2 of 543 in folder variants_top_k_filtered


parsing log, completed traces :: 100%|██████████| 200/200 [00:00<00:00, 2874.21it/s]
discovering Petri net using ILP miner, completed causal relations ::  84%|████████▍ | 109/130 [00:30<00:07,  3.00it/s]

KeyboardInterrupt: 

discovering Petri net using ILP miner, completed causal relations :: 100%|██████████| 130/130 [00:36<00:00,  3.57it/s]
replaying log with TBR, completed traces :: 100%|██████████| 200/200 [00:01<00:00, 136.93it/s]
replaying log with TBR, completed traces :: 100%|██████████| 5472/5472 [00:44<00:00, 123.20it/s]
replaying log with TBR, completed traces :: 100%|██████████| 200/200 [00:01<00:00, 180.84it/s]
replaying log with TBR, completed traces :: 100%|██████████| 5472/5472 [00:08<00:00, 656.67it/s] 
replaying log with TBR, completed traces :: 100%|██████████| 200/200 [00:01<00:00, 129.33it/s]
replaying log with TBR, completed traces :: 100%|██████████| 5472/5472 [00:07<00:00, 740.22it/s] 
replaying log with TBR, completed traces :: 100%|██████████| 200/200 [00:00<00:00, 311.39it/s]
replaying log with TBR, completed traces :: 100%|██████████| 5472/5472 [00:01<00:00, 5459.56it/s]


### main singular


In [7]:
# Results produced during this run; the "File Management" cells write this
# list to disk afterwards.
fitness_results = []
count = 0

dateien = [f for f in os.listdir(log_folder) if os.path.isfile(os.path.join(log_folder, f))]
gesamtanzahl = len(dateien)

# Initialize or load existing JSON file
results_file = os.path.join(log_folder, 'conformance_results.json')
if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        try:
            existing_results = json.load(f)
            # Extract already processed filenames from existing results
            processed_files = {list(result.keys())[0] for result in existing_results if result}
        except json.JSONDecodeError:
            existing_results = []
            processed_files = set()
else:
    existing_results = []
    processed_files = set()

for datei in dateien:
    if not datei.endswith(".xes"):
        continue
    if datei in processed_files:
        print(f"Datei '{datei}' wurde bereits verarbeitet. Überspringe...")
        continue

    print(f"Verarbeite Datei {count} von {gesamtanzahl}: {datei}")
    dateipfad = os.path.join(log_folder, datei)

    try:
        # The path is handed over as-is; the XES importer reads the file
        # itself, so no file handle is opened (and held) here.
        fitness = run_with_timeout(dateipfad)

        # Append new result to existing results and write immediately
        if fitness is not None:
            fitness_results.append(fitness)
            existing_results.append(fitness)
            with open(results_file, 'w') as f:
                json.dump(existing_results, f, indent=4)

    except Exception as e:
        logging.error(f"Fehler beim Laden von {datei}: {e}")

    count += 1
    print(f"{(count / gesamtanzahl) * 100:.2f}% abgeschlossen")

Datei 'inductive_pdc2024_020010.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_BPI Challenge 2017 - Offer log.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc_2016_5.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc2024_100010.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc2024_101110.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_BPI_Challenge_2013_incidents.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_SEPSIS.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_edited_hh102_labour.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc2024_101000.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc2024_001010.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_edited_hh110_weekends.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc2024_000111.xes' wurde bereits verarbeitet. Überspringe...
Datei 'inductive_pdc20

In [12]:
#calc_features = False
# Folder containing XES files
#log_folder = r'/Volumes/NO NAME/Event Logs/Test_Logs'
#log_folder = '/Users/benjaminandrick/Documents/Studium/Semester 7/Bachelorarbeit/Code/Logs'
      # You might want to add more aggressive termination here if necessary
# Process all XES files in the folder
# Results produced during this run; the "File Management" cells write this
# list to disk afterwards.
fitness_results = []
count = 0
# Text file listing one already handled file name per line.
geladene_dateien_pfad = os.path.join(log_folder, 'geladene_dateien.txt')
if os.path.exists(geladene_dateien_pfad):
    with open(geladene_dateien_pfad, 'r') as f:
        bereits_geladene_dateien = set(line.strip() for line in f)
else:
    bereits_geladene_dateien = set()

dateien = [f for f in os.listdir(log_folder) if os.path.isfile(os.path.join(log_folder, f))]
gesamtanzahl = len(dateien)

# Initialize or load existing JSON file
results_file = os.path.join(log_folder, 'conformance_results.json')
if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        try:
            existing_results = json.load(f)
        except json.JSONDecodeError:
            existing_results = []
else:
    existing_results = []

for datei in dateien:
    if not datei.endswith(".xes"):
        continue
    if datei in bereits_geladene_dateien:
        print(f"Datei '{datei}' wurde bereits geladen. Überspringe...")
        continue

    print(f"Verarbeite Datei {count} von {gesamtanzahl}: {datei}")
    dateipfad = os.path.join(log_folder, datei)

    try:
        # The path is handed over as-is; the XES importer reads the file
        # itself, so no file handle is opened (and held) here.
        fitness = run_with_timeout(dateipfad)

        # Append new result to existing results and write immediately
        if fitness is not None:
            fitness_results.append(fitness)
            existing_results.append(fitness)
            with open(results_file, 'w') as f:
                json.dump(existing_results, f, indent=4)

        # Mark file as processed (also after a timeout; a raised error skips
        # this so the file is retried on the next run)
        with open(geladene_dateien_pfad, 'a') as f:
            f.write(datei + '\n')

    except Exception as e:
        logging.error(f"Fehler beim Laden von {datei}: {e}")

    count += 1
    print(f"{(count / gesamtanzahl) * 100:.2f}% abgeschlossen")
    logging.info(f"{(count / gesamtanzahl) * 100:.2f}% abgeschlossen")





replaying log with TBR, completed traces :: 100%|██████████| 4911/4911 [01:29<00:00, 54.83it/s] 
replaying log with TBR, completed traces :: 100%|██████████| 764/764 [00:12<00:00, 61.32it/s] 
replaying log with TBR, completed traces :: 100%|██████████| 4911/4911 [00:25<00:00, 190.02it/s] 
replaying log with TBR, completed traces :: 100%|██████████| 764/764 [00:14<00:00, 52.37it/s] 
replaying log with TBR, completed traces :: 100%|██████████| 4911/4911 [01:09<00:00, 70.18it/s] 


## File Management

In [6]:
# Write the in-memory results of this session to a fixed file in the log
# folder, replacing any previous content.
output = os.path.join(log_folder, "fitness_results_inductive.json")
with open(output, mode='w') as f:
    json.dump(fitness_results, f, indent=4)




In [6]:
import os
import json
from datetime import datetime

# Target file that accumulates the fitness results across runs.
output = os.path.join(log_folder, 'fitness_results.json')

# Start from what is already stored; an empty or invalid file counts as
# "nothing stored yet".
all_fitness_results = []
if os.path.exists(output):
    with open(output, 'r') as f:
        stored_text = f.read()
    try:
        all_fitness_results = json.loads(stored_text)
    except json.JSONDecodeError:
        all_fitness_results = []

# Add the results of the current session and write everything back.
all_fitness_results.extend(fitness_results)

with open(output, 'w') as f:
    json.dump(all_fitness_results, f, indent=4)


In [5]:
import os

def get_unprocessed_files(folder_path, processed_files_path, extension=None):
    """Return the files in ``folder_path`` that are not listed as processed.

    Args:
        folder_path: Directory to scan (direct children only).
        processed_files_path: Text file with one processed file name per
            line. A missing file means nothing has been processed yet.
        extension: Optional suffix; when given, only names ending with it
            are considered.

    Returns:
        list: Sorted names of the regular files in ``folder_path`` that do
        not appear in ``processed_files_path``. Sub-directories are never
        returned.
    """
    # Only regular files count; os.listdir also yields directories.
    all_files = {
        name for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    }

    # Optional: filter files by extension
    if extension:
        all_files = {f for f in all_files if f.endswith(extension)}

    # Names already recorded as processed
    if os.path.exists(processed_files_path):
        with open(processed_files_path, 'r') as f:
            processed_files = {line.strip() for line in f}
    else:
        processed_files = set()

    # Sorted so the result does not depend on set iteration order.
    return sorted(all_files - processed_files)
    
# The tracking file is written inside log_folder by the processing cell, so
# it must be read from there too, not from the current working directory.
geladene_dateien_pfad = os.path.join(log_folder, 'geladene_dateien.txt')

# Fetch the unprocessed files (only .xes files)
unprocessed_files = get_unprocessed_files(log_folder, geladene_dateien_pfad, extension='.xes')

logging.info("Unverarbeitete Dateien:")
for filename in unprocessed_files:
    logging.info(filename)


NameError: name 'log_folder' is not defined

In [None]:

#ERROR - Fehler beim Laden von Road_Traffic_Fine_Management_Process.xes: Attribute value redefined, line 2498204, column 98 (Road_Traffic_Fine_Management_Process.xes, line 2498204)

In [8]:
# Show the current contents of fitness_results as the cell output.
fitness_results


[{'BPIC12.xes': {'conformance': {'ILP Miner': {'perc_fit_traces': 100.0,
     'average_trace_fitness': 1.0,
     'log_fitness': 1.0,
     'percentage_of_fitting_traces': 100.0,
     'precision': 0.1842215110616391,
     'generalization': 0.6892322645020896,
     'simplicity': 0.09298998569384835,
     'metricsAverageWeight': 0.4916109403143943,
     'fscore': 0.31112677711197273},
    'Heuristic Miner - Classic': {'perc_fit_traces': 11.0,
     'average_trace_fitness': 0.8483540326185368,
     'log_fitness': 0.8569110264843345,
     'percentage_of_fitting_traces': 11.0,
     'precision': 0.7678244972577697,
     'generalization': 0.5266178800130592,
     'simplicity': 0.5212765957446809,
     'metricsAverageWeight': 0.6681574998749611,
     'fscore': 0.8099253921519004},
    'Heuristic Miner - Plus': {'perc_fit_traces': 37.0,
     'average_trace_fitness': 0.8169760209957636,
     'log_fitness': 0.7223942153424974,
     'percentage_of_fitting_traces': 37.0,
     'precision': 1.0,
     'g