In [2]:
import os
import pm4py
from pm4py.filtering import filter_case_size, filter_variants_top_k, filter_variants_by_coverage_percentage
from pm4py.objects.conversion.log import converter as log_converter
import pandas as pd
import numpy as np
from pm4py.statistics.variants.log import get as variants_module
from pm4py.algo.filtering.log.variants import variants_filter
import csv
import calcEventLogPs
from pm4py.algo.discovery.inductive import algorithm as inductive_miner



In [None]:
# Folder containing the input .xes event logs that the filtering cells iterate over.
# Alternative location on a local machine, kept for reference:
#log_path = '/Users/benjaminandrick/Documents/Studium/Semester 7/Bachelorarbeit/Code/Test_Logs'
# NOTE(review): later cells rebind `log_path` to the combined_filters folder;
# re-run this cell before re-running the filtering cells.
log_path = '/home/jupyter-benjamin.andrick-3cf07/test/logs/standard'

In [3]:
def dynamic_filter_variants_by_coverage(log):
    """
    Keep the most frequent variants until a cumulative coverage threshold is met.

    The threshold depends on the number of distinct variants in the log:
    0.8 when there are at most 20 variants, 0.5 otherwise. Variants are taken
    in descending order of frequency until the retained variants cover at
    least that share of all cases.

    Parameters:
    - log: The event log to filter.

    Returns:
    - (filtered_log, coverage_threshold): the log restricted to the retained
      variants, and the coverage threshold that was applied.
    """
    # Mapping variant -> collection of its cases (its length is the frequency)
    variants = variants_module.get_variants(log)

    # Total number of cases in the log
    total_cases = len(log)

    # Most frequent variants first
    sorted_variants = sorted(variants.items(), key=lambda x: -len(x[1]))

    # Use 80% coverage when there are few variants, else a lower threshold
    coverage_threshold = 0.8 if len(sorted_variants) <= 20 else 0.5

    # Count covered cases as an integer and divide once per check. Summing
    # per-variant float fractions accumulates rounding error (eight times 1/10
    # adds up to 0.7999999999999999), which could retain one variant too many.
    covered_cases = 0
    retained_variants = []

    for variant, cases in sorted_variants:
        covered_cases += len(cases)
        retained_variants.append(variant)

        if covered_cases / total_cases >= coverage_threshold:
            break

    # Apply the filter to retain only the selected variants
    filtered_log = variants_filter.apply(log, retained_variants)

    return filtered_log, coverage_threshold

## From standard logs to three separately filtered logs (case size, top-k variants, variant coverage) + properties

In [None]:
# Create one output directory per filter type (if they don't exist yet)
logs_path_parent = "/home/jupyter-benjamin.andrick-3cf07/test/logs"
output_dirs = {
    'case_size': os.path.join(logs_path_parent, 'case_size_filtered'),
    'variants_top_k': os.path.join(logs_path_parent, 'variants_top_k_filtered'),
    'variants_coverage': os.path.join(logs_path_parent, 'variants_coverage_filtered')
}

for directory in output_dirs.values():
    os.makedirs(directory, exist_ok=True)

# Per-filter CSV that records the parameters used for every processed log
csv_files = {
    'case_size': os.path.join(output_dirs['case_size'], 'parameters.csv'),
    'variants_top_k': os.path.join(output_dirs['variants_top_k'], 'parameters.csv'),
    'variants_coverage': os.path.join(output_dirs['variants_coverage'], 'parameters.csv')
}

# Header row of each parameters CSV
csv_headers = {
    'case_size': ['filename', 'min_cases', 'max_cases'],
    'variants_top_k': ['filename', 'top_k', 'coverage_threshold'],
    'variants_coverage': ['filename', 'coverage_threshold']
}

# Per-filter CSV reserved for log properties
props_csv_files = {
    'case_size': os.path.join(output_dirs['case_size'], 'log_properties.csv'),
    'variants_top_k': os.path.join(output_dirs['variants_top_k'], 'log_properties.csv'),
    'variants_coverage': os.path.join(output_dirs['variants_coverage'], 'log_properties.csv')
}

for file_path in props_csv_files.values():
    # The properties DataFrame will create its own headers when first saved
    if not os.path.exists(file_path):
        pd.DataFrame().to_csv(file_path, index=False)

# Write the header only when a parameters file is new or empty. Logs whose
# outputs already exist are skipped below, so truncating these files on every
# run would silently drop the parameters recorded for them earlier.
for filter_name, headers in csv_headers.items():
    params_path = csv_files[filter_name]
    if not os.path.exists(params_path) or os.path.getsize(params_path) == 0:
        with open(params_path, 'w', newline='') as f:
            csv.writer(f).writerow(headers)

# Share of cases that the top-k most frequent variants must cover
top_k_coverage_threshold = 0.8

for file in os.listdir(log_path):
    if not file.endswith(".xes"):
        continue

    # Check if the file has already been processed
    case_size_path = os.path.join(output_dirs['case_size'], f"{file}_case_size.xes")
    variants_top_k_path = os.path.join(output_dirs['variants_top_k'], f"{file}_variants_top_k.xes")
    variants_coverage_path = os.path.join(output_dirs['variants_coverage'], f"{file}_variants_coverage.xes")

    if os.path.exists(case_size_path) and os.path.exists(variants_top_k_path) and os.path.exists(variants_coverage_path):
        print(f"Skipping {file} - already processed")
        continue

    print(file)
    log = pm4py.read_xes(os.path.join(log_path, file))
    df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

    # Add artificial timestamps (one per second) if missing
    if 'time:timestamp' not in df.columns:
        df['time:timestamp'] = pd.date_range(start='2024-01-01', periods=len(df), freq='s')

    # Add case:concept:name if missing
    if 'case:concept:name' not in df.columns:
        # If case:id exists, use that
        if 'case:id' in df.columns:
            df['case:concept:name'] = df['case:id']
        # Otherwise use the row index as case ID
        else:
            df['case:concept:name'] = df.index.astype(str)

    # Convert back to event log
    log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG)

    # Filter 1: keep cases whose length lies between the 20th and 80th percentile
    case_sizes = [len(trace) for trace in log]
    min_cases = np.percentile(case_sizes, 20)
    max_cases = np.percentile(case_sizes, 80)
    filtered_log_case_size = filter_case_size(log, min_cases, max_cases)

    # Filter 2: smallest k such that the k most frequent variants reach the
    # coverage threshold. Cases are counted as integers and divided once per
    # check; summing float fractions can fall just short of the threshold
    # (e.g. 8 * 0.1 -> 0.7999999999999999) and inflate k by one.
    variants = variants_module.get_variants(log)
    total_cases = len(log)
    covered_cases = 0
    top_k = 0
    for _variant, cases in sorted(variants.items(), key=lambda x: len(x[1]), reverse=True):
        covered_cases += len(cases)
        top_k += 1
        if covered_cases / total_cases >= top_k_coverage_threshold:
            break

    filtered_log_variants_top_k = filter_variants_top_k(log, top_k)

    # Filter 3: dynamic coverage filter. Its threshold is kept in its own
    # variable so it cannot overwrite the one recorded for the top-k filter.
    filtered_log_variants_by_coverage_percentage, dynamic_coverage_threshold = dynamic_filter_variants_by_coverage(log)

    pm4py.write_xes(filtered_log_case_size, case_size_path)
    pm4py.write_xes(filtered_log_variants_top_k, variants_top_k_path)
    pm4py.write_xes(filtered_log_variants_by_coverage_percentage, variants_coverage_path)

    # Update CSV files with the parameters actually used by each filter
    with open(csv_files['case_size'], 'a', newline='') as f:
        csv.writer(f).writerow([file, min_cases, max_cases])

    with open(csv_files['variants_top_k'], 'a', newline='') as f:
        csv.writer(f).writerow([file, top_k, top_k_coverage_threshold])

    with open(csv_files['variants_coverage'], 'a', newline='') as f:
        csv.writer(f).writerow([file, dynamic_coverage_threshold])
        

Skipping pdc2022_110010.xes - already processed
Skipping pdc2022_000000.xes - already processed
Skipping pdc2021_110111.xes - already processed
Skipping pdc2024_101011.xes - already processed
Skipping pdc_2020_0201110.xes - already processed
Skipping pdc2024_121101.xes - already processed
Skipping pdc_2020_1000111.xes - already processed
Skipping pdc2021_011100.xes - already processed
Skipping pdc2022_011101.xes - already processed
Skipping pdc2021_001110.xes - already processed
Skipping pdc2021_101111.xes - already processed
Skipping pdc2021_001100.xes - already processed
Skipping pdc_2016_7.xes - already processed
Skipping pdc_2020_1201101.xes - already processed
Skipping pdc_2020_1011101.xes - already processed
Skipping pdc2022_111111.xes - already processed
Skipping pdc2024_121001.xes - already processed
Skipping pdc2022_011011.xes - already processed
Skipping pdc2022_100010.xes - already processed
Skipping pdc2022_121110.xes - already processed
Skipping pdc2022_020100.xes - alread

In [None]:
# Create one output directory per filter type inside the input folder
output_dirs = {
    'case_size': os.path.join(log_path, 'case_size_filtered'),
    'variants_top_k': os.path.join(log_path, 'variants_top_k_filtered'),
    'variants_coverage': os.path.join(log_path, 'variants_coverage_filtered')
}

for directory in output_dirs.values():
    os.makedirs(directory, exist_ok=True)

# Per-filter CSV that records the parameters used for every processed log
csv_files = {
    'case_size': os.path.join(output_dirs['case_size'], 'parameters.csv'),
    'variants_top_k': os.path.join(output_dirs['variants_top_k'], 'parameters.csv'),
    'variants_coverage': os.path.join(output_dirs['variants_coverage'], 'parameters.csv')
}

# Header row of each parameters CSV
csv_headers = {
    'case_size': ['filename', 'min_cases', 'max_cases'],
    'variants_top_k': ['filename', 'top_k', 'coverage_threshold'],
    'variants_coverage': ['filename', 'coverage_threshold']
}

# Per-filter CSV reserved for log properties
props_csv_files = {
    'case_size': os.path.join(output_dirs['case_size'], 'log_properties.csv'),
    'variants_top_k': os.path.join(output_dirs['variants_top_k'], 'log_properties.csv'),
    'variants_coverage': os.path.join(output_dirs['variants_coverage'], 'log_properties.csv')
}

for file_path in props_csv_files.values():
    # The properties DataFrame will create its own headers when first saved
    if not os.path.exists(file_path):
        pd.DataFrame().to_csv(file_path, index=False)

# Every log is reprocessed by this cell, so the parameter files start fresh
for file_path, headers in csv_headers.items():
    with open(csv_files[file_path], 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)

# Share of cases that the top-k most frequent variants must cover
top_k_coverage_threshold = 0.8

for file in os.listdir(log_path):
    if not file.endswith(".xes"):
        continue

    print(file)
    log = pm4py.read_xes(os.path.join(log_path, file))
    df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

    # Add artificial timestamps (one per second) if missing
    if 'time:timestamp' not in df.columns:
        df['time:timestamp'] = pd.date_range(start='2024-01-01', periods=len(df), freq='s')

    # Add case:concept:name if missing
    if 'case:concept:name' not in df.columns:
        # If case:id exists, use that
        if 'case:id' in df.columns:
            df['case:concept:name'] = df['case:id']
        # Otherwise use the row index as case ID
        else:
            df['case:concept:name'] = df.index.astype(str)

    # Convert back to event log
    log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG)

    # Filter 1: keep cases whose length lies between the 20th and 80th percentile
    case_sizes = [len(trace) for trace in log]
    min_cases = np.percentile(case_sizes, 20)
    max_cases = np.percentile(case_sizes, 80)
    filtered_log_case_size = filter_case_size(log, min_cases, max_cases)

    # Filter 2: smallest k such that the k most frequent variants reach the
    # coverage threshold. Cases are counted as integers and divided once per
    # check; summing float fractions can fall just short of the threshold
    # (e.g. 8 * 0.1 -> 0.7999999999999999) and inflate k by one.
    variants = variants_module.get_variants(log)
    total_cases = len(log)
    covered_cases = 0
    top_k = 0
    for _variant, cases in sorted(variants.items(), key=lambda x: len(x[1]), reverse=True):
        covered_cases += len(cases)
        top_k += 1
        if covered_cases / total_cases >= top_k_coverage_threshold:
            break

    print(top_k)
    filtered_log_variants_top_k = filter_variants_top_k(log, top_k)

    # Filter 3: dynamic coverage filter. Its threshold is kept in its own
    # variable so it cannot overwrite the one recorded for the top-k filter.
    filtered_log_variants_by_coverage_percentage, dynamic_coverage_threshold = dynamic_filter_variants_by_coverage(log)

    case_size_path = os.path.join(output_dirs['case_size'], f"{file}_case_size.xes")
    variants_top_k_path = os.path.join(output_dirs['variants_top_k'], f"{file}_variants_top_k.xes")
    variants_coverage_path = os.path.join(output_dirs['variants_coverage'], f"{file}_variants_coverage.xes")

    pm4py.write_xes(filtered_log_case_size, case_size_path)
    pm4py.write_xes(filtered_log_variants_top_k, variants_top_k_path)
    pm4py.write_xes(filtered_log_variants_by_coverage_percentage, variants_coverage_path)

    # Log properties are not computed in this cell (that code had been
    # commented out); calculate_properties_for_folder can be run on an output
    # folder afterwards.

    # Update CSV files with the parameters actually used by each filter
    with open(csv_files['case_size'], 'a', newline='') as f:
        csv.writer(f).writerow([file, min_cases, max_cases])

    with open(csv_files['variants_top_k'], 'a', newline='') as f:
        csv.writer(f).writerow([file, top_k, top_k_coverage_threshold])

    with open(csv_files['variants_coverage'], 'a', newline='') as f:
        csv.writer(f).writerow([file, dynamic_coverage_threshold])
        

## From standard logs to combined-filter logs (all three filters applied in sequence) + properties

In [None]:
# Destination folder for the logs that went through all three filters
output_dir = os.path.join("/home/jupyter-benjamin.andrick-3cf07/test/logs", 'combined_filters')
os.makedirs(output_dir, exist_ok=True)

# Bookkeeping files stored next to the generated logs
params_csv = os.path.join(output_dir, 'parameters.csv')
props_csv = os.path.join(output_dir, 'log_properties.csv')

if not os.path.exists(params_csv):
    # First run: nothing processed yet, start the parameters CSV with its header
    processed_files = []
    with open(params_csv, 'w', newline='') as f:
        csv.writer(f).writerow(
            ['filename', 'min_cases', 'max_cases', 'top_k', 'coverage_threshold']
        )
else:
    # Later runs: the filenames already recorded are skipped by the loop below
    processed_files = pd.read_csv(params_csv)['filename'].tolist()

# True when a properties CSV already exists, i.e. its header must not be rewritten
props_headers_initialized = os.path.exists(props_csv)

def preprocess_log(df):
    """
    Add the timestamp and case-id columns to a log DataFrame when they are missing.

    The DataFrame is modified in place and also returned.

    - 'time:timestamp': filled with one timestamp per row, one second apart,
      starting at 2024-01-01.
    - 'case:concept:name': copied from 'case:id' when that column exists,
      otherwise the row index converted to strings.
    """
    if 'time:timestamp' not in df.columns:
        synthetic_times = pd.date_range(start='2024-01-01', periods=len(df), freq='s')
        df['time:timestamp'] = synthetic_times

    # Nothing more to do when the case identifier is already present
    if 'case:concept:name' in df.columns:
        return df

    if 'case:id' in df.columns:
        df['case:concept:name'] = df['case:id']
    else:
        df['case:concept:name'] = df.index.astype(str)

    return df

# Property calculation was effectively disabled in this cell: an unconditional
# `continue` after saving the log also skipped recording the parameters, so no
# file was ever registered as processed. Keep the properties opt-in instead.
calculate_properties = False

# Process each XES file
for file in os.listdir(log_path):
    if not file.endswith(".xes"):
        continue

    # Skip if file was already processed
    if file in processed_files:
        print(f"Skipping {file} - already processed")
        continue

    try:
        print(f"Processing {file}...")

        # Read and preprocess log
        log = pm4py.read_xes(os.path.join(log_path, file))
        df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)
        df = preprocess_log(df)
        log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG)

        # Apply filters sequentially
        print('Applying filters...')
        # 1. Case size filter (20th to 80th percentile of case lengths)
        case_sizes = [len(trace) for trace in log]
        min_cases = np.percentile(case_sizes, 20)
        max_cases = np.percentile(case_sizes, 80)
        filtered_log = filter_case_size(log, min_cases, max_cases)

        # 2. Top-k variants filter
        # NOTE(review): top_k is derived from the unfiltered log but applied to
        # the case-size-filtered log - confirm this is intended.
        variants = variants_module.get_variants(log)
        coverage_threshold = 0.8
        total_cases = len(log)
        covered_cases = 0
        top_k = 0
        for _variant, cases in sorted(variants.items(), key=lambda x: len(x[1]), reverse=True):
            # Integer case count, divided once per check: summed float
            # fractions can fall just short of the threshold and inflate k.
            covered_cases += len(cases)
            top_k += 1
            if covered_cases / total_cases >= coverage_threshold:
                break
        filtered_log = filter_variants_top_k(filtered_log, top_k)

        # 3. Dynamic coverage filter
        filtered_log, final_coverage_threshold = dynamic_filter_variants_by_coverage(filtered_log)

        # Save results
        print('Saving results...')
        output_path = os.path.join(output_dir, f"{file}_combined.xes")
        pm4py.write_xes(filtered_log, output_path)

        if calculate_properties:
            # Calculate and save properties
            try:
                props = calcEventLogPs.calculate_event_log_ps(filtered_log)
                props['filename'] = f"{file}_combined.xes"
                props_df = pd.DataFrame([props])

                # Initialize headers if this is the first successful calculation
                if not props_headers_initialized:
                    props_df.to_csv(props_csv, index=False)
                    props_headers_initialized = True
                else:
                    props_df.to_csv(props_csv, mode='a', header=False, index=False)

            except Exception as e:
                print(f"Error calculating properties for {file}: {e}")
                error_df = pd.DataFrame([{
                    'error': str(e),
                    'filename': f"{file}_combined.xes"
                }])
                error_df.to_csv(props_csv, mode='a', header=not props_headers_initialized, index=False)
                props_headers_initialized = True

        # Save parameters; this row is what marks the file as processed on later runs
        with open(params_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([file, min_cases, max_cases, top_k, final_coverage_threshold])

        # Add to processed files list
        processed_files.append(file)

    except Exception as e:
        print(f"Error processing {file}: {e}")

## Filtered logs → inductive miner → generated logs

In [7]:
import threading
import concurrent.futures

In [8]:
def preprocess_log(df):
    """
    Add the timestamp and case-id columns to a log DataFrame when they are missing.

    The DataFrame is modified in place and also returned. The same helper is
    defined in other cells of this notebook as well.

    - 'time:timestamp': filled with one timestamp per row, one second apart,
      starting at 2024-01-01.
    - 'case:concept:name': copied from 'case:id' when that column exists,
      otherwise the row index converted to strings.
    """
    if 'time:timestamp' not in df.columns:
        synthetic_times = pd.date_range(start='2024-01-01', periods=len(df), freq='s')
        df['time:timestamp'] = synthetic_times

    # Nothing more to do when the case identifier is already present
    if 'case:concept:name' in df.columns:
        return df

    if 'case:id' in df.columns:
        df['case:concept:name'] = df['case:id']
    else:
        df['case:concept:name'] = df.index.astype(str)

    return df

def process_xes_file_with_timeout(file_path):
    """
    Read an XES log, discover a process tree from it and generate a log from that tree.

    No timeout is enforced inside this function itself; run_with_timeout calls
    it from a worker thread and applies the time limit there.

    Parameters:
    - file_path: Path of the .xes file to read.

    Returns:
    - The log generated from the discovered process tree.
    """
    raw_log = pm4py.read_xes(file_path)

    # Round-trip through a DataFrame so that missing columns can be filled in
    frame = preprocess_log(
        log_converter.apply(raw_log, variant=log_converter.Variants.TO_DATA_FRAME)
    )
    event_log = log_converter.apply(frame, variant=log_converter.Variants.TO_EVENT_LOG)

    # Inductive miner -> process tree -> generated log
    process_tree = inductive_miner.apply(event_log)
    return pm4py.objects.process_tree.semantics.generate_log(process_tree)
    

In [9]:
def run_with_timeout(file_path, timeout=60*12):
    """
    Run process_xes_file_with_timeout(file_path) in a worker thread with a time limit.

    Parameters:
    - file_path: Path of the .xes file to process.
    - timeout: Maximum number of seconds to wait for the result
      (default: 12 minutes).

    Returns:
    - The result of process_xes_file_with_timeout, or None when the time limit
      was exceeded. In that case the file's base name is appended to
      'timeout_fails.txt' in the folder that contains the input file.

    Raises:
    - Any exception raised by process_xes_file_with_timeout is re-raised here.
    """
    def thread_function(future, file_path):
        # Hand the outcome (value or exception) back through the future
        try:
            result = process_xes_file_with_timeout(file_path)
            future.set_result(result)
        except Exception as e:
            future.set_exception(e)

    future = concurrent.futures.Future()
    thread = threading.Thread(target=thread_function, args=(future, file_path))
    # Set thread as daemon so it will be terminated when main thread exits
    thread.daemon = True
    thread.start()

    try:
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        print(f"Processing timed out for {os.path.basename(file_path)}")
        # Add the file to timeout_fails.txt
        folder_path = os.path.dirname(file_path)
        timeout_file = os.path.join(folder_path, 'timeout_fails.txt')
        with open(timeout_file, 'a') as f:
            f.write(os.path.basename(file_path) + '\n')
        return None
    finally:
        # Best-effort cleanup of a worker that is still running
        try:
            thread.join(timeout=1)  # Give the thread 1 second to finish
            if thread.is_alive():
                print(f"Thread for {os.path.basename(file_path)} did not terminate gracefully")
                # Ask the interpreter to raise SystemExit inside the worker thread
                import ctypes
                thread_id = thread.ident
                if thread_id is not None:
                    # The C signature takes the thread id as an unsigned long
                    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                        ctypes.c_ulong(thread_id),
                        ctypes.py_object(SystemExit)
                    )
                    if res > 1:
                        # More than one thread state was affected: revert the request
                        ctypes.pythonapi.PyThreadState_SetAsyncExc(
                            ctypes.c_ulong(thread_id),
                            None
                        )
                # Give it a moment to terminate
                thread.join(0.1)
        except Exception as e:
            print(f"Error while terminating thread: {str(e)}")

In [None]:
# Destination folder for the logs generated from the inductive miner models
output_dir = os.path.join("/home/jupyter-benjamin.andrick-3cf07/test/logs", 'combined_ind')
# Without this, every write below fails when the folder does not exist yet
os.makedirs(output_dir, exist_ok=True)
# Input: the logs produced by the combined-filter cell
log_path = "/home/jupyter-benjamin.andrick-3cf07/test/logs/combined_filters"

# Process each XES file
for file in os.listdir(log_path):
    if not file.endswith(".xes"):
        continue

    # Check if output file already exists
    output_file = f"{file}_inductive.xes"
    output_path = os.path.join(output_dir, output_file)

    # Skip if file was already processed
    if os.path.exists(output_path):
        print(f"Skipping {file} - already processed")
        continue

    try:
        print(f"Processing {file}...")

        # Read, preprocess, mine and generate the log under a time limit
        inductive_log = run_with_timeout(os.path.join(log_path, file))

        # run_with_timeout returns None on timeout (and reports it itself);
        # there is nothing to write in that case.
        if inductive_log is None:
            print(f"No log produced for {file} - nothing saved")
            continue

        # Save the processed log
        pm4py.write_xes(inductive_log, output_path)
        print(f"Saved processed log to {output_path}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

Skipping pdc2024_110101.xes_combined.xes - already processed
Skipping pdc2021_011011.xes_combined.xes - already processed
Skipping pdc2022_121000.xes_combined.xes - already processed
Skipping pdc_2020_0011010.xes_combined.xes - already processed
Skipping pdc2024_100111.xes_combined.xes - already processed
Skipping pdc2022_020110.xes_combined.xes - already processed
Skipping pdc2024_120111.xes_combined.xes - already processed
Skipping pdc2022_020010.xes_combined.xes - already processed
Skipping pdc2024_120110.xes_combined.xes - already processed
Skipping pdc2021_001101.xes_combined.xes - already processed
Skipping pdc_2020_1211110.xes_combined.xes - already processed
Skipping pdc_2020_1210001.xes_combined.xes - already processed
Skipping pdc_2020_1211111.xes_combined.xes - already processed
Skipping pdc2024_120101.xes_combined.xes - already processed
Skipping pdc2021_121110.xes_combined.xes - already processed
Skipping pdc2024_001000.xes_combined.xes - already processed
Skipping pdc2021

parsing log, completed traces :: 100%|██████████| 449/449 [00:00<00:00, 1102.44it/s]


In [None]:
# Input: the combined-filter logs; output: the 'inductive_logs' folder
log_path = "/home/jupyter-benjamin.andrick-3cf07/test/logs/combined_filters"
output_dir = os.path.join("/home/jupyter-benjamin.andrick-3cf07/test/logs", 'inductive_logs')
os.makedirs(output_dir, exist_ok=True)

# The properties CSV lives next to the generated logs; remember whether it
# already exists so that its header is not written a second time
props_csv = os.path.join(output_dir, 'log_properties.csv')
props_headers_initialized = os.path.exists(props_csv)

def preprocess_log(df):
    """
    Add the timestamp and case-id columns to a log DataFrame when they are missing.

    The DataFrame is modified in place and also returned. The same helper is
    defined in earlier cells of this notebook as well.

    - 'time:timestamp': filled with one timestamp per row, one second apart,
      starting at 2024-01-01.
    - 'case:concept:name': copied from 'case:id' when that column exists,
      otherwise the row index converted to strings.
    """
    if 'time:timestamp' not in df.columns:
        synthetic_times = pd.date_range(start='2024-01-01', periods=len(df), freq='s')
        df['time:timestamp'] = synthetic_times

    # Nothing more to do when the case identifier is already present
    if 'case:concept:name' in df.columns:
        return df

    if 'case:id' in df.columns:
        df['case:concept:name'] = df['case:id']
    else:
        df['case:concept:name'] = df.index.astype(str)

    return df

# Property calculation was effectively disabled in this cell (an unconditional
# `continue` followed the log export). Keep it opt-in via this flag.
calculate_properties = False

# Process each XES file
for file in os.listdir(log_path):

    if not file.endswith(".xes"):
        continue

    # Skip logs whose output already exists. The previous check relied on a
    # `processed_files` list left over from another cell, which holds different
    # file names and is undefined on a fresh kernel.
    output_path = os.path.join(output_dir, f"{file}_inductive.xes")
    if os.path.exists(output_path):
        print(f"Skipping {file} - already processed")
        continue

    try:
        print(f"Processing {file}...")

        # Read and preprocess log
        log = pm4py.read_xes(os.path.join(log_path, file))
        df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)
        df = preprocess_log(df)
        log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG)

        # Discover a process tree and generate a log from it
        pt = inductive_miner.apply(log)
        inductive_log = pm4py.objects.process_tree.semantics.generate_log(pt)

        pm4py.write_xes(inductive_log, output_path)

        if calculate_properties:
            # Calculate and save properties
            try:
                props = calcEventLogPs.calculate_event_log_ps(inductive_log)
                props['filename'] = f"{file}_inductive.xes"
                props_df = pd.DataFrame([props])

                # Initialize headers if this is the first successful calculation
                if not props_headers_initialized:
                    props_df.to_csv(props_csv, index=False)
                    props_headers_initialized = True
                else:
                    props_df.to_csv(props_csv, mode='a', header=False, index=False)

            except Exception as e:
                print(f"Error calculating properties for {file}: {e}")
                error_df = pd.DataFrame([{
                    'error': str(e),
                    'filename': f"{file}_inductive.xes"
                }])
                error_df.to_csv(props_csv, mode='a', header=not props_headers_initialized, index=False)
                props_headers_initialized = True

    except Exception as e:
        print(f"Error processing {file}: {e}")

## Calculate log properties


In [None]:
import os
import pm4py
import calcEventLogPs
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter

def calculate_properties_for_folder(input_path: str, output_csv: str) -> None:
    """
    Calculate properties for all XES files in a folder and save them to a CSV file.

    Each .xes file contributes one row. When reading a log or calculating its
    properties fails, the row holds the filename and an 'error' message
    instead. When the folder contains no .xes files, a CSV with only the
    'filename' header is written.

    Parameters:
    - input_path: Path to the folder containing XES files
    - output_csv: Path where the CSV file should be saved
    """
    all_properties = []

    # Iterate through all XES files in the folder
    for file in os.listdir(input_path):
        if not file.endswith(".xes"):
            continue

        print(f"Processing {file}...")
        try:
            # Read the log
            log = pm4py.read_xes(os.path.join(input_path, file))

            # Convert to DataFrame so that missing columns can be filled in
            df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

            # Ensure required columns exist
            if 'time:timestamp' not in df.columns:
                df['time:timestamp'] = pd.date_range(start='2024-01-01', periods=len(df), freq='s')

            if 'case:concept:name' not in df.columns:
                if 'case:id' in df.columns:
                    df['case:concept:name'] = df['case:id']
                else:
                    df['case:concept:name'] = df.index.astype(str)

            # Convert back to event log
            log = log_converter.apply(df, variant=log_converter.Variants.TO_EVENT_LOG)

            # Calculate properties
            # presumably a dict of property name -> value; confirm in calcEventLogPs
            props = calcEventLogPs.calculate_event_log_ps(log)
            props['filename'] = file
            all_properties.append(props)

        except Exception as e:
            print(f"Error processing {file}: {str(e)}")  # Added for debugging
            all_properties.append({
                'filename': file,
                'error': f"Failed to calculate properties: {str(e)}"
            })

    # Convert to DataFrame; without any rows there is no 'filename' column yet,
    # and selecting it below would raise a KeyError
    props_df = pd.DataFrame(all_properties)
    if props_df.empty:
        props_df = pd.DataFrame(columns=['filename'])

    # Move filename column to front
    cols = ['filename'] + [col for col in props_df.columns if col != 'filename']
    props_df = props_df[cols]

    # Save to CSV
    props_df.to_csv(output_csv, index=False)
    print(f"Properties have been saved to {output_csv}")


# Example usage: calculate the properties of every .xes log in `log_path` and
# store them in a single CSV file.
# NOTE(review): `log_path` is rebound by several earlier cells, so the folder
# processed here depends on which cell ran last, while the output is written
# into the variants_coverage_filtered folder - confirm both refer to the same logs.
output_file = "/home/jupyter-benjamin.andrick-3cf07/test/logs/variants_coverage_filtered/log_properties.csv"
calculate_properties_for_folder(log_path, output_file)