In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from flaml import AutoML
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from tqdm import tqdm
import xgboost
import lightgbm as lgb

import pandas as pd
import json
import os
from pathlib import Path

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import LabelEncoder


## get Files

In [None]:
def process_folder(base_folder):
    """
    Load the properties CSV and the conformance JSON stored in one folder.

    The folder path is printed first. One specific folder keeps its inputs
    under "... copy" file names; every other folder uses the plain names.

    Returns:
        tuple: (properties DataFrame, parsed JSON data), or (None, None) when
        either of the two files does not exist.
    """
    print(base_folder)

    uses_copy_names = (
        str(base_folder) == "/home/jupyter-benjamin.andrick-3cf07/test/logs/inductive_logs"
    )
    if uses_copy_names:
        conf_name, props_name = 'conformance_results copy.json', 'new_properties_2 copy.csv'
    else:
        conf_name, props_name = 'conformance_results.json', 'new_properties_2.csv'

    conf_path = os.path.join(base_folder, conf_name)
    props_path = os.path.join(base_folder, props_name)

    # Both inputs are required; bail out before reading anything.
    if not os.path.exists(props_path) or not os.path.exists(conf_path):
        return None, None

    props_df = pd.read_csv(props_path)
    with open(conf_path, 'r') as handle:
        conf_data = json.load(handle)

    return props_df, conf_data



def flatten_conformance_data(json_data):
    """
    Turn the nested conformance JSON into one DataFrame row per (file, miner).

    Each element of ``json_data`` maps a file name to its data. The per-miner
    metrics are read from the ``'conformance'`` entry when present, otherwise
    from the data itself. Three fitness-related metrics are left out.
    """
    excluded = {'perc_fit_traces', 'average_trace_fitness', 'percentage_of_fitting_traces'}

    rows = [
        {
            "file": filename,
            "miner": miner,
            **{name: value for name, value in metrics.items() if name not in excluded},
        }
        for entry in json_data
        for filename, payload in entry.items()
        for miner, metrics in payload.get('conformance', payload).items()
    ]
    return pd.DataFrame(rows)

def process_all_folders(root_path):
    """
    Build one merged DataFrame per sub-folder of ``root_path``.

    Entries that are not directories, or whose name starts with '.i', are
    ignored. For every remaining folder the conformance rows are left-joined
    with the property rows (``file`` == ``File``).

    Returns:
        dict: folder name -> merged DataFrame (folders with missing input
        files are reported and left out).
    """
    merged_dfs = {}

    for entry in Path(root_path).iterdir():
        folder_name = entry.name
        if entry.is_dir() and not folder_name.startswith('.i'):
            print(f"Processing folder: {folder_name}")

            props_df, conf_data = process_folder(entry)
            if props_df is None or conf_data is None:
                print(f"Skipping {folder_name} - missing required files")
            else:
                conf_df = flatten_conformance_data(conf_data)
                # Keep every conformance row even if no property row matches.
                merged_dfs[folder_name] = conf_df.merge(
                    props_df,
                    how='left',
                    left_on='file',
                    right_on='File',
                )

    return merged_dfs

# Root directory whose immediate sub-folders are handed to process_folder one by one.
root_path = '/home/jupyter-benjamin.andrick-3cf07/test/logs'
# Dictionary: sub-folder name -> conformance rows left-joined with the property rows.
merged_dfs = process_all_folders(root_path)

## Derivations in Logfiles after Manipulation

In [4]:
def add_key_column(df):
    """
    Add a 'Key' column holding each file name cut off right after its first '.xes'.

    The source column is chosen as follows:
      * if a 'filename' column exists, 'File' is used when present, otherwise
        'filename' itself (so a frame with only the lowercase column works);
      * otherwise the first column with at least one value containing the
        literal text '.xes'.

    The frame is modified in place and also returned. When no suitable column
    exists a message is printed and the frame is returned without a 'Key' column.
    """
    if 'filename' in df.columns:
        source_col = 'File' if 'File' in df.columns else 'filename'
    else:
        # regex=False: the dot must be a real dot, so values such as "axes"
        # or "boxes" do not count as file names.
        source_col = next(
            (
                col for col in df.columns
                if df[col].astype(str).str.contains('.xes', regex=False).any()
            ),
            None,
        )

    if source_col is None:
        print("Could not find a column containing filenames with .xes")
        return df

    # astype(str) keeps the .str accessor usable for columns with mixed value types.
    df['Key'] = df[source_col].astype(str).str.extract(r'(.*?\.xes)', expand=False)
    return df


In [5]:
def extract_unique_properties(merged_dfs):
    """
    Reduce every merged frame to its property columns, one row per file.

    Args:
        merged_dfs (dict): Dictionary of merged dataframes from process_all_folders()

    Returns:
        dict: name -> frame restricted to the known property columns,
        de-duplicated on 'File' and passed through add_key_column(). A frame
        whose processing raises is reported and left out of the result.
    """
    columns_to_keep = [
        "Number of Events",
        "ATS",
        "Number of Traces",
        "Distinct Events",
        "Distinct Traces",
        "Distinct Start Events",
        "Distinct End Events",
        "Average Trace Length",
        "Max Trace Length",
        "Min Trace Length",
        "Event Density",
        "Absolute Trace Coverage",
        "Relative Trace Coverage",
        "Structure",
        "Level of Detail",
        "Traces with Self-loops",
        "Total Self-loops",
        "Average Self-loop Size",
        "Event Diversity",
        "Event Repeatability",
        "Transition Consistency",
        "Sequential Complexity",
        "Rare Sequence Impact",
        "Event Class Dispersion",
        "Event Co-occurrence Consistency",
        "Trace Variability",
        "File",
    ]
    filename_col = 'File'

    processed_dfs = {}
    for df_name, merged in merged_dfs.items():
        try:
            present = [name for name in columns_to_keep if name in merged.columns]
            unique_rows = merged[present].drop_duplicates(subset=[filename_col])
            unique_rows = add_key_column(unique_rows)
            processed_dfs[df_name] = unique_rows

            # First line reports the row count, second the distinct 'File' values.
            print(f"Processed {df_name}: {len(unique_rows)} unique files")
            print(f"Processed {df_name}: {unique_rows['File'].nunique()} unique files")

            missing_columns = set(columns_to_keep) - set(present)
            if missing_columns:
                print(f"Warning: Missing columns in {df_name}: {missing_columns}")
        except Exception as e:
            # Report and move on to the next frame.
            print(f"Error processing {df_name}: {str(e)}")

    return processed_dfs



In [6]:
# Note: despite the `_df` suffix this is a dict (sub-folder name -> property frame
# de-duplicated on 'File').
props_with_keys_df = extract_unique_properties(merged_dfs)

Processed combined_ind: 519 unique files
Processed combined_ind: 519 unique files
Processed standard: 548 unique files
Processed standard: 548 unique files
Processed case_size_filtered: 561 unique files
Processed case_size_filtered: 561 unique files
Processed inductive_logs: 561 unique files
Processed inductive_logs: 561 unique files
Processed variants_coverage_filtered: 563 unique files
Processed variants_coverage_filtered: 563 unique files
Processed variants_top_k_filtered: 543 unique files
Processed variants_top_k_filtered: 543 unique files
Processed combined_filters: 542 unique files
Processed combined_filters: 542 unique files


In [7]:
# Collects one tagged copy of every per-folder property frame.
dfs_list = []

for preproc_type, df in props_with_keys_df.items():
    # Work on a copy so the frame stored in props_with_keys_df does not gain the extra column.
    df_with_type = df.copy()
    # Tag every row with the dictionary key (sub-folder name) it came from.
    df_with_type['preprocessing_type'] = preproc_type
    dfs_list.append(df_with_type)

# Stack all tagged frames into one frame with a fresh 0..n-1 index.
all_props_df = pd.concat(dfs_list, ignore_index=True)

In [None]:
def prepare_for_powerbi(df):
    """
    Return a copy of ``df`` with sanitized column names and numeric value columns.

    Column names get spaces replaced by underscores and every character
    outside ``[A-Za-z0-9_]`` removed (e.g. "Total Self-loops" becomes
    "Total_Selfloops"). All columns except 'File', 'Key' and
    'preprocessing_type' are converted with ``pd.to_numeric``; values that
    cannot be parsed become NaN. The index is reset to 0..n-1.

    The input frame is left untouched.
    """
    # regex must be requested explicitly: newer pandas treats the pattern as a
    # literal string by default, which would leave every character in place.
    clean_names = (
        df.columns
        .str.replace(' ', '_', regex=False)
        .str.replace(r'[^A-Za-z0-9_]', '', regex=True)
    )

    # reset_index returns a new frame, so renaming it does not touch the caller's frame.
    result = df.reset_index(drop=True)
    result.columns = clean_names

    non_numeric_columns = ['File', 'Key', 'preprocessing_type']
    numeric_columns = [col for col in result.columns if col not in non_numeric_columns]

    for col in numeric_columns:
        result[col] = pd.to_numeric(result[col], errors='coerce')

    return result

# Cleaned copy of the combined property table, as returned by prepare_for_powerbi.
all_props_df_clean = prepare_for_powerbi(all_props_df)

# Export directory; created on demand, an existing directory is not an error.
output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3'
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, 'process_mining_analysis.csv')

# Written with ';' as field separator and ',' as decimal mark, without the index column.
all_props_df_clean.to_csv(output_file, 
                         index=False, 
                         sep=';',
                         decimal=',')
print(f"Successfully exported to CSV: {output_file}")



## Correlation / Regression Analysis (multiple)

In [6]:
def train_and_evaluate_models_old(X_train, X_test, y_train, y_test, target_name):
    """
    Run a very short FLAML random-forest search and score it on the test split.

    All four inputs are converted to float32 NumPy arrays before fitting.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Target values for fitting and evaluation.
        target_name: Accepted for call-site symmetry; not used in the body.

    Returns:
        dict: With ``status == 'success'``: ``data_info`` (shapes and value
        ranges), ``model_performance`` (``r2``, ``rmse``) and ``model_info``.
        With ``status == 'error'``: ``error_info`` holding the exception type
        and message plus ``data_validation`` flags. Each flag is a bool, or
        ``None`` when the corresponding input could not be read as numbers.
    """
    # Configure the AutoML settings
    settings = {
        "time_budget": 0.3,
        "task": 'regression',
        "metric": 'mse',
        "estimator_list": ['rf'],
        "eval_method": "holdout",
        "verbose": 0,  # Reduced verbosity since we're saving to file
        "n_jobs": -1,
        "seed": 42
    }

    def _has_any(check, values):
        """Tell whether `check` flags any element; None if `values` is not numeric."""
        try:
            return bool(check(np.asarray(values, dtype=np.float64)).any())
        except (TypeError, ValueError):
            return None

    try:
        X_train = np.array(X_train).astype(np.float32)
        X_test = np.array(X_test).astype(np.float32)
        y_train = np.array(y_train).astype(np.float32)
        y_test = np.array(y_test).astype(np.float32)

        data_info = {
            "shapes": {
                "X_train": X_train.shape,
                "y_train": y_train.shape,
                "X_test": X_test.shape,
                "y_test": y_test.shape
            },
            "statistics": {
                "X_train_range": [float(X_train.min()), float(X_train.max())],
                "y_train_range": [float(y_train.min()), float(y_train.max())]
            }
        }

        # Built only after the inputs were converted successfully.
        automl = AutoML()
        automl.fit(X_train, y_train, **settings)

        y_pred = automl.predict(X_test)

        r2 = float(r2_score(y_test, y_pred))
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

        return {
            'data_info': data_info,
            'model_performance': {
                'r2': r2,
                'rmse': rmse
            },
            'model_info': {
                'best_config': automl.best_config,
                'best_estimator': str(automl.best_estimator)
            },
            'status': 'success'
        }

    except Exception as e:
        # The inputs may still be in their raw form here (the conversion itself
        # can be what failed), so the diagnostics must not raise on their own.
        return {
            'status': 'error',
            'error_info': {
                'type': type(e).__name__,
                'message': str(e),
                'data_validation': {
                    'X_train_contains_nan': _has_any(np.isnan, X_train),
                    'y_train_contains_nan': _has_any(np.isnan, y_train),
                    'X_train_contains_inf': _has_any(np.isinf, X_train),
                    'y_train_contains_inf': _has_any(np.isinf, y_train)
                }
            }
        }


In [3]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test, target_name, feature_names=None):
    """
    Run a FLAML regression search for one target and score it on the test split.

    NOTE: a later cell defines a function with the same name; whichever cell
    was executed last is the one in effect.

    Inputs are converted to float32 arrays; NaN becomes 0 and +/-inf becomes
    +/-1e15 before any check or fit.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Values of a single target.
        target_name: Accepted for call-site symmetry; not used in the body.
        feature_names: Optional labels for the feature columns. When omitted,
            the column labels of ``X_train`` are used if it has a ``columns``
            attribute, otherwise generic ``feature_<i>`` names.

    Returns:
        dict: ``status`` is ``'success'`` (with ``data_info``,
        ``model_performance``, ``model_info`` and ``feature_importance``,
        the latter sorted by descending score or ``None`` if the fitted
        estimator exposes no importances) or ``'error'`` (with
        ``error_info``; types ``InsufficientData``, ``ConstantTarget``,
        ``FitError`` or the name of the raised exception).
    """
    settings = {
        "time_budget": 120,
        "task": 'regression',
        "metric": 'mse',
        "estimator_list": [
            'xgboost',       # XGBoost
            'rf',            # Random Forest
            'extra_tree',    # Extra Trees
            #'kneighbor',     # K-Nearest Neighbors
            'histgb',        # Histogram-based Gradient Boosting
            'enet',          # Elastic Net
            #'lassolars',     # Lasso regression with LARS
        ],
        "eval_method": "holdout",
        "verbose": 0,
        "n_jobs": -1,
        "seed": 42,
        "min_sample_size": 10,
        "ensemble": False,
    }

    # Column labels have to be read before the float32 conversion below turns
    # a DataFrame into a plain array.
    if feature_names is None:
        feature_names = getattr(X_train, 'columns', None)

    def _has_any(check, values):
        """Tell whether `check` flags any element; None if `values` is not numeric."""
        try:
            return bool(check(np.asarray(values, dtype=np.float64)).any())
        except (TypeError, ValueError):
            return None

    try:
        X_train = np.array(X_train).astype(np.float32)
        X_test = np.array(X_test).astype(np.float32)
        y_train = np.array(y_train).astype(np.float32)
        y_test = np.array(y_test).astype(np.float32)

        X_train = np.nan_to_num(X_train, nan=0.0, posinf=1e15, neginf=-1e15)
        X_test = np.nan_to_num(X_test, nan=0.0, posinf=1e15, neginf=-1e15)
        y_train = np.nan_to_num(y_train, nan=0.0, posinf=1e15, neginf=-1e15)
        y_test = np.nan_to_num(y_test, nan=0.0, posinf=1e15, neginf=-1e15)

        # Check if we have enough valid samples
        if len(X_train) < 10 or len(y_train) < 10:
            return {
                'status': 'error',
                'error_info': {
                    'type': 'InsufficientData',
                    'message': f'Not enough samples: X_train={len(X_train)}, y_train={len(y_train)}'
                }
            }

        # Check for constant target values
        if np.all(y_train == y_train[0]):
            return {
                'status': 'error',
                'error_info': {
                    'type': 'ConstantTarget',
                    'message': 'Target values are constant'
                }
            }

        # Data information for logging
        data_info = {
            "shapes": {
                "X_train": X_train.shape,
                "y_train": y_train.shape,
                "X_test": X_test.shape,
                "y_test": y_test.shape
            },
            "statistics": {
                "X_train_range": [float(X_train.min()), float(X_train.max())],
                "y_train_range": [float(y_train.min()), float(y_train.max())]
            }
        }

        # Built only once the data passed the checks above.
        automl = AutoML()

        # Train model with error catching
        try:
            automl.fit(X_train, y_train, **settings)
        except Exception as e:
            return {
                'status': 'error',
                'error_info': {
                    'type': 'FitError',
                    'message': str(e)
                }
            }

        y_pred = automl.predict(X_test)

        r2 = float(r2_score(y_test, y_pred))
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

        feature_importance = None
        if hasattr(automl.model.estimator, 'feature_importances_'):
            if feature_names is not None:
                names = list(feature_names)
            else:
                names = [f'feature_{i}' for i in range(X_train.shape[1])]

            importance_scores = automl.model.estimator.feature_importances_
            feature_importance = {
                str(name): float(score)
                for name, score in zip(names, importance_scores)
            }

            feature_importance = dict(sorted(
                feature_importance.items(),
                key=lambda x: x[1],
                reverse=True
            ))

        return {
            'data_info': data_info,
            'model_performance': {
                'r2': r2,
                'rmse': rmse
            },
            'model_info': {
                'best_config': automl.best_config,
                'best_estimator': str(automl.best_estimator)
            },
            'feature_importance': feature_importance,
            'status': 'success'
        }

    except Exception as e:
        # The inputs may still be in their raw form here (the conversion itself
        # can be what failed), so the diagnostics must not raise on their own.
        return {
            'status': 'error',
            'error_info': {
                'type': type(e).__name__,
                'message': str(e),
                'data_validation': {
                    'X_train_contains_nan': _has_any(np.isnan, X_train),
                    'y_train_contains_nan': _has_any(np.isnan, y_train),
                    'X_train_contains_inf': _has_any(np.isinf, X_train),
                    'y_train_contains_inf': _has_any(np.isinf, y_train)
                }
            }
        }

In [3]:
# Training function with feature names; the names must be passed to the function as a list.
def train_and_evaluate_models(X_train, X_test, y_train, y_test, target_name, feature_names=None):
    """
    Run a FLAML regression search for one target and score it on the test split.

    Inputs are converted to float32 arrays; NaN becomes 0 and +/-inf becomes
    +/-1e15 before any check or fit.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Values of a single target.
        target_name: Accepted for call-site symmetry; not used in the body.
        feature_names: Optional labels for the feature columns, in the column
            order of ``X_train``. Generic ``feature_<i>`` names are used when
            omitted. Names and scores are paired with ``zip``, so surplus
            entries on either side are dropped.

    Returns:
        dict: ``status`` is ``'success'`` (with ``model_performance``,
        ``model_info`` and ``feature_importance``, the latter sorted by
        descending score or ``None`` if the fitted estimator exposes no
        importances) or ``'error'`` (with ``error_info``; types
        ``InsufficientData``, ``ConstantTarget``, ``FitError`` or the name of
        the raised exception).
    """
    settings = {
        "time_budget": 120,
        "task": 'regression',
        "metric": 'mse',
        "estimator_list": [
            'xgboost',       # XGBoost
            'rf',            # Random Forest
            'extra_tree',    # Extra Trees
            #'kneighbor',     # K-Nearest Neighbors
            'histgb',        # Histogram-based Gradient Boosting
            'enet',          # Elastic Net
            #'lassolars',     # Lasso regression with LARS
        ],
        "eval_method": "holdout",
        "verbose": 0,
        "n_jobs": -1,
        "seed": 42,
        "min_sample_size": 10,
        "ensemble": False,
    }

    try:
        X_train = np.array(X_train).astype(np.float32)
        X_test = np.array(X_test).astype(np.float32)
        y_train = np.array(y_train).astype(np.float32)
        y_test = np.array(y_test).astype(np.float32)

        X_train = np.nan_to_num(X_train, nan=0.0, posinf=1e15, neginf=-1e15)
        X_test = np.nan_to_num(X_test, nan=0.0, posinf=1e15, neginf=-1e15)
        y_train = np.nan_to_num(y_train, nan=0.0, posinf=1e15, neginf=-1e15)
        y_test = np.nan_to_num(y_test, nan=0.0, posinf=1e15, neginf=-1e15)

        # Too few rows to train on.
        if len(X_train) < 10 or len(y_train) < 10:
            return {
                'status': 'error',
                'error_info': {
                    'type': 'InsufficientData',
                    'message': f'Not enough samples: X_train={len(X_train)}, y_train={len(y_train)}'
                }
            }

        # A target without any variation cannot be regressed on.
        if np.all(y_train == y_train[0]):
            return {
                'status': 'error',
                'error_info': {
                    'type': 'ConstantTarget',
                    'message': 'Target values are constant'
                }
            }

        # Initialize FLAML AutoML (only once the data passed the checks above)
        automl = AutoML()

        try:
            automl.fit(X_train, y_train, **settings)
        except Exception as e:
            return {
                'status': 'error',
                'error_info': {
                    'type': 'FitError',
                    'message': str(e)
                }
            }

        y_pred = automl.predict(X_test)

        r2 = float(r2_score(y_test, y_pred))
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

        feature_importance = None
        if hasattr(automl.model.estimator, 'feature_importances_'):
            names = feature_names if feature_names is not None else [f'feature_{i}' for i in range(X_train.shape[1])]

            importance_scores = automl.model.estimator.feature_importances_
            feature_importance = {
                str(name): float(score)
                for name, score in zip(names, importance_scores)
            }

            # Most important feature first.
            feature_importance = dict(sorted(
                feature_importance.items(),
                key=lambda x: x[1],
                reverse=True
            ))

        return {
            'model_performance': {
                'r2': r2,
                'rmse': rmse
            },
            'model_info': {
                'best_config': automl.best_config,
                'best_estimator': str(automl.best_estimator)
            },
            'feature_importance': feature_importance,
            'status': 'success'
        }

    except Exception as e:
        return {
            'status': 'error',
            'error_info': {
                'type': type(e).__name__,
                'message': str(e)
            }
        }

### different scalers

In [None]:
# Feature columns used for every preprocessing variant and miner, in the
# column order of the raw frames.
feature_columns = [
    "Number of Events",
    "ATS",
    "Number of Traces",
    "Distinct Events",
    "Distinct Traces",
    "Distinct Start Events",
    "Distinct End Events",
    "Average Trace Length",
    "Max Trace Length",
    "Min Trace Length",
    "Event Density",
    "Absolute Trace Coverage",
    "Relative Trace Coverage",
    "Structure",
    "Level of Detail",
    "Traces with Self-loops",
    "Total Self-loops",
    "Average Self-loop Size",
    "Event Diversity",
    "Event Repeatability",
    "Transition Consistency",
    "Sequential Complexity",
    "Rare Sequence Impact",
    "Event Class Dispersion",
    "Event Co-occurrence Consistency",
    "Trace Variability"
]

# Group features by their characteristics
count_features = [
    "Number of Events",
    "Number of Traces",
    "Distinct Events",
    "Distinct Traces",
    "Distinct Start Events",
    "Distinct End Events",
    "Total Self-loops",
]

ratio_features = [
    "Event Density",
    "Relative Trace Coverage",
    "Structure",
    "Event Diversity",
    "Event Repeatability",
    "Transition Consistency",
    "Event Co-occurrence Consistency",
]

length_features = [
    "ATS",
    "Average Trace Length",
    "Max Trace Length",
    "Min Trace Length",
    "Level of Detail",
    "Average Self-loop Size",
]

complexity_features = [
    "Sequential Complexity",
    "Rare Sequence Impact",
    "Event Class Dispersion",
    "Trace Variability",
]

# ColumnTransformer emits its output transformer by transformer and appends
# the passthrough remainder at the right, so the scaled matrix is NOT in
# `feature_columns` order. This list mirrors the real output order and must be
# used whenever the scaled columns are labelled.
grouped_features = count_features + ratio_features + length_features + complexity_features
scaled_feature_names = grouped_features + [
    col for col in feature_columns if col not in grouped_features
]

base_output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3'
corr_output_dir = os.path.join(base_output_dir, 'CorrMatrix')
reg_output_dir = os.path.join(base_output_dir, 'RegressionResults')

for dir_path in [corr_output_dir, reg_output_dir]:
    os.makedirs(dir_path, exist_ok=True)

for df_type, df in merged_dfs.items():
    unique_miners = df['miner'].unique()
    print(df_type)

    # One sub-frame per miner.
    miner_dataframes = {}
    for miner in unique_miners:
        miner_dataframes[miner] = df[df['miner'] == miner]

    preprocessed_data = {}

    for miner, miner_df in miner_dataframes.items():
        features = miner_df[feature_columns]
        # Every numeric column that is not a feature is treated as a target.
        target_columns = miner_df.select_dtypes(include=['number']).columns.difference(features.columns)
        targets = miner_df[target_columns]
        preprocessor = ColumnTransformer(
            transformers=[
                ('count_scaler', RobustScaler(), count_features),
                ('ratio_scaler', MinMaxScaler(), ratio_features),
                ('length_scaler', RobustScaler(), length_features),
                ('complexity_scaler', StandardScaler(), complexity_features)
            ],
            remainder='passthrough'
        )

        preprocessed_data[miner] = {
            "features": features,
            "targets": targets,
            "preprocessor": preprocessor
        }

    train_test_data = {}
    for miner, data in preprocessed_data.items():
        features = data['features']
        targets = data['targets']
        features = features.apply(pd.to_numeric, errors='coerce')
        targets = targets.apply(pd.to_numeric, errors='coerce')

        features = features.replace([np.inf, -np.inf], np.nan).fillna(0)
        targets = targets.replace([np.inf, -np.inf], np.nan).fillna(0)
        # NOTE(review): the purpose of this tiny shift is not evident from this cell.
        epsilon = 1e-10
        features = features + epsilon
        features = features.clip(-1e15, 1e15)
        targets = targets.clip(-1e15, 1e15)
        try:
            # NOTE(review): the preprocessor is fitted on all rows before the
            # train/test split, so test rows influence the scaling statistics.
            # Left as is so results stay comparable with earlier runs.
            scaled_features = data['preprocessor'].fit_transform(features)

            scaled_features = np.clip(scaled_features, -1e15, 1e15)

            X_train, X_test, y_train, y_test = train_test_split(
                scaled_features,
                targets.values,
                test_size=0.15,
                random_state=42
            )

            train_test_data[miner] = {
                "preprocessor": data['preprocessor'],
                "X_train": np.nan_to_num(X_train, nan=0.0, posinf=1e15, neginf=-1e15),
                "X_test": np.nan_to_num(X_test, nan=0.0, posinf=1e15, neginf=-1e15),
                "y_train": np.nan_to_num(y_train, nan=0.0, posinf=1e15, neginf=-1e15),
                "y_test": np.nan_to_num(y_test, nan=0.0, posinf=1e15, neginf=-1e15),
                # Labels in the column order of the scaled matrix.
                "feature_names": scaled_feature_names
            }

        except Exception as e:
            print(f"Error processing {miner}: {str(e)}")
            continue

    for miner, data in train_test_data.items():
        # Label the scaled columns correctly, then put them back into
        # `feature_columns` order so the matrix layout matches the raw frames.
        features_df = pd.DataFrame(
            data["X_train"],
            columns=data["feature_names"]
        )[feature_columns]
        targets_df = pd.DataFrame(
            data["y_train"],
            columns=preprocessed_data[miner]["targets"].columns
        )

        corr_matrix = features_df.join(targets_df).corr().loc[features_df.columns, targets_df.columns]
        corr_matrix = corr_matrix.fillna(0)

        base_filename = f"{miner}_{df_type}_mixed_scaling"
        corr_matrix.to_csv(os.path.join(corr_output_dir, f"{base_filename}_correlation_matrix.csv"))

        plt.figure(figsize=(15, 10))  # Increased figure size

        sns.heatmap(
            corr_matrix,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            cbar_kws={'label': 'Correlation'},
            xticklabels=targets_df.columns,
            yticklabels=features_df.columns,
            annot_kws={'size': 8},
            center=0,
            vmin=-1,
            vmax=1
        )

        plt.title(f"Feature-Target Correlation Matrix for {miner}\n({df_type}, mixed scaling)",
                pad=20,
                size=14,
                weight='bold')
        plt.ylabel("Features", size=12, weight='bold')
        plt.xlabel("Targets", size=12, weight='bold')

        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)

        plt.tight_layout()

        plt.savefig(
            os.path.join(corr_output_dir, f"{base_filename}_correlation_matrix.png"),
            dpi=300,
            bbox_inches='tight'
        )
        plt.close()

    all_results = {
        "df_type": df_type,
        "miners": {}
    }
    # For each miner
    for miner, data in tqdm(train_test_data.items(), desc="Processing miners"):
        miner_results = {}
        target_names = preprocessed_data[miner]['targets'].columns

        # For each target metric
        for i, target_name in enumerate(target_names):
            try:
                y_train = data['y_train'][:, i]
                y_test = data['y_test'][:, i]

                # Passing the labels makes the reported feature importances
                # carry real feature names instead of positional ones.
                results = train_and_evaluate_models(
                    data['X_train'],
                    data['X_test'],
                    y_train,
                    y_test,
                    target_name,
                    feature_names=data['feature_names']
                )

                miner_results[target_name] = results

            except Exception as e:
                miner_results[target_name] = {
                    'status': 'error',
                    'error_message': str(e)
                }

        all_results["miners"][miner] = miner_results
        miner_filename = os.path.join(
            reg_output_dir,
            f'regression_results_{miner.replace(" ", "_")}_{df_type}_mixed_scaling.json'
        )
        with open(miner_filename, 'w') as f:
            json.dump({
                "df_type": df_type,
                "scaling_info": {
                    "count_features": count_features,
                    "ratio_features": ratio_features,
                    "length_features": length_features,
                    "complexity_features": complexity_features
                },
                "miner_results": miner_results
            }, f, indent=4)

    # Save combined results
    combined_filename = os.path.join(reg_output_dir, f'regression_results_all_{df_type}.json')
    with open(combined_filename, 'w') as f:
        json.dump(all_results, f, indent=4)


combined_ind


Processing miners:   0%|          | 0/5 [00:00<?, ?it/s]

### standard scaler

In [None]:
# Feature columns used for every preprocessing variant and miner. StandardScaler
# keeps the column order, so these names also label the scaled matrix.
feature_columns = [
    "Number of Events",
    "ATS",
    "Number of Traces",
    "Distinct Events",
    "Distinct Traces",
    "Distinct Start Events",
    "Distinct End Events",
    "Average Trace Length",
    "Max Trace Length",
    "Min Trace Length",
    "Event Density",
    "Absolute Trace Coverage",
    "Relative Trace Coverage",
    "Structure",
    "Level of Detail",
    "Traces with Self-loops",
    "Total Self-loops",
    "Average Self-loop Size",
    "Event Diversity",
    "Event Repeatability",
    "Transition Consistency",
    "Sequential Complexity",
    "Rare Sequence Impact",
    "Event Class Dispersion",
    "Event Co-occurrence Consistency",
    "Trace Variability"
]

for df_type, df in merged_dfs.items():
    unique_miners = df['miner'].unique()
    print(df_type)

    # One sub-frame per miner.
    miner_dataframes = {}
    for miner in unique_miners:
        miner_dataframes[miner] = df[df['miner'] == miner]

    preprocessed_data = {}

    for miner, miner_df in miner_dataframes.items():
        features = miner_df[feature_columns]

        # Every numeric column that is not a feature is treated as a target.
        target_columns = miner_df.select_dtypes(include=['number']).columns.difference(features.columns)
        targets = miner_df[target_columns]

        preprocessed_data[miner] = {
            "features": features,
            "targets": targets
        }

    train_test_data = {}

    # Process each miner's data
    for miner, data in preprocessed_data.items():
        # Extract features and targets
        features = data['features']
        targets = data['targets']

        # Ensure no missing values in features or targets
        features = features.fillna(0)
        targets = targets.fillna(0)

        # NOTE(review): the scaler is fitted on all rows before the train/test
        # split, so test rows influence the scaling statistics. Left as is so
        # results stay comparable with earlier runs.
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features)

        X_train, X_test, y_train, y_test = train_test_split(
            scaled_features,
            targets.values,
            test_size=0.15,
            random_state=42
        )

        train_test_data[miner] = {
            "scaler": scaler,
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }

    output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3/CorrMatrix'
    os.makedirs(output_dir, exist_ok=True)

    for miner, splits in train_test_data.items():
        features_df = pd.DataFrame(splits["X_train"], columns=preprocessed_data[miner]["features"].columns)
        targets_df = pd.DataFrame(splits["y_train"], columns=preprocessed_data[miner]["targets"].columns)

        # Keep only the feature-vs-target part of the full correlation matrix.
        corr_matrix = features_df.join(targets_df).corr().loc[features_df.columns, targets_df.columns]

        corr_matrix = corr_matrix.fillna(0)

        csv_file_name = os.path.join(output_dir, f"{miner}_{df_type}_correlation_matrix.csv")
        corr_matrix.to_csv(csv_file_name)

        png_file_name = os.path.join(output_dir, f"{miner}_{df_type}_correlation_matrix.png")
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            corr_matrix,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            cbar_kws={'label': 'Correlation'},
            xticklabels=targets_df.columns,
            yticklabels=features_df.columns
        )
        plt.title(f"Feature-Target Correlation Matrix for {miner} ({df_type})")
        plt.ylabel("Features")
        plt.xlabel("Targets")
        plt.savefig(png_file_name)
        plt.close()

    output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3/RegressionResults_1s_test'
    os.makedirs(output_dir, exist_ok=True)

    all_results = {
        "df_type": df_type,
        "miners": {}
    }

    # For each miner
    for miner, data in tqdm(train_test_data.items(), desc="Processing miners"):
        miner_results = {}
        target_names = preprocessed_data[miner]['targets'].columns

        # For each target metric
        for i, target_name in enumerate(target_names):
            try:
                # Extract the specific target column
                y_train = data['y_train'][:, i]
                y_test = data['y_test'][:, i]

                # Passing the labels makes the reported feature importances
                # carry real feature names instead of positional ones.
                results = train_and_evaluate_models(
                    data['X_train'],
                    data['X_test'],
                    y_train,
                    y_test,
                    target_name,
                    feature_names=feature_columns
                )

                miner_results[target_name] = results

            except Exception as e:
                miner_results[target_name] = {
                    'status': 'error',
                    'error_message': str(e)
                }

        all_results["miners"][miner] = miner_results

        miner_filename = os.path.join(output_dir, f'regression_results_{miner.replace(" ", "_")}_{df_type}.json')
        with open(miner_filename, 'w') as f:
            json.dump({
                "df_type": df_type,
                "miner_results": miner_results
            }, f, indent=4)

    combined_filename = os.path.join(output_dir, f'regression_results_all_{df_type}.json')
    with open(combined_filename, 'w') as f:
        json.dump(all_results, f, indent=4)


## Correlation / Regression Analysis (large DataFrame)

In [9]:
# Stack the per-preprocessing frames into a single long frame. Every row is
# tagged with the `merged_dfs` key it came from in `preprocessing_type`.
all_merged_df = pd.concat(
    (
        frame.assign(preprocessing_type=variant)
        for variant, frame in merged_dfs.items()
    ),
    ignore_index=True,
)



In [None]:
# Per-miner analysis over the concatenation of all preprocessing variants.
df = all_merged_df
df_name = df['file'].iloc[0]  # first file name; not referenced again in this cell
df_type = 'all_logtypes'  # label embedded in plot titles and output file names
unique_miners = df['miner'].unique()
print(df_type)

# One sub-frame per miner, keyed by the miner's name.
miner_dataframes = {}
for miner in unique_miners:
    rows_of_miner = df['miner'] == miner
    miner_dataframes[miner] = df.loc[rows_of_miner]

# All log-property columns used as model inputs (order matters: it is the
# column order of the raw feature frame).
feature_columns = [
    "Number of Events", "ATS", "Number of Traces",
    "Distinct Events", "Distinct Traces",
    "Distinct Start Events", "Distinct End Events",
    "Average Trace Length", "Max Trace Length", "Min Trace Length",
    "Event Density",
    "Absolute Trace Coverage", "Relative Trace Coverage",
    "Structure", "Level of Detail",
    "Traces with Self-loops", "Total Self-loops", "Average Self-loop Size",
    "Event Diversity", "Event Repeatability", "Transition Consistency",
    "Sequential Complexity", "Rare Sequence Impact", "Event Class Dispersion",
    "Event Co-occurrence Consistency", "Trace Variability",
]

# The four groups below decide which scaler each column receives further down.
# NOTE(review): "Absolute Trace Coverage" and "Traces with Self-loops" belong
# to no group, so they go through the transformer's passthrough remainder
# unscaled -- confirm this is intended.

# Count-like columns.
count_features = [
    "Number of Events", "Number of Traces",
    "Distinct Events", "Distinct Traces",
    "Distinct Start Events", "Distinct End Events",
    "Total Self-loops",
]

# Ratio-like columns.
ratio_features = [
    "Event Density", "Relative Trace Coverage", "Structure",
    "Event Diversity", "Event Repeatability", "Transition Consistency",
    "Event Co-occurrence Consistency",
]

# Length/size-like columns.
length_features = [
    "ATS", "Average Trace Length", "Max Trace Length", "Min Trace Length",
    "Level of Detail", "Average Self-loop Size",
]

# Complexity-like columns.
complexity_features = [
    "Sequential Complexity", "Rare Sequence Impact",
    "Event Class Dispersion", "Trace Variability",
]

# (step name, scaler class, columns) for each feature group. A fresh scaler
# instance is created per miner so that no fitted state is shared.
scaler_plan = [
    ('count_scaler', RobustScaler, count_features),
    ('ratio_scaler', MinMaxScaler, ratio_features),
    ('length_scaler', RobustScaler, length_features),
    ('complexity_scaler', StandardScaler, complexity_features),
]

# miner -> raw features, raw targets and a not-yet-fitted preprocessor.
preprocessed_data = {}

for miner, miner_df in miner_dataframes.items():
    features = miner_df[feature_columns]

    # Targets are every numeric column that is not a feature.
    numeric_columns = miner_df.select_dtypes(include=['number']).columns
    target_columns = numeric_columns.difference(features.columns)
    targets = miner_df[target_columns]

    # Columns outside the four groups are passed through unscaled.
    preprocessor = ColumnTransformer(
        transformers=[
            (step_name, scaler_cls(), columns)
            for step_name, scaler_cls, columns in scaler_plan
        ],
        remainder='passthrough',
    )

    preprocessed_data[miner] = dict(
        features=features,
        targets=targets,
        preprocessor=preprocessor,
    )

# miner -> fitted preprocessor, scaled train/test matrices and the names of the
# scaled columns.
train_test_data = {}

for miner, data in preprocessed_data.items():
    preprocessor = data['preprocessor']

    # Force everything to numeric; unparsable cells become NaN.
    features = data['features'].apply(pd.to_numeric, errors='coerce')
    targets = data['targets'].apply(pd.to_numeric, errors='coerce')

    # Replace NaN and +/-inf by 0 so the scalers and models see finite values.
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0)
    targets = targets.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Tiny offset on every feature value.
    epsilon = 1e-10
    features = features + epsilon

    # Bound extreme magnitudes.
    features = features.clip(-1e15, 1e15)
    targets = targets.clip(-1e15, 1e15)

    try:
        # Split BEFORE scaling: the scalers must only learn their statistics
        # from the training rows, otherwise test rows leak into the scaling
        # parameters. The row partition depends only on the number of rows
        # and random_state, so it is the same as when splitting scaled data.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            features,
            targets.values,
            test_size=0.15,
            random_state=42
        )

        X_train = np.clip(preprocessor.fit_transform(X_train_raw), -1e15, 1e15)
        X_test = np.clip(preprocessor.transform(X_test_raw), -1e15, 1e15)

        # ColumnTransformer emits columns in transformer order and appends the
        # passthrough remainder on the right, so the output order differs from
        # `features.columns`. Record the names in the order of the matrix.
        grouped_columns = [
            column
            for _, _, columns in preprocessor.transformers
            for column in columns
        ]
        passthrough_columns = [
            column for column in features.columns if column not in grouped_columns
        ]
        scaled_feature_names = pd.Index(grouped_columns + passthrough_columns)

        train_test_data[miner] = {
            "preprocessor": preprocessor,
            "X_train": np.nan_to_num(X_train, nan=0.0, posinf=1e15, neginf=-1e15),
            "X_test": np.nan_to_num(X_test, nan=0.0, posinf=1e15, neginf=-1e15),
            "y_train": np.nan_to_num(y_train, nan=0.0, posinf=1e15, neginf=-1e15),
            "y_test": np.nan_to_num(y_test, nan=0.0, posinf=1e15, neginf=-1e15),
            "feature_names": scaled_feature_names
        }

    except Exception as e:
        # Best effort: a miner that cannot be prepared is reported and skipped.
        print(f"Error processing {miner}: {str(e)}")
        continue

# Output folders: one for correlation artefacts, one for regression results.
base_output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3'
corr_output_dir = os.path.join(base_output_dir, 'CorrMatrix_Miner')
reg_output_dir = os.path.join(base_output_dir, 'RegressionResults_Miner')

for dir_path in (corr_output_dir, reg_output_dir):
    os.makedirs(dir_path, exist_ok=True)

# Per miner: feature-vs-target correlations on the unscaled values, written as
# a CSV table and as an annotated heatmap.
for miner, miner_df in miner_dataframes.items():
    features_df = miner_df[feature_columns]

    # Targets are every numeric column that is not a feature.
    numeric_columns = miner_df.select_dtypes(include=['number']).columns
    target_columns = numeric_columns.difference(features_df.columns)
    targets_df = miner_df[target_columns]

    # Correlate everything, keep only the feature rows x target columns, and
    # show undefined correlations (NaN) as 0.
    full_corr = features_df.join(targets_df).corr()
    corr_matrix = full_corr.loc[features_df.columns, targets_df.columns]
    corr_matrix = corr_matrix.fillna(0)

    base_filename = f"{miner}_{df_type}_original_values"
    csv_path = os.path.join(corr_output_dir, f"{base_filename}_correlation_matrix.csv")
    png_path = os.path.join(corr_output_dir, f"{base_filename}_correlation_matrix.png")
    corr_matrix.to_csv(csv_path)

    plt.figure(figsize=(15, 10))

    # Fixed colour range [-1, 1] centred on 0 keeps the plots comparable
    # across miners.
    heatmap_style = dict(
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        cbar_kws={'label': 'Correlation'},
        annot_kws={'size': 8},
        center=0,
        vmin=-1,
        vmax=1,
    )
    sns.heatmap(
        corr_matrix,
        xticklabels=targets_df.columns,
        yticklabels=features_df.columns,
        **heatmap_style,
    )

    plot_title = f"Feature-Target Correlation Matrix for {miner}\n({df_type}, original values)"
    plt.title(plot_title, pad=20, size=14, weight='bold')
    plt.ylabel("Features", size=12, weight='bold')
    plt.xlabel("Targets", size=12, weight='bold')

    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()

    plt.savefig(png_path, dpi=300, bbox_inches='tight')
    # Close the figure so figures do not pile up across miners.
    plt.close()

# Collects the results of every miner for the combined output file.
all_results = dict(df_type=df_type, miners={})

# Which columns went to which scaler group; stored with each miner's results.
scaling_info = {
    "count_features": count_features,
    "ratio_features": ratio_features,
    "length_features": length_features,
    "complexity_features": complexity_features,
}

for miner, data in tqdm(train_test_data.items(), desc="Processing miners"):
    miner_results = {}
    target_names = preprocessed_data[miner]['targets'].columns

    # One model run per target column; column i of the y-matrices belongs to
    # target_names[i].
    for i, target_name in enumerate(target_names):
        try:
            y_train = data['y_train'][:, i]
            y_test = data['y_test'][:, i]

            results = train_and_evaluate_models(
                data['X_train'], data['X_test'], y_train, y_test, target_name
            )
            miner_results[target_name] = results

        except Exception as e:
            # A failing target is recorded as an error entry; the remaining
            # targets are still processed.
            miner_results[target_name] = dict(status='error', error_message=str(e))

    all_results["miners"][miner] = miner_results

    # Per-miner result file (spaces in the miner name become underscores).
    safe_miner_name = miner.replace(" ", "_")
    miner_filename = os.path.join(
        reg_output_dir,
        f'regression_results_{safe_miner_name}_{df_type}_mixed_scaling.json',
    )
    miner_payload = {
        "df_type": df_type,
        "scaling_info": scaling_info,
        "miner_results": miner_results,
    }
    with open(miner_filename, 'w') as f:
        json.dump(miner_payload, f, indent=4)

# Combined file with the results of all miners.
combined_filename = os.path.join(reg_output_dir, f'regression_results_all_{df_type}.json')
with open(combined_filename, 'w') as f:
    json.dump(all_results, f, indent=4)


all_logtypes


Processing miners: 100%|██████████| 5/5 [00:07<00:00,  1.41s/it]


## Correlation/Regression Analysis (all miners combined)

In [None]:
# Combined analysis: all rows of all_merged_df are used at once, without
# splitting them by miner.
df = all_merged_df
df_name = df['file'].iloc[0]  # NOTE(review): not referenced again in this cell
df_type = 'all_logtypes'  # label embedded in the plot title and output file names


# All log-property columns used as model inputs (order matters: it is the
# column order of the raw feature frame).
feature_columns = [
    "Number of Events", "ATS", "Number of Traces",
    "Distinct Events", "Distinct Traces",
    "Distinct Start Events", "Distinct End Events",
    "Average Trace Length", "Max Trace Length", "Min Trace Length",
    "Event Density",
    "Absolute Trace Coverage", "Relative Trace Coverage",
    "Structure", "Level of Detail",
    "Traces with Self-loops", "Total Self-loops", "Average Self-loop Size",
    "Event Diversity", "Event Repeatability", "Transition Consistency",
    "Sequential Complexity", "Rare Sequence Impact", "Event Class Dispersion",
    "Event Co-occurrence Consistency", "Trace Variability",
]

# The four groups below decide which scaler each column receives further down.
# NOTE(review): "Absolute Trace Coverage" and "Traces with Self-loops" belong
# to no group, so they go through the transformer's passthrough remainder
# unscaled -- confirm this is intended.

# Count-like columns.
count_features = [
    "Number of Events", "Number of Traces",
    "Distinct Events", "Distinct Traces",
    "Distinct Start Events", "Distinct End Events",
    "Total Self-loops",
]

# Ratio-like columns.
ratio_features = [
    "Event Density", "Relative Trace Coverage", "Structure",
    "Event Diversity", "Event Repeatability", "Transition Consistency",
    "Event Co-occurrence Consistency",
]

# Length/size-like columns.
length_features = [
    "ATS", "Average Trace Length", "Max Trace Length", "Min Trace Length",
    "Level of Detail", "Average Self-loop Size",
]

# Complexity-like columns.
complexity_features = [
    "Sequential Complexity", "Rare Sequence Impact",
    "Event Class Dispersion", "Trace Variability",
]


# Inputs are the listed feature columns; targets are every other numeric column.
features = df[feature_columns]
numeric_columns = df.select_dtypes(include=['number']).columns
target_columns = numeric_columns.difference(features.columns)
targets = df[target_columns]

# (step name, scaler instance, columns) per feature group. Columns outside
# the four groups are passed through unscaled.
scaling_steps = [
    ('count_scaler', RobustScaler(), count_features),
    ('ratio_scaler', MinMaxScaler(), ratio_features),
    ('length_scaler', RobustScaler(), length_features),
    ('complexity_scaler', StandardScaler(), complexity_features),
]
preprocessor = ColumnTransformer(transformers=scaling_steps, remainder='passthrough')

# Combined pipeline: clean -> split -> scale -> correlation heatmap ->
# one model run per target -> JSON dump. Any failure is reported by the
# handler at the bottom instead of raising.
try:
    # Force everything to numeric (unparsable cells become NaN), then replace
    # NaN and +/-inf by 0.
    features = features.apply(pd.to_numeric, errors='coerce')
    targets = targets.apply(pd.to_numeric, errors='coerce')
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0)
    targets = targets.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Tiny offset on every feature value, then bound extreme magnitudes.
    features = features + 1e-10
    features = features.clip(-1e15, 1e15)
    targets = targets.clip(-1e15, 1e15)

    # Split BEFORE scaling: the scalers must only learn their statistics from
    # the training rows, otherwise test rows leak into the scaling parameters.
    # The row partition depends only on the number of rows and random_state,
    # so it is the same as when splitting already-scaled data.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features,
        targets.values,
        test_size=0.15,
        random_state=42
    )
    X_train = np.clip(preprocessor.fit_transform(X_train_raw), -1e15, 1e15)
    X_test = np.clip(preprocessor.transform(X_test_raw), -1e15, 1e15)

    # Output folders: one for correlation artefacts, one for regression results.
    base_output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3'
    corr_output_dir = os.path.join(base_output_dir, 'CorrMatrix_Combined')
    reg_output_dir = os.path.join(base_output_dir, 'RegressionResults_Combined')

    for dir_path in [corr_output_dir, reg_output_dir]:
        os.makedirs(dir_path, exist_ok=True)

    # Feature-vs-target correlations on the cleaned, unscaled values;
    # undefined correlations (NaN) are shown as 0.
    corr_matrix = features.join(targets).corr().loc[features.columns, targets.columns]
    corr_matrix = corr_matrix.fillna(0)

    base_filename = f"combined_{df_type}_original_values"
    corr_matrix.to_csv(os.path.join(corr_output_dir, f"{base_filename}_correlation_matrix.csv"))

    plt.figure(figsize=(15, 10))
    # Fixed colour range [-1, 1] centred on 0.
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        cbar_kws={'label': 'Correlation'},
        xticklabels=targets.columns,
        yticklabels=features.columns,
        annot_kws={'size': 8},
        center=0,
        vmin=-1,
        vmax=1
    )

    plt.title(f"Feature-Target Correlation Matrix\n({df_type}, original values)", 
            pad=20,
            size=14,
            weight='bold')
    plt.ylabel("Features", size=12, weight='bold')
    plt.xlabel("Targets", size=12, weight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    plt.savefig(
        os.path.join(corr_output_dir, f"{base_filename}_correlation_matrix.png"),
        dpi=300,
        bbox_inches='tight'
    )
    plt.close()

    all_results = {
        "df_type": df_type,
        "results": {}
    }

    # One model run per target column; column i of the y-matrices belongs to
    # targets.columns[i].
    for i, target_name in enumerate(targets.columns):
        try:
            results = train_and_evaluate_models(
                X_train,
                X_test,
                y_train[:, i],
                y_test[:, i],
                target_name
            )
            all_results["results"][target_name] = results
        except Exception as e:
            # A failing target is recorded as an error entry; the remaining
            # targets are still processed.
            all_results["results"][target_name] = {
                'status': 'error',
                'error_message': str(e)
            }

    results_filename = os.path.join(reg_output_dir, f'regression_results_{df_type}_mixed_scaling.json')
    with open(results_filename, 'w') as f:
        # NOTE(review): all_results already carries "df_type" and "results",
        # so the file contains results -> results -> <target>. Kept as is
        # because readers of this file may depend on the layout -- confirm.
        json.dump({
            "df_type": df_type,
            "scaling_info": {
                "count_features": count_features,
                "ratio_features": ratio_features,
                "length_features": length_features,
                "complexity_features": complexity_features
            },
            "results": all_results
        }, f, indent=4)

except Exception as e:
    # Only the message is printed; the traceback is not shown.
    print(f"Error processing data: {str(e)}")











## Regression Analysis (predict best Algo)

In [14]:
# All log-property columns used as model inputs (order matters: it is the
# column order of the raw feature frame).
feature_columns = [
    "Number of Events", "ATS", "Number of Traces",
    "Distinct Events", "Distinct Traces",
    "Distinct Start Events", "Distinct End Events",
    "Average Trace Length", "Max Trace Length", "Min Trace Length",
    "Event Density",
    "Absolute Trace Coverage", "Relative Trace Coverage",
    "Structure", "Level of Detail",
    "Traces with Self-loops", "Total Self-loops", "Average Self-loop Size",
    "Event Diversity", "Event Repeatability", "Transition Consistency",
    "Sequential Complexity", "Rare Sequence Impact", "Event Class Dispersion",
    "Event Co-occurrence Consistency", "Trace Variability",
]

def keep_best_miner_per_file(df, column_name):
    """
    Keep, for every file, the single row whose miner scores highest on one metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'file', 'miner' and `column_name`. It is not
        modified, and its index may contain duplicate labels.
    column_name : str
        Metric to maximise within each file.

    Returns
    -------
    pandas.DataFrame
        One row per file (groupby order), index reset to 0..n-1. The other
        known metric columns are removed; `column_name` and all non-metric
        columns are kept. Files with no non-missing value for `column_name`
        are left out. Ties go to the first row in input order.

    Notes
    -----
    Rows of 'ILP Miner' are ignored when `column_name` is 'log_fitness'.
    """
    # idxmax() returns index *labels*. A fresh 0..n-1 index guarantees that
    # each label selects exactly one row, even if the caller's index has
    # duplicates. reset_index also returns a new frame, so `df` stays untouched.
    df_copy = df.reset_index(drop=True)

    # Metrics for which ILP Miner is excluded from the comparison.
    ilp_exclude_metrics = [
        'log_fitness',
    ]

    if column_name in ilp_exclude_metrics:
        df_copy = df_copy[df_copy['miner'] != 'ILP Miner']

    # Metric columns this function knows about; all but the selected one are
    # removed from the result.
    metric_columns = [
        'log_fitness',
        'precision',
        'generalization',
        'simplicity',
        'metricsAverageWeight',
        'fscore',
    ]

    # A row without a value can never be the maximum. Dropping such rows first
    # avoids a NaN label (or an error, depending on the pandas version) for
    # files whose metric is missing for every miner.
    df_copy = df_copy.dropna(subset=[column_name])

    idx = df_copy.groupby('file')[column_name].idxmax()

    best_miners_df = df_copy.loc[idx]

    # errors='ignore': tolerate frames that lack some of the metric columns.
    columns_to_remove = [col for col in metric_columns if col != column_name]
    best_miners_df = best_miners_df.drop(columns=columns_to_remove, errors='ignore')

    best_miners_df = best_miners_df.reset_index(drop=True)

    return best_miners_df

In [None]:
# TODO: remove KNN!

# Quality metrics for which the best miner per file is predicted.
# Currently disabled: perc_fit_traces, average_trace_fitness,
# percentage_of_fitting_traces.
metrics = [
    'log_fitness', 'precision', 'generalization',
    'simplicity', 'metricsAverageWeight', 'fscore',
]

# preprocessing type -> metric -> model results, label mapping and class counts.
all_preprocessing_results = {}

# The scaler groups depend neither on the preprocessing type nor on the
# metric, so they are defined once instead of on every loop iteration.
count_features = [
    "Number of Events",
    "Number of Traces", 
    "Distinct Events",
    "Distinct Traces",
    "Distinct Start Events",
    "Distinct End Events",
    "Total Self-loops",
]

ratio_features = [
    "Event Density",
    "Relative Trace Coverage",
    "Structure",
    "Event Diversity",
    "Event Repeatability",
    "Transition Consistency",
    "Event Co-occurrence Consistency",
]

length_features = [
    "ATS",
    "Average Trace Length",
    "Max Trace Length",
    "Min Trace Length",
    "Level of Detail",
    "Average Self-loop Size",
]

complexity_features = [
    "Sequential Complexity",
    "Rare Sequence Impact",
    "Event Class Dispersion",
    "Trace Variability",
]

for preproc_type, df in merged_dfs.items():

    all_metric_results = {}

    for metric in metrics:

        # One row per file: the miner that scores highest on `metric`.
        best_miners_df = keep_best_miner_per_file(df, metric)

        X = best_miners_df[feature_columns].fillna(0)

        # Miner names become integer labels; the mapping is stored with the
        # results so the labels can be decoded later.
        # NOTE(review): train_and_evaluate_models is defined outside this
        # section. If it fits regressors, these integer labels are nominal
        # and their numeric order carries no meaning -- confirm the models
        # used suit a classification target.
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(best_miners_df['miner'])

        miner_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

        # Columns outside the four groups are passed through unscaled.
        preprocessor = ColumnTransformer(
            transformers=[
                ('count_scaler', RobustScaler(), count_features),
                ('ratio_scaler', MinMaxScaler(), ratio_features),
                ('length_scaler', RobustScaler(), length_features),
                ('complexity_scaler', StandardScaler(), complexity_features)
            ],
            remainder='passthrough' 
        )

        # Smaller data sets get a slightly larger test share.
        total_samples = len(X)
        test_size = 0.2 if total_samples < 100 else 0.15

        # Split BEFORE scaling so the scalers only learn from training rows.
        # The row partition depends on the labels and random_state only, so it
        # matches a split of already-scaled data.
        try:
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(
                X, 
                y,
                test_size=test_size, 
                random_state=42,
                stratify=y
            )
        except ValueError as e:
            # Fall back to an unstratified split when the stratified one is
            # rejected.
            print(f"Error in train_test_split: {e}")
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(
                X, 
                y,
                test_size=test_size, 
                random_state=42
            )

        X_train = preprocessor.fit_transform(X_train_raw)
        X_test = preprocessor.transform(X_test_raw)

        # Report how many training samples each miner (class) has.
        unique, counts = np.unique(y_train, return_counts=True)
        for u, c in zip(unique, counts):
            miner_name = label_encoder.inverse_transform([u])[0]
            print(f"{miner_name}: {c} samples")

        results = train_and_evaluate_models(
            X_train,
            X_test,
            y_train,
            y_test,
            f'miner_prediction_{metric}'
        )

        all_metric_results[metric] = {
            'results': results,
            'miner_mapping': miner_mapping,
            'class_distribution': dict(zip(label_encoder.inverse_transform(unique), counts))
        }

    all_preprocessing_results[preproc_type] = all_metric_results

# Destination of the miner-prediction results; the folder is created on demand.
output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3/Regression_pred_best_miner'
output_file = os.path.join(
    output_dir,
    'miner_prediction_results_all_preprocessing.json',
)
os.makedirs(output_dir, exist_ok=True)

def convert_to_serializable(obj):
    """
    Turn numpy values into built-in Python values that `json` can encode.

    numpy integers, floats and booleans become int, float and bool; arrays
    become (nested) lists. Dicts, lists and tuples are rebuilt with their
    contents converted recursively (dict keys included), so numpy values at
    any depth are handled. Anything else is returned unchanged.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {
            convert_to_serializable(key): convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        # Keep the container type; json encodes tuples like lists.
        return tuple(convert_to_serializable(item) for item in obj)
    return obj

# Convert numpy leaves to built-in types. Layout: preprocessing type -> metric
# -> section ('results' / 'miner_mapping' / 'class_distribution') -> entry.
serializable_results = {
    preproc_type: {
        metric: {
            key: {k: convert_to_serializable(v) for k, v in value.items()}
            for key, value in results.items()
        }
        for metric, results in metric_results.items()
    }
    for preproc_type, metric_results in all_preprocessing_results.items()
}

with open(output_file, 'w') as f:
    # default=...: the comprehension above only reaches the entries directly
    # inside each section. The hook lets json convert numpy values that sit
    # deeper (e.g. inside a per-model dict) instead of raising TypeError.
    json.dump(serializable_results, f, indent=4, default=convert_to_serializable)

print(f"\nResults saved to: {output_file}")

### Best miner prediction independent of preprocessing

In [None]:
# Quality metrics for which the best miner per file is predicted.
# Currently disabled: perc_fit_traces, average_trace_fitness,
# percentage_of_fitting_traces.
metrics = [
    'log_fitness', 'precision', 'generalization',
    'simplicity', 'metricsAverageWeight', 'fscore',
]

# metric -> model results, label mapping and class counts.
all_metric_results = {}

# All preprocessing variants stacked into one frame, so the best miner per
# file is chosen across variants.
df = pd.concat(merged_dfs.values(), ignore_index=True)

# The scaler groups do not depend on the metric, so they are defined once
# instead of on every loop iteration.
count_features = [
    "Number of Events",
    "Number of Traces", 
    "Distinct Events",
    "Distinct Traces",
    "Distinct Start Events",
    "Distinct End Events",
    "Total Self-loops",
]

ratio_features = [
    "Event Density",
    "Relative Trace Coverage",
    "Structure",
    "Event Diversity",
    "Event Repeatability",
    "Transition Consistency",
    "Event Co-occurrence Consistency",
]

length_features = [
    "ATS",
    "Average Trace Length",
    "Max Trace Length",
    "Min Trace Length",
    "Level of Detail",
    "Average Self-loop Size",
]

complexity_features = [
    "Sequential Complexity",
    "Rare Sequence Impact",
    "Event Class Dispersion",
    "Trace Variability",
]

for metric in metrics:
    # One row per file: the miner that scores highest on `metric`.
    best_miners_df = keep_best_miner_per_file(df, metric)

    X = best_miners_df[feature_columns].fillna(0)

    # Miner names become integer labels; the mapping is stored with the
    # results so the labels can be decoded later.
    # NOTE(review): train_and_evaluate_models is defined outside this section.
    # If it fits regressors, these integer labels are nominal and their
    # numeric order carries no meaning -- confirm the models used suit a
    # classification target.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(best_miners_df['miner'])

    miner_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

    # Columns outside the four groups are passed through unscaled.
    preprocessor = ColumnTransformer(
        transformers=[
            ('count_scaler', RobustScaler(), count_features),
            ('ratio_scaler', MinMaxScaler(), ratio_features),
            ('length_scaler', RobustScaler(), length_features),
            ('complexity_scaler', StandardScaler(), complexity_features)
        ],
        remainder='passthrough' 
    )

    # Smaller data sets get a slightly larger test share.
    total_samples = len(X)
    test_size = 0.2 if total_samples < 100 else 0.15

    # Split BEFORE scaling so the scalers only learn from training rows. The
    # row partition depends on the labels and random_state only, so it matches
    # a split of already-scaled data.
    try:
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, 
            y,
            test_size=test_size, 
            random_state=42,
            stratify=y
        )
    except ValueError as e:
        # Fall back to an unstratified split when the stratified one is
        # rejected.
        print(f"Error in train_test_split: {e}")
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, 
            y,
            test_size=test_size, 
            random_state=42
        )

    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    # Report how many training samples each miner (class) has.
    unique, counts = np.unique(y_train, return_counts=True)
    for u, c in zip(unique, counts):
        miner_name = label_encoder.inverse_transform([u])[0]
        print(f"{miner_name}: {c} samples")

    results = train_and_evaluate_models(
        X_train,
        X_test,
        y_train,
        y_test,
        f'miner_prediction_{metric}'
    )

    all_metric_results[metric] = {
        'results': results,
        'miner_mapping': miner_mapping,
        'class_distribution': dict(zip(label_encoder.inverse_transform(unique), counts))
    }

# Destination of the combined miner-prediction results; the folder is created
# on demand.
output_dir = '/home/jupyter-benjamin.andrick-3cf07/test/Outputs3/Regression_pred_best_miner_combined'
output_file = os.path.join(
    output_dir,
    'miner_prediction_results.json',
)
os.makedirs(output_dir, exist_ok=True)

def convert_to_serializable(obj):
    """
    Turn numpy values into built-in Python values that `json` can encode.

    numpy integers, floats and booleans become int, float and bool; arrays
    become (nested) lists. Dicts, lists and tuples are rebuilt with their
    contents converted recursively (dict keys included), so numpy values at
    any depth are handled. Anything else is returned unchanged.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {
            convert_to_serializable(key): convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        # Keep the container type; json encodes tuples like lists.
        return tuple(convert_to_serializable(item) for item in obj)
    return obj

# Convert numpy leaves to built-in types. Layout: metric -> section
# ('results' / 'miner_mapping' / 'class_distribution') -> entry.
serializable_results = {
    metric: {
        key: {k: convert_to_serializable(v) for k, v in value.items()}
        for key, value in results.items()
    }
    for metric, results in all_metric_results.items()
}

with open(output_file, 'w') as f:
    # default=...: the comprehension above only reaches the entries directly
    # inside each section. The hook lets json convert numpy values that sit
    # deeper (e.g. inside a per-model dict) instead of raising TypeError.
    json.dump(serializable_results, f, indent=4, default=convert_to_serializable)

print(f"\nResults saved to: {output_file}")