<a href="https://colab.research.google.com/github/adir-hil/HeteroGraphs-for-OC-logs/blob/main/Baseline_production.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package installation & import

In [None]:
!pip install numpy==1.26.4
!pip install catboost --force-reinstall

In [None]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
%cd gdrive/MyDrive/ocel

import pandas as pd
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np # For sqrt

Mounted at /content/gdrive
/content/gdrive/MyDrive/ocel


# Preprocessing & Dataset split

In [None]:
# Load the labelled event log.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave mixed int/str values
# in one column (presumably the cause of the warning previously emitted on
# this read_csv call), which would later split identical IDs into different
# category codes.
new_df_lc = pd.read_csv(
    '/content/gdrive/MyDrive/ocel/with_lables.csv',
    low_memory=False,
)

# Drop the two leftover index columns in one non-mutating call.
new_df_lc = new_df_lc.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

new_df_lc['LIFECYCLE_BATCH'] = new_df_lc['LIFECYCLE_BATCH'].astype(str)
new_df_lc['TIMESTAMP'] = pd.to_datetime(new_df_lc['TIMESTAMP'])

# Get the order of batches based on the first timestamp of each batch
batch_order = new_df_lc.groupby('LIFECYCLE_BATCH')['TIMESTAMP'].min().sort_values().index
# Reorder the rows so batches appear in that order; the later split relies on
# the order in which batch IDs first appear in the frame.
new_df_lc = new_df_lc.set_index('LIFECYCLE_BATCH').loc[batch_order].reset_index()

# Earliest timestamp in the whole frame, used as the time origin
first_timestamp = new_df_lc['TIMESTAMP'].min()

# Whole days elapsed between the time origin and each event
new_df_lc['DAYS_FROM_FIRST_EVENT'] = (new_df_lc['TIMESTAMP'] - first_timestamp).dt.days

  new_df_lc = pd.read_csv('/content/gdrive/MyDrive/ocel/with_lables.csv')


In [None]:
# Batch-level split sizes; every batch beyond train + val goes to test.
n_train_batches = 815
n_val_batches = 116

# unique() keeps first-appearance order, so slicing follows the row order of
# new_df_lc.
unique_batches = new_df_lc['LIFECYCLE_BATCH'].unique()
n_total_unique_batches = len(unique_batches)

print(f"\nTotal unique batches found: {n_total_unique_batches}")
if n_total_unique_batches < n_train_batches + n_val_batches:
    print(f"Warning: Train+Val batches ({n_train_batches + n_val_batches}) exceeds total ({n_total_unique_batches}). Adjusting Val size.")
    # Shrink validation to whatever is left after train, never below zero.
    n_val_batches = max(0, n_total_unique_batches - n_train_batches)

val_end = n_train_batches + n_val_batches
train_batch_ids = unique_batches[:n_train_batches]
val_batch_ids = unique_batches[n_train_batches:val_end]
test_batch_ids = unique_batches[val_end:]

print(f"Assigning {len(train_batch_ids)} batches to Train.")
print(f"Assigning {len(val_batch_ids)} batches to Validation.")
print(f"Assigning {len(test_batch_ids)} batches to Test.")

train_set = set(train_batch_ids)
val_set = set(val_batch_ids)
test_set = set(test_batch_ids)

# Label every row with the partition its batch belongs to; rows whose batch
# is in none of the sets keep the 'Unknown' placeholder.
new_df_lc['SET'] = 'Unknown'
for set_label, batch_members in (('Train', train_set), ('Val', val_set), ('Test', test_set)):
    in_partition = new_df_lc['LIFECYCLE_BATCH'].isin(batch_members)
    new_df_lc.loc[in_partition, 'SET'] = set_label

print("\nData distribution across sets:")
print(new_df_lc['SET'].value_counts())
if new_df_lc['SET'].eq('Unknown').any():
    print("\nError: Some rows were not assigned to a set. Check batch IDs.")


Total unique batches found: 1165
Assigning 815 batches to Train.
Assigning 116 batches to Validation.
Assigning 234 batches to Test.

Data distribution across sets:
SET
Train    372848
Test     105793
Val       46988
Name: count, dtype: int64


In [None]:
# Column the model is trained to predict.
target_column = 'log_days_to_finish'

# Exact column names used as model inputs.
feature_columns = [
    'ITEM',            # Potentially categorical ID; treated as categorical
    'AREA',            # Categorical
    'LOCATION',        # Categorical
    'TU',              # Categorical ID
    'PMX_USER',        # Categorical ID
    'PMX_ORDER',       # Potentially categorical ID; treated as categorical
    'ORDER_STEP',      # Potentially categorical ID; treated as categorical
    'ACTIVITY',        # Categorical (redundant with MAIN/SUB?)
    'LOT',             # Potentially categorical ID; treated as categorical
    'PRODUCT',         # Potentially categorical ID; treated as categorical
    'MATERIAL',        # Potentially categorical ID; treated as categorical
    # NOTE(review): the train/val/test split is made by LIFECYCLE_BATCH, so
    # every validation/test value of this feature is unseen in training --
    # confirm it is meant to be a model input.
    'LIFECYCLE_BATCH',
    # One-hot hour-of-day columns
    'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
    'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15',
    'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
    # Day-tertile columns
    'tertile_12am-8am', 'tertile_8am-4pm', 'tertile_4pm-12am',
    # Elapsed-time features
    'time_since_first_day',
    'grouped_time_since_first_day',
    # 'DAYS_FROM_FIRST_EVENT', # Include if calculated and desired
    # Ensure 'days_to_finish' (the non-log version) is NOT included if
    # 'log_days_to_finish' is the target
]

# Subset of feature_columns to be handled as categorical.
# *** USER ACTION REQUIRED HERE: Verify this list is correct for your chosen features ***
categorical_features_list = [
    'ITEM','AREA', 'LOCATION', 'TU',
    'PMX_USER', 'PMX_ORDER', 'ORDER_STEP', 'ACTIVITY', 'LOT',
    'PRODUCT', 'MATERIAL','LIFECYCLE_BATCH'
    # Add/remove based on the actual feature_columns list and their types
    # Ensure USAGE/PROD_TYPE are added if they are categorical strings/objects
]

# Configuration checks. Each failure raises so the cell stops with a clear
# message; calling exit() inside a notebook asks the kernel to shut down
# rather than reporting an ordinary error.
if not target_column or not feature_columns:
    raise ValueError("Please define `target_column` and `feature_columns`.")

# Every feature column must exist in the DataFrame
missing_cols = [col for col in feature_columns if col not in new_df_lc.columns]
if missing_cols:
    raise ValueError(f"The following feature columns are not in the DataFrame: {missing_cols}")

# The target column must exist in the DataFrame
if target_column not in new_df_lc.columns:
    raise ValueError(f"The target column '{target_column}' is not in the DataFrame.")

# Every categorical feature must also be listed as a feature
missing_cats = [col for col in categorical_features_list if col not in feature_columns]
if missing_cats:
    raise ValueError(f"The following categorical features are not in the main feature_columns list: {missing_cats}")


In [None]:
# Replace each categorical feature with the integer codes of its pandas
# categorical representation.
for col in categorical_features_list:
  as_category = new_df_lc[col].astype('category')
  new_df_lc[col] = as_category.cat.codes

In [None]:
print("\nSplitting data into Train/Validation/Test sets (X and y)...")

# Row subsets selected by the partition label stored in the SET column.
set_labels = new_df_lc['SET']
train_df = new_df_lc.loc[set_labels.eq('Train')]
val_df   = new_df_lc.loc[set_labels.eq('Val')]
test_df  = new_df_lc.loc[set_labels.eq('Test')]

# Feature matrix and target vector for each partition.
X_train, y_train = train_df[feature_columns], train_df[target_column]
X_val, y_val = val_df[feature_columns], val_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]

print(f"Train shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Val shapes:   X={X_val.shape}, y={y_val.shape}")
print(f"Test shapes:  X={X_test.shape}, y={y_test.shape}")


Splitting data into Train/Validation/Test sets (X and y)...
Train shapes: X=(372848, 41), y=(372848,)
Val shapes:   X=(46988, 41), y=(46988,)
Test shapes:  X=(105793, 41), y=(105793,)


# Model Initialization & Execution

In [None]:
print("\nInitializing CatBoostRegressor...")

# Shared CatBoost hyper-parameters (tune as needed).
catboost_params = dict(
    iterations=1000,                         # number of boosting rounds (trees)
    learning_rate=0.05,                      # shrinkage applied to each step
    depth=6,                                 # tree depth
    l2_leaf_reg=3,                           # L2 regularisation coefficient
    loss_function='MAE',                     # training objective
    eval_metric='MAE',                       # metric tracked for early stopping
    cat_features=categorical_features_list,  # names of the categorical features
    early_stopping_rounds=50,                # stop after 50 rounds without improvement
    random_seed=42,                          # for reproducibility
    verbose=100,                             # log every 100 iterations
    # nan_mode='Min'                         # internal NaN handling if not pre-filled
    #                                        # (can be 'Min', 'Max', 'Forbidden')
)


Initializing CatBoostRegressor...


In [None]:
# Per-seed metrics, one dict per trained model.
test_results = []
val_results = []
train_results = []

# Train one model per seed to gauge run-to-run variance.
seeds = list(range(1, 13))
for seed in seeds:
    # Same hyper-parameters for every run; only the seed changes.
    params = {**catboost_params, 'random_seed': seed}
    model = CatBoostRegressor(**params)

    print("Training CatBoost model...")
    # The validation set is used only for early stopping / best-iteration
    # selection. cat_features is already supplied through the constructor.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
    print("\nTraining finished.")
    print(f"Best iteration found: {model.get_best_iteration()}")

    mae_train = model.best_score_['learn']['MAE']
    mae_val = model.best_score_['validation']['MAE']

    # Score the held-out test set. Previously this predicted on X_val and
    # compared against y_val, so "MAE_TEST" merely repeated the validation MAE.
    test_predictions = model.predict(X_test)
    mae_test = mean_absolute_error(y_test, test_predictions)

    # NOTE(review): expm1 of a log-space MAE is not the MAE in original units.
    # Presumably the target is log1p(days) -- confirm, and consider computing
    # the MAE on back-transformed predictions instead. Kept as-is so numbers
    # stay comparable with earlier runs.
    reported_test = np.expm1(mae_test)
    reported_val = np.expm1(mae_val)
    reported_train = np.expm1(mae_train)

    # Save results
    test_results.append({'seed': seed, 'MAE_TEST': reported_test})
    val_results.append({'seed': seed, 'MAE_VAL': reported_val})
    train_results.append({'seed': seed, 'MAE_TRAIN': reported_train})
    print(f'Seed: {seed}, MAE_TEST: {reported_test}')
    print(f'Seed: {seed}, MAE_VAL: {reported_val}')
    print(f'Seed: {seed}, MAE_TRAIN: {reported_train}')

Training CatBoost model...

Training finished.
Best iteration found: 53
Seed: 1, MAE_TEST: 0.8316402357178724
Seed: 1, MAE_VAL: 0.8316384058602675
Seed: 1, MAE_TRAIN: 0.37920288908380084
Training CatBoost model...

Training finished.
Best iteration found: 67
Seed: 2, MAE_TEST: 0.8271508805877597
Seed: 2, MAE_VAL: 0.8271490555153309
Seed: 2, MAE_TRAIN: 0.36286673320228857
Training CatBoost model...


KeyboardInterrupt: 

In [None]:
import math

# Print expm1 of the best MAE recorded on the training data, then on the
# validation data, for the most recently fitted `model`.
for split_name in ('learn', 'validation'):
    best_mae = model.best_score_[split_name]['MAE']
    print(math.expm1(best_mae))

0.37648751535222
0.8332309119404723


In [None]:
# `results` is not defined anywhere in this notebook (it would raise NameError
# on a fresh kernel); show the per-seed test metrics collected by the training
# loop instead.
test_results

[{'seed': 1, 'MAE': 0.6052118690840198},
 {'seed': 2, 'MAE': 0.6027578577778933},
 {'seed': 3, 'MAE': 0.6271541933379294},
 {'seed': 4, 'MAE': 0.5849851187049623},
 {'seed': 5, 'MAE': 0.6114633528170189},
 {'seed': 6, 'MAE': 0.603006210448655},
 {'seed': 7, 'MAE': 0.5968650305419213},
 {'seed': 8, 'MAE': 0.600984902002711},
 {'seed': 9, 'MAE': 0.6184216266907516},
 {'seed': 10, 'MAE': 0.6021739210639081},
 {'seed': 11, 'MAE': 0.6178597953591217},
 {'seed': 12, 'MAE': 0.5993395977767381}]