Evaluation of our Framework

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import warnings

# --- 1. IPython Magic: Autoreload ---
# This tells the notebook to automatically reload modules
# when the .py files (like LinearMetaModel.py) are changed.
%load_ext autoreload
%autoreload 2
print("Autoreload enabled.")

# --- 2. Setup Paths ---
# Get the current directory of the notebook (e.g., .../Bachelor-Thesis/experiments/framework_eval)
current_dir = os.getcwd()
print(f"Current notebook directory: {current_dir}")

# Go UP TWO levels to the project root (e.g., .../Bachelor-Thesis)
project_root = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))
print(f"Calculated project root: {project_root}")

# Define the correct path to the 'src' folder (e.g., .../Bachelor-Thesis/src)
src_path = os.path.join(project_root, 'src')
print(f"Target 'src' directory: {src_path}")

# Add the correct 'src' path
if src_path not in sys.path:
    sys.path.append(src_path)
    print(f"Added to path: {src_path}")
else:
    print(f"Path already exists: {src_path}")

# --- 3. Try imports ---
# This cell should now work, BUT only if you do Part 2 below
try:
    from ExplainableTreeEnsemble import ExplainableTreeEnsemble
    from BasicMetaModel import BasicMetaModel
    from LinearMetaModel import LinearMetaModel
    print("\nImports loaded successfully.")
except ModuleNotFoundError as e:
    print(f"\nImport Error: {e}")
    print(f"Could not find modules at: {src_path}")
    print(">>> IF THIS FAILED, PLEASE DO PART 2 <<<")

warnings.filterwarnings('ignore')

Autoreload enabled.
Current notebook directory: c:\Users\I7- 9700\rekik-BA\Bachelor-Thesis\experiments\framework_eval
Calculated project root: c:\Users\I7- 9700\rekik-BA\Bachelor-Thesis
Target 'src' directory: c:\Users\I7- 9700\rekik-BA\Bachelor-Thesis\src
Added to path: c:\Users\I7- 9700\rekik-BA\Bachelor-Thesis\src


  from .autonotebook import tqdm as notebook_tqdm



Imports loaded successfully.


In [None]:
# --- 2. Define Experiment Parameters ---
DATASET_NAME = "3droad"
KEEP_RATIO_STAGE1 = 0.3  # fraction of trees kept by Stage 1 (e.g. 200 trees -> 60 trees)

# Stage 2 correlation threshold, chosen per dataset.
# Any dataset not listed here falls back to 0.9.
CORR_THRESH_STAGE2 = {
    "bike": 0.99,
    "3droad": 0.9,
    "slice": 0.95,
}.get(DATASET_NAME, 0.9)

# CSV file that collects one result row per experiment run
output_file = "framework_analysis_results.csv"

# Echo the configuration so the run is identifiable from the cell output.
print(
    "--- Starting Experiment ---",
    f"Dataset: {DATASET_NAME}",
    f"Stage 1 Keep Ratio: {KEEP_RATIO_STAGE1}",
    f"Stage 2 Corr Thresh: {CORR_THRESH_STAGE2}",
    f"Results will be saved to: {output_file}",
    sep="\n",
)

--- Starting Experiment ---
Dataset: slice
Stage 1 Keep Ratio: 0.3
Stage 2 Corr Thresh: 0.95
Results will be saved to: framework_analysis_results.csv


In [3]:
# --- 3. Step 1: Full Ensemble (Baseline) ---
print("\n[Step 1/3] Training Full Ensemble...")

# Needs DATASET_NAME from the parameter cell and the imports from the setup cell.
workflow = ExplainableTreeEnsemble(
    dataset_name=DATASET_NAME,
    data_type="regression",
)
workflow.train_base_trees()

# _evaluate() is unpacked into six values; only the first one is kept and
# reported below as the baseline MSE.
(mse_full, _, _, _, _, _) = workflow._evaluate()

# Ensemble size and dataset dimensions as exposed by the workflow object
trees_full = workflow.n_trees
n_samples, n_features = workflow.n_samples, workflow.n_features

print(f"Full Ensemble: {mse_full:.4f} MSE, {trees_full} Trees")
print(f"Dataset Stats: {n_samples} samples, {n_features} features")
print("\n'workflow' object created.")


[Step 1/3] Training Full Ensemble...
slice dataset, N=53500, d=385
-------------creating the base Trees-------------- 
full ensemble mse 74.3770295084585
Full Ensemble: 74.3770 MSE, 200 Trees
Dataset Stats: 42800 samples, 385 features

'workflow' object created.


In [4]:
# --- 4. Step 2: Stage 1 Pruning (SHAP Rank) ---
print("\n[Step 2/3] Running Stage 1 (SHAP Rank Pruning)...")

# A NameError here is reported as "an earlier cell was not run" (this cell
# needs `workflow` and KEEP_RATIO_STAGE1).
# NOTE(review): the handler also covers the library calls inside the try
# block, so a NameError raised within them would get the same message.
try:
    model_stage1 = BasicMetaModel(keep_ratio=KEEP_RATIO_STAGE1)
    model_stage1.attach_to(workflow)
    model_stage1.train()

    # evaluate() is unpacked into two values; the first is used as the MSE
    mse_stage1, _ = model_stage1.evaluate()
    trees_stage1 = len(model_stage1.pruned_trees)
except NameError as missing_name:
    print(f"Error: Make sure you have run Cell 3 first! (Details: {missing_name})")
else:
    print(f"After Stage 1: {mse_stage1:.4f} MSE, {trees_stage1} Trees")
    print("\n'model_stage1' object created.")


[Step 2/3] Running Stage 1 (SHAP Rank Pruning)...
=== Stage 1: Training model and pruning by SHAP (keep top 30.0%) ===
Pre-Pruned ensemble MSE (Weighted): 44.575492343061676
After Stage 1: 44.5755 MSE, 60 Trees

'model_stage1' object created.


In [5]:

# --- 5. Step 3: Stage 2 Pruning ---
print("\n[Step 3/3] Running Stage 2 (HRP Optimization Pruning)...")

# A NameError here is reported as "an earlier cell was not run" (this cell
# needs `workflow`, `model_stage1` and CORR_THRESH_STAGE2).
# NOTE(review): the handler also covers the library calls inside the try
# block, so a NameError raised within them would get the same message.
try:
    model_stage2 = LinearMetaModel()
    model_stage2.attach_to(workflow)

    # Train on the trees that Stage 1 kept, then prune with the
    # dataset-specific correlation threshold.
    model_stage2.train(pruned_trees_list=model_stage1.pruned_trees)
    model_stage2.prune(corr_thresh=CORR_THRESH_STAGE2)

    # evaluate() is unpacked into two values; the first is used as the MSE
    mse_stage2, _ = model_stage2.evaluate()
    trees_stage2 = len(model_stage2.pruned_trees)
except NameError as missing_name:
    print(f"Error: Make sure you have run Cell 4 first! (Details: {missing_name})")
else:
    print(f"After Stage 2: {mse_stage2:.4f} MSE, {trees_stage2} Trees")
    print("\n'model_stage2' object created.")


[Step 3/3] Running Stage 2 (HRP Optimization Pruning)...
[INFO] Training LinearMetaModel on 60 pruned trees...
 Lambda prune :  158.97283935546875
 Lambda div :  79.48641967773438
Epoch    0 | Total Loss: 1189.1631 | MSE Loss: 593.5637 | Prune Loss: 3.7337 | Div Loss: 0.025611
Epoch   20 | Total Loss: 646.2557 | MSE Loss: 61.9592 | Prune Loss: 3.6614 | Div Loss: 0.028024
Epoch   40 | Total Loss: 587.4935 | MSE Loss: 33.9893 | Prune Loss: 3.4646 | Div Loss: 0.034313
Epoch   60 | Total Loss: 548.8109 | MSE Loss: 30.8708 | Prune Loss: 3.2364 | Div Loss: 0.043354
Epoch   80 | Total Loss: 520.8850 | MSE Loss: 29.6153 | Prune Loss: 3.0635 | Div Loss: 0.053542
Epoch  100 | Total Loss: 497.2164 | MSE Loss: 28.7824 | Prune Loss: 2.9142 | Div Loss: 0.064812
Epoch  120 | Total Loss: 472.2151 | MSE Loss: 28.9088 | Prune Loss: 2.7498 | Div Loss: 0.077582
Epoch  140 | Total Loss: 452.6099 | MSE Loss: 28.6007 | Prune Loss: 2.6212 | Div Loss: 0.091959
Epoch  160 | Total Loss: 434.9897 | MSE Loss: 29.

In [6]:
# --- 6. Collect and Append Results to CSV ---
def append_result_row(df_row, csv_path):
    """Append ``df_row`` to the CSV at ``csv_path``, writing the header when needed.

    The header is written when the file does not exist yet or exists but is
    empty (zero bytes). Checking only for existence would append the first
    row to an empty file without a header, and a later ``pd.read_csv`` would
    then treat that data row as the column names.

    Parameters
    ----------
    df_row : pandas.DataFrame
        The row(s) to store.
    csv_path : str
        Path of the results CSV file.
    """
    file_exists = os.path.exists(csv_path)
    if file_exists and os.path.getsize(csv_path) > 0:
        # File already holds content: append without repeating the header
        print(f"File '{csv_path}' exists. Appending new result.")
        df_row.to_csv(csv_path, mode='a', header=False, index=False)
        return

    if file_exists:
        print(f"File '{csv_path}' exists but is empty. Writing header and first result.")
    else:
        print(f"File '{csv_path}' not found. Creating new file.")
    df_row.to_csv(csv_path, mode='w', header=True, index=False)


print("\n--- Experiment Complete ---")

# Every execution of this cell stores one row; re-running it without
# re-running the experiment records the same results again.
try:
    # 1. Gather the metrics produced by the previous cells into one row
    new_result = {
        "dataset_name": DATASET_NAME,
        "n_samples": n_samples,
        "n_features": n_features,
        "mse_full": mse_full,
        "mse_stage1": mse_stage1,
        "mse_stage2": mse_stage2
    }

    # 2. Convert the row to a single-row DataFrame
    df_new = pd.DataFrame([new_result])

    # 3. Append it to the results file (header handled by the helper)
    append_result_row(df_new, output_file)

    print(f"\nResults successfully saved to {output_file}")

    # 4. Display the full CSV content
    print("\n--- Current CSV Content ---")
    print(pd.read_csv(output_file))

except NameError as e:
    # One of the metrics / parameters from the earlier cells is not defined
    print(f"Error: A variable is missing! Make sure you have run Cells 2-5 first. (Details: {e})")


--- Experiment Complete ---
File 'framework_analysis_results.csv' exists. Appending new result.

Results successfully saved to framework_analysis_results.csv

--- Current CSV Content ---
   dataset_name  n_samples  n_features    mse_full  mse_stage1  mse_stage2
0        3droad     347899           3  179.982272  148.960932  120.475382
1         slice      42800         385   70.161952   42.605203   35.568698
2         slice      42800         385   72.385460   46.575549   36.227354
3         slice      42800         385   62.305534   38.554120   25.425535
4         slice      42800         385   62.311290   46.294762   30.395166
5         slice      42800         385   62.311290   46.294762   30.748182
6         slice      42800         385   64.157213   41.450804   31.294830
7         slice      42800         385   75.593464   49.212943   41.603926
8         slice      42800         385   69.244611   52.208408   40.333366
9         slice      42800         385   61.272665   38.961108