In [1]:
from deepforest import CascadeForestClassifier
import PRF4DF
import numpy as np


# --- Reproducibility ---
# A single integer seed: it initialises NumPy's global generator here and is
# passed on as the cascade forest's random_state further down.
seed = 123
np.random.seed(seed=seed)

In [3]:
# --- Data loading ---
# X: feature matrix, y: class labels.
# NOTE(review): dX is presumably the per-feature measurement uncertainty of X
# (inferred from the file name only) — confirm against the data provenance.
X = np.load('../data/bootstrap_X.npy')
dX = np.load('../data/bootstrap_dX.npy')
y = np.load('../data/bootstrap_y.npy')

# Fail fast on inconsistent inputs: later cells index all three arrays with the
# same row indices and keep the first n_features columns of dX, so a mismatch
# would otherwise only surface as an obscure indexing/hstack error.
if X.ndim != 2:
    raise ValueError(f"X must be 2-D (objects x features), got shape {X.shape}")

n_objects = X.shape[0]
n_features = X.shape[1]

if dX.ndim != 2 or dX.shape[0] != n_objects or dX.shape[1] < n_features:
    raise ValueError(
        f"dX must provide at least {n_features} columns for each of the "
        f"{n_objects} rows of X, got shape {dX.shape}"
    )
if y.ndim != 1 or y.shape[0] != n_objects:
    raise ValueError(
        f"y must be 1-D with one label per row of X ({n_objects}), got shape {y.shape}"
    )

# Build the label set once; it is used both for display and for n_classes.
unique_labels = set(y)
n_classes = len(unique_labels)

print("--- Original Data ---")
# NumPy abbreviates large arrays when printing, so these show an excerpt only.
print("Original X sample: ", X)
print("Original dX sample: ", dX)
print("Original y sample: ", y)
print("Unique labels in y: ", unique_labels)

print(f"{n_objects} objects, {n_features} features")

--- Original Data ---
Original X sample:  [[ 0.33453338  0.3817734   0.19323093 ... -0.81191589  0.19067206
   0.84772759]
 [ 0.32742773  0.35012285  0.17785153 ... -0.85643206  0.19620735
   0.85023905]
 [ 0.39173798  0.32019704  0.17513842 ...  0.23618577  0.20204299
   0.79843681]
 ...
 [ 0.403697    0.27038462  0.12964029 ...  0.89465122  0.1941173
   0.77770493]
 [ 0.25029907  0.33824084  0.20026945 ...  0.8638979   0.14645717
   0.82934586]
 [ 0.27610644  0.34        0.16186233 ... -0.52845195  0.15704813
   0.83015366]]
Original dX sample:  [[0.00980023 0.01298663 0.01730336 ... 0.07554966 0.00503042 0.00614086]
 [0.0099433  0.01547131 0.01834346 ... 0.06557894 0.00303583 0.00408301]
 [0.00952382 0.00953937 0.0162304  ... 0.15137843 0.00276635 0.00701187]
 ...
 [0.01547352 0.01519596 0.02114385 ... 0.23644937 0.00473304 0.01679457]
 [0.00721856 0.01831125 0.01688584 ... 0.81714142 0.00310632 0.01113628]
 [0.00577725 0.01198521 0.01370714 ... 0.07905823 0.00147411 0.00798292]]
Or

In [None]:
# --- Data splitting ---
# 80 / 20 train / test split over a random permutation of the row indices.
n_train = int(n_objects * 0.8)
n_test = n_objects - n_train  # remainder, so n_train + n_test == n_objects
print(f'Train set size = {n_train}, Test set size = {n_test}')

# Draw the permutation from a generator seeded locally instead of from NumPy's
# global state: re-running this cell (or running cells out of order) now always
# yields the same split. RandomState(seed) produces the same stream as the
# global generator right after np.random.seed(seed), so the split matches a
# fresh top-to-bottom run, provided nothing else drew from the global
# generator in between.
# Side note: this cell no longer advances the global generator, so any later
# code that draws from np.random directly sees a different stream than before.
split_rng = np.random.RandomState(seed)
shuffled_inds = split_rng.permutation(n_objects)

train_inds = shuffled_inds[:n_train]
test_inds = shuffled_inds[n_train:]

# X already has exactly n_features columns, so no column slice is needed;
# dX is still limited to its first n_features columns, as before.
X_train = X[train_inds]
dX_train = dX[train_inds][:, :n_features]
y_train = y[train_inds]

X_test = X[test_inds]
dX_test = dX[test_inds][:, :n_features]
y_test = y[test_inds]

# Concatenate X and dX column-wise: the model receives [X | dX] per object
X_train_combined = np.hstack((X_train, dX_train))
X_test_combined = np.hstack((X_test, dX_test))


Train set size = 36703, Test set size = 9176


In [5]:
# --- Model training ---

# DeepForest cascade whose per-layer forests are replaced by PRF instances.
# NOTE(review): per the deep-forest docs, n_bins is the feature-binning
# resolution (default 255), not a class count — tying it to n_classes looks
# unintended; confirm before relying on these results.
n_cascade_estimators = 4  # Forests per layer
model = CascadeForestClassifier(n_bins=n_classes, random_state=seed)

# Hyper-parameters shared by every PRF forest in a layer
prf_shared_kwargs = dict(
    n_classes_=n_classes,
    n_features_=n_features,
    n_estimators=10,  # Trees per forest
    max_depth=10,
    n_jobs=1,
)

# One PRF per cascade slot; each forest is seeded with its own index
# (NOTE(review): these seeds are independent of `seed` — confirm intended).
prf_estimators = [
    PRF4DF.SklearnCompatiblePRF(random_state=forest_idx, **prf_shared_kwargs)
    for forest_idx in range(n_cascade_estimators)
]

# Hand the PRF estimators over to the DF model
model.set_estimator(prf_estimators)

In [7]:
# --- Model fitting ---
# Train the cascade on the combined [X | dX] training matrix.
print("Starting model fitting...")
model.fit(X_train_combined, y_train)

Starting model fitting...
[2025-06-04 18:52:28.192] Start to fit the model:
[2025-06-04 18:52:28.192] Fitting cascade layer = 0 
[2025-06-04 18:56:54.204] layer = 0  | Val Acc = 70.926 % | Elapsed = 266.012 s
[2025-06-04 18:56:54.413] Fitting cascade layer = 1 
[2025-06-04 19:04:26.118] layer = 1  | Val Acc = 73.356 % | Elapsed = 451.705 s
[2025-06-04 19:04:26.308] Fitting cascade layer = 2 
[2025-06-04 19:11:25.211] layer = 2  | Val Acc = 74.157 % | Elapsed = 418.903 s
[2025-06-04 19:11:25.391] Fitting cascade layer = 3 
[2025-06-04 19:18:17.472] layer = 3  | Val Acc = 74.217 % | Elapsed = 412.081 s
[2025-06-04 19:18:17.659] Fitting cascade layer = 4 
[2025-06-04 19:25:12.068] layer = 4  | Val Acc = 73.621 % | Elapsed = 414.409 s
[2025-06-04 19:25:12.068] Early stopping counter: 1 out of 2
[2025-06-04 19:25:12.256] Fitting cascade layer = 5 
[2025-06-04 19:32:23.306] layer = 5  | Val Acc = 73.261 % | Elapsed = 431.051 s
[2025-06-04 19:32:23.306] Early stopping counter: 2 out of 2
[202

In [8]:
# --- Model evaluation ---
# model.score returns a fraction; report it as a percentage on the held-out set.
test_score = model.score(X_test_combined, y_test)
accuracy = test_score * 100
print(f"Testing Accuracy: {accuracy:.3f} %")

[2025-06-04 19:41:10.827] Start to evalute the model:
[2025-06-04 19:41:10.834] Evaluating cascade layer = 0 
[2025-06-04 19:41:23.380] Evaluating cascade layer = 1 
[2025-06-04 19:41:33.820] Evaluating cascade layer = 2 
[2025-06-04 19:41:43.701] Evaluating cascade layer = 3 
Testing Accuracy: 74.793 %
