In [166]:
from deepforest import CascadeForestClassifier
import PRF4DF
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# --- Seed setting ---
seed = 129
np.random.seed(seed)

In [167]:
# --- Data loading ---
X = np.load('../data/bootstrap_X.npy')
dX = np.load('../data/bootstrap_dX.npy')
y = np.load('../data/bootstrap_y.npy')

X = X[:1000]
dX = dX[:1000]
y = y[:1000]
y[y > 2] = 2

#dX = np.zeros(shape=(2000,17))
print("--- Original Data ---")
print("Original X sample: ", X)
print("Original dX sample: ", dX)
print("Original y sample: ", y)
print("Unique labels in y: ", set(y))

n_objects = X.shape[0]
n_features = X.shape[1]
n_classes = len(set(y))
print(f"{n_objects} objects, {n_features} features")

--- Original Data ---
Original X sample:  [[ 0.33453338  0.3817734   0.19323093 ... -0.81191589  0.19067206
   0.84772759]
 [ 0.32742773  0.35012285  0.17785153 ... -0.85643206  0.19620735
   0.85023905]
 [ 0.39173798  0.32019704  0.17513842 ...  0.23618577  0.20204299
   0.79843681]
 ...
 [ 0.43071606  0.37988827  0.24714295 ... -0.84221529  0.24334545
   0.8628581 ]
 [ 0.18695281  0.28667433  0.1404702  ...  1.19362455  0.09216904
   0.77052006]
 [ 0.12773789  0.30903322  0.15816711 ...  0.52400894  0.06605999
   0.79830589]]
Original dX sample:  [[0.00980023 0.01298663 0.01730336 ... 0.07554966 0.00503042 0.00614086]
 [0.0099433  0.01547131 0.01834346 ... 0.06557894 0.00303583 0.00408301]
 [0.00952382 0.00953937 0.0162304  ... 0.15137843 0.00276635 0.00701187]
 ...
 [0.01513477 0.01971204 0.01964974 ... 0.12579423 0.00383432 0.00537172]
 [0.01069045 0.00748945 0.0150812  ... 0.43512693 0.00363849 0.00970247]
 [0.00616469 0.01622886 0.01385321 ... 0.36720489 0.00177792 0.0100949 ]]
O

In [168]:
# --- Data splitting ---
n_train = int(n_objects * 0.8)
n_test =  int(n_objects - n_train)
print(f'Train set size = {n_train}, Test set size = {n_test}')

shuffled_inds = np.random.permutation(n_objects)

train_inds = shuffled_inds[:n_train]
X_train = X[train_inds][:, :n_features]
dX_train = dX[train_inds][:, :n_features] 
y_train = y[train_inds]

test_inds = shuffled_inds[n_train:(n_train + n_test)]
X_test = X[test_inds][:, :n_features]
dX_test = dX[test_inds][:, :n_features]
y_test = y[test_inds]

# Concatenate X and dX for training
X_train_combined = np.hstack((X_train, dX_train))
X_test_combined = np.hstack((X_test, dX_test))

Train set size = 800, Test set size = 200


In [173]:
# --- Model training ---

# DeepForest
n_cascade_estimators = 2  # Forests per layer
model = CascadeForestClassifier(
    n_bins=n_classes,
    random_state=seed,
)


# PRF (estimators)
prf_estimators = []
for i in range(n_cascade_estimators):
    single_prf_estimator = PRF4DF.SklearnCompatiblePRF(
        n_classes_= n_classes,
        n_features_= n_features,
        use_probabilistic_labels=False, 
        use_feature_uncertainties=True,
        n_estimators=10, # Trees per forest
        max_depth=10,
        n_jobs=1
    )

    #rf = RandomForestClassifier(
    #    n_estimators=20, # Trees per forest
    #    max_depth=10,
    #    random_state=i)
    
    #erf = ExtraTreesClassifier(
    #    n_estimators=20, # Trees per forest
    #    max_depth=10,
    #    random_state=i)

    prf_estimators.append(single_prf_estimator)
    #prf_estimators.append(rf)
    #prf_estimators.append(erf)


# Set the PRF estimators to the DF model
model.set_estimator(prf_estimators)

In [174]:
# --- Model fitting ---
print("Starting model fitting...")
model.fit(X=X_train_combined, y=y_train)

Starting model fitting...
[2025-06-09 08:32:14.013] Start to fit the model:
[2025-06-09 08:32:14.013] Fitting cascade layer = 0 
[2025-06-09 08:32:18.810] layer = 0  | Val Acc = 73.750 % | Elapsed = 4.798 s
[2025-06-09 08:32:18.813] Fitting cascade layer = 1 
[2025-06-09 08:32:24.175] layer = 1  | Val Acc = 72.875 % | Elapsed = 5.362 s
[2025-06-09 08:32:24.175] Early stopping counter: 1 out of 2
[2025-06-09 08:32:24.177] Fitting cascade layer = 2 
[2025-06-09 08:32:29.025] layer = 2  | Val Acc = 71.875 % | Elapsed = 4.848 s
[2025-06-09 08:32:29.025] Early stopping counter: 2 out of 2
[2025-06-09 08:32:29.025] Handling early stopping
[2025-06-09 08:32:29.032] The optimal number of layers: 1


In [175]:
# --- Model evaluation ---
accuracy = model.score(X_test_combined, y_test) * 100
print(f"Testing Accuracy: {accuracy:.3f} %")

[2025-06-09 08:32:30.505] Start to evalute the model:
[2025-06-09 08:32:30.505] Evaluating cascade layer = 0 
Testing Accuracy: 75.500 %


In [176]:
model = CascadeForestClassifier(   
    n_bins=n_classes,     
    n_estimators=2, # Forests sets (RF + ExtraRF) per layer
    n_trees=50, # Trees per forest
    max_depth=10,
    n_jobs=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

[2025-06-09 08:32:33.386] Start to fit the model:
[2025-06-09 08:32:33.386] Fitting cascade layer = 0 
[2025-06-09 08:32:33.651] layer = 0  | Val Acc = 74.375 % | Elapsed = 0.264 s
[2025-06-09 08:32:33.654] Fitting cascade layer = 1 
[2025-06-09 08:32:33.922] layer = 1  | Val Acc = 75.875 % | Elapsed = 0.268 s
[2025-06-09 08:32:33.925] Fitting cascade layer = 2 
[2025-06-09 08:32:34.223] layer = 2  | Val Acc = 76.625 % | Elapsed = 0.298 s
[2025-06-09 08:32:34.227] Fitting cascade layer = 3 
[2025-06-09 08:32:34.480] layer = 3  | Val Acc = 77.625 % | Elapsed = 0.254 s
[2025-06-09 08:32:34.483] Fitting cascade layer = 4 
[2025-06-09 08:32:34.730] layer = 4  | Val Acc = 78.250 % | Elapsed = 0.247 s
[2025-06-09 08:32:34.733] Fitting cascade layer = 5 
[2025-06-09 08:32:34.983] layer = 5  | Val Acc = 77.750 % | Elapsed = 0.250 s
[2025-06-09 08:32:34.984] Early stopping counter: 1 out of 2
[2025-06-09 08:32:34.988] Fitting cascade layer = 6 
[2025-06-09 08:32:35.235] layer = 6  | Val Acc = 7

In [None]:
model = PRF4DF.SklearnCompatiblePRF(
    n_classes_= n_classes,
    n_features_= n_features,
    use_probabilistic_labels=False, 
    use_feature_uncertainties=True,
    n_estimators=10, # Trees per forest
    max_depth=10,
    n_jobs=1
)
model.fit(X_train_combined, y_train)
accuracy = model.score(X_test_combined, y_test) * 100

TypeError: fit() got an unexpected keyword argument 'X'