In [26]:
import deepforest as df
import os
import PRF4DF
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# --- Seed setting ---
seed = 129
np.random.seed(seed)

In [27]:
# --- Data loading ---
X = np.load('../data/bootstrap_X.npy')
dX = np.load('../data/bootstrap_dX.npy')
y = np.load('../data/bootstrap_y.npy')

X = X[:1000]
dX = dX[:1000]
y = y[:1000]
y[y > 2] = 2

#dX = np.zeros(shape=(2000,17))
print("--- Original Data ---")
print("Original X sample: ", X)
print("Original dX sample: ", dX)
print("Original y sample: ", y)
print("Unique labels in y: ", set(y))

n_objects = X.shape[0]
n_features = X.shape[1]
n_classes = len(set(y))
print(f"{n_objects} objects, {n_features} features")

--- Original Data ---
Original X sample:  [[ 0.33453338  0.3817734   0.19323093 ... -0.81191589  0.19067206
   0.84772759]
 [ 0.32742773  0.35012285  0.17785153 ... -0.85643206  0.19620735
   0.85023905]
 [ 0.39173798  0.32019704  0.17513842 ...  0.23618577  0.20204299
   0.79843681]
 ...
 [ 0.43071606  0.37988827  0.24714295 ... -0.84221529  0.24334545
   0.8628581 ]
 [ 0.18695281  0.28667433  0.1404702  ...  1.19362455  0.09216904
   0.77052006]
 [ 0.12773789  0.30903322  0.15816711 ...  0.52400894  0.06605999
   0.79830589]]
Original dX sample:  [[0.00980023 0.01298663 0.01730336 ... 0.07554966 0.00503042 0.00614086]
 [0.0099433  0.01547131 0.01834346 ... 0.06557894 0.00303583 0.00408301]
 [0.00952382 0.00953937 0.0162304  ... 0.15137843 0.00276635 0.00701187]
 ...
 [0.01513477 0.01971204 0.01964974 ... 0.12579423 0.00383432 0.00537172]
 [0.01069045 0.00748945 0.0150812  ... 0.43512693 0.00363849 0.00970247]
 [0.00616469 0.01622886 0.01385321 ... 0.36720489 0.00177792 0.0100949 ]]
O

In [28]:
# --- Data splitting ---
n_train = int(n_objects * 0.8)
n_test =  int(n_objects - n_train)
print(f'Train set size = {n_train}, Test set size = {n_test}')

shuffled_inds = np.random.permutation(n_objects)

train_inds = shuffled_inds[:n_train]
X_train = X[train_inds][:, :n_features]
dX_train = dX[train_inds][:, :n_features] 
y_train = y[train_inds]

test_inds = shuffled_inds[n_train:(n_train + n_test)]
X_test = X[test_inds][:, :n_features]
dX_test = dX[test_inds][:, :n_features]
y_test = y[test_inds]

# Concatenate X and dX for training
X_train_combined = np.hstack((X_train, dX_train))
X_test_combined = np.hstack((X_test, dX_test))

Train set size = 800, Test set size = 200


In [29]:
# --- Model training ---

# DeepForest
n_cascade_estimators = 2  # Forests per layer
model = df.CascadeForestClassifier(
    n_bins=n_classes,
    random_state=seed,
)


# PRF (estimators)
prf_estimators = []
for i in range(n_cascade_estimators):
    single_prf_estimator = PRF4DF.SklearnCompatiblePRF(
        n_classes_= n_classes,
        n_features_= n_features,
        use_probabilistic_labels=False, 
        use_feature_uncertainties=True,
        n_estimators=10, # Trees per forest
        max_depth=10,
        n_jobs=1
    )

    prf_estimators.append(single_prf_estimator)

# Set the PRF estimators to the DF model
model.set_estimator(prf_estimators)

In [30]:
# --- Model fitting ---
print("Starting model fitting...")
model.fit(X=X_train_combined, y=y_train)

Starting model fitting...
[2025-06-09 14:24:29.376] Start to fit the model:
[2025-06-09 14:24:29.376] Fitting cascade layer = 0 
[2025-06-09 14:24:33.061] layer = 0  | Val Acc = 74.375 % | Elapsed = 3.685 s
[2025-06-09 14:24:33.062] Fitting cascade layer = 1 
[2025-06-09 14:24:37.161] layer = 1  | Val Acc = 73.125 % | Elapsed = 4.099 s
[2025-06-09 14:24:37.161] Early stopping counter: 1 out of 2
[2025-06-09 14:24:37.162] Fitting cascade layer = 2 
[2025-06-09 14:24:41.176] layer = 2  | Val Acc = 72.125 % | Elapsed = 4.015 s
[2025-06-09 14:24:41.176] Early stopping counter: 2 out of 2
[2025-06-09 14:24:41.176] Handling early stopping
[2025-06-09 14:24:41.181] The optimal number of layers: 1


In [31]:
# --- Model evaluation ---
accuracy = model.score(X_test_combined, y_test) * 100
print(f"Testing Accuracy: {accuracy:.3f} %")

[2025-06-09 14:24:48.952] Start to evalute the model:
[2025-06-09 14:24:48.953] Evaluating cascade layer = 0 
Testing Accuracy: 76.500 %


In [32]:
print(os.path.dirname(df.__file__))
model = df.CascadeForestClassifier(   
    n_bins=n_classes,     
    n_estimators=2, # Forests sets (RF + ExtraRF) per layer
    n_trees=10, # Trees per forest
    max_depth=10,
    n_jobs=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

C:\Users\carlo\Desktop\project\Deep-Forest\deepforest
[2025-06-09 14:24:53.126] Start to fit the model:
[2025-06-09 14:24:53.126] Fitting cascade layer = 0 
[2025-06-09 14:24:53.168] layer = 0  | Val Acc = 72.375 % | Elapsed = 0.042 s
[2025-06-09 14:24:53.171] Fitting cascade layer = 1 
[2025-06-09 14:24:53.210] layer = 1  | Val Acc = 74.125 % | Elapsed = 0.039 s
[2025-06-09 14:24:53.212] Fitting cascade layer = 2 
[2025-06-09 14:24:53.254] layer = 2  | Val Acc = 71.625 % | Elapsed = 0.041 s
[2025-06-09 14:24:53.254] Early stopping counter: 1 out of 2
[2025-06-09 14:24:53.256] Fitting cascade layer = 3 
[2025-06-09 14:24:53.297] layer = 3  | Val Acc = 70.625 % | Elapsed = 0.041 s
[2025-06-09 14:24:53.297] Early stopping counter: 2 out of 2
[2025-06-09 14:24:53.297] Handling early stopping
[2025-06-09 14:24:53.297] The optimal number of layers: 2
[2025-06-09 14:24:53.298] Start to evalute the model:
[2025-06-09 14:24:53.298] Evaluating cascade layer = 0 
[2025-06-09 14:24:53.300] Evalua

  "Some inputs do not have OOB predictions. "
  / oob_decision_function.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB predictions. "
  / oob_decision_function.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB predictions. "
  / oob_decision_function.sum(axis=1)[:, np.newaxis]
  "Some inputs do not have OOB predictions. "
  / oob_decision_function.sum(axis=1)[:, np.newaxis]


In [33]:
model = PRF4DF.SklearnCompatiblePRF(
    n_classes_= n_classes,
    n_features_= n_features,
    use_probabilistic_labels=False, 
    use_feature_uncertainties=True,
    n_estimators=10, # Trees per forest
    max_depth=10,
    n_jobs=1
)
model.fit(X_train_combined, y_train)
accuracy = model.score(X_test_combined, y_test) * 100
print(f"Testing Accuracy: {accuracy:.3f} %")

Testing Accuracy: 80.000 %
