In [1]:
from deepforest import CascadeForestClassifier
import PRF4DF
import numpy as np


# --- Reproducibility ---
# A single seed drives NumPy's legacy global RNG; the cells below also reuse
# `seed` when constructing the cascade model.
seed: int = 123
np.random.seed(seed=seed)

In [2]:
# --- Data loading ---
# The subset size and the class count are named once so the feature matrix and
# the synthetic labels cannot drift apart.
N_SAMPLES = 1000
N_CLASSES = 17

# Keep at most the first N_SAMPLES rows of the stored feature matrix.
X = np.load('../data/bootstrap_X.npy')[:N_SAMPLES]

# --- Label Generation ---
# Draw one flat-Dirichlet probability vector per *retained* row of X. Sizing
# the draw from X.shape[0] keeps labels and features aligned even when the
# file holds fewer than N_SAMPLES rows.
# The labels are random and independent of X, so any classifier trained on
# them should score near chance (1 / N_CLASSES).
py_original = np.random.dirichlet(np.ones(N_CLASSES), size=X.shape[0])
# Hard label = most probable class of each probability vector.
y_discrete = np.argmax(py_original, axis=1)

assert py_original.shape[0] == X.shape[0], "labels and features must have the same number of rows"

# --- Configuration Flags ---
# Forwarded to the PRF estimators when they are constructed.
use_feature_uncertainties_mode = False
use_probabilistic_labels_mode = True

print("--- Original Data ---")
print("Original X sample shape: ", X.shape)
print("Original py_original sample shape: ", py_original.shape)

n_objects = X.shape[0]
n_features_X_orig = X.shape[1]
n_classes = py_original.shape[1]
print(f"{n_objects} objects, {n_features_X_orig} original features, {n_classes} classes")

--- Original Data ---
Original X sample shape:  (1000, 17)
Original py_original sample shape:  (1000, 17)
1000 objects, 17 original features, 17 classes


In [3]:
# --- Data splitting ---
# 80 % of the rows are used for training; every remaining row is held out.
n_train = int(n_objects * 0.8)
n_test = n_objects - n_train

# One random permutation defines both index sets, so they are disjoint and
# together cover every row.
shuffled_inds = np.random.permutation(n_objects)
train_inds, test_inds = shuffled_inds[:n_train], shuffled_inds[n_train:]

# Select rows first, then keep only the original feature columns.
X_train, X_test = (
    X[inds][:, :n_features_X_orig] for inds in (train_inds, test_inds)
)
y_train_discrete, y_test_discrete = y_discrete[train_inds], y_discrete[test_inds]
py_train, py_test = py_original[train_inds], py_original[test_inds]

# Column blocks that make up the matrices handed to the model: the features
# followed by the per-class label probabilities.
# NOTE(review): the probability block is appended unconditionally; neither
# use_probabilistic_labels_mode nor use_feature_uncertainties_mode is consulted
# here, and the test matrix also carries py_test -- confirm this is what
# PRF4DF expects.
parts_train = [X_train, py_train]
parts_test = [X_test, py_test]

X_train_combined = np.concatenate(parts_train, axis=1)
X_test_combined = np.concatenate(parts_test, axis=1)

In [4]:
# --- Model construction ---
# Builds the cascade forest and swaps in PRF estimators; no fitting happens in
# this cell.

n_cascade_estimators = 4

# NOTE(review): n_bins is tied to the number of classes here -- confirm that
# this coupling is intentional for the binning of the combined input matrix.
model = CascadeForestClassifier(n_bins=n_classes, random_state=seed)

# Options shared by every PRF estimator; only random_state differs per estimator.
# presumably n_features_ tells the wrapper how many leading columns of the
# combined matrix are real features -- verify against PRF4DF.
shared_prf_options = dict(
    n_classes_=n_classes,
    n_features_=n_features_X_orig,
    use_probabilistic_labels=use_probabilistic_labels_mode,
    use_feature_uncertainties=use_feature_uncertainties_mode,
    n_estimators=10,
    max_depth=10,
    n_jobs=1,
)

# One PRF per cascade slot, each seeded with its own index (0, 1, 2, ...).
prf_estimators = [
    PRF4DF.SklearnCompatiblePRF(random_state=estimator_index, **shared_prf_options)
    for estimator_index in range(n_cascade_estimators)
]

# Hand the custom estimators to the deep-forest model.
model.set_estimator(prf_estimators)

In [5]:
# --- Model fitting ---
# The hard (argmax) labels are passed as y; the per-class probabilities reach
# the estimators as the trailing columns of X_train_combined.
print("Starting model fitting...")
model.fit(X_train_combined, y_train_discrete)

Starting model fitting...
[2025-06-05 17:48:08.421] Start to fit the model:
[2025-06-05 17:48:08.422] Fitting cascade layer = 0 
[2025-06-05 17:48:17.769] layer = 0  | Val Acc = 7.000 % | Elapsed = 9.347 s
[2025-06-05 17:48:17.784] Fitting cascade layer = 1 
[2025-06-05 17:48:28.340] layer = 1  | Val Acc = 6.250 % | Elapsed = 10.556 s
[2025-06-05 17:48:28.340] Early stopping counter: 1 out of 2
[2025-06-05 17:48:28.354] Fitting cascade layer = 2 
[2025-06-05 17:48:38.867] layer = 2  | Val Acc = 5.625 % | Elapsed = 10.514 s
[2025-06-05 17:48:38.867] Early stopping counter: 2 out of 2
[2025-06-05 17:48:38.867] Handling early stopping
[2025-06-05 17:48:38.885] The optimal number of layers: 1


In [6]:
# --- Model evaluation ---
# Score the held-out rows against their hard labels, then express the result
# as a percentage for display.
test_score = model.score(X_test_combined, y_test_discrete)
accuracy = 100 * test_score
print(f"Testing Accuracy: {accuracy:.3f} %")

[2025-06-05 17:49:11.486] Start to evalute the model:
[2025-06-05 17:49:11.487] Evaluating cascade layer = 0 
Testing Accuracy: 8.000 %
