In [1]:
import numpy as np

from sklearn.model_selection import train_test_split

from data.utils import list_data, get_data
from metrics import compare_accuracy

from treeffuser import LightGBMTreeffuser
from ngboost import NGBRegressor

## Data

In [2]:
list_data()

{'naval': 'data/uci/naval',
 'protein': 'data/uci/protein',
 'wine': 'data/uci/wine',
 'yacht': 'data/uci/yacht'}

In [3]:
data = get_data("naval", verbose=True)
print(data.keys())
print(f"Categorical variables: {data['categorical']}")

Getting naval dataset.
# of observations: 11934, # of covariates: 17, dimension of outcome: 1
dict_keys(['x', 'y', 'categorical'])
Categorical variables: []


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data["x"], data["y"], test_size=0.2, random_state=42
)

## Models

In [5]:
model = {}
preds = {}

First, we fit treeffuser.

In [6]:
model["treeffuser"] = LightGBMTreeffuser(
    verbose=1,
    n_repeats=100,
    n_estimators=10000,
    sde_name="vesde",
    learning_rate=0.1,
    early_stopping_rounds=50,
)
temp = model["treeffuser"].fit(
    X_train, y_train
)  # "temp=" is as a temp fix for Issue #26, see github.com/blei-lab/tree-diffuser/issues/26

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3407
[LightGBM] [Info] Number of data points in the train set: 954700, number of used features: 17
[LightGBM] [Info] Start training from score 0.001627
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6170]	valid_0's l2: 0.074176


We then sample from the fitted model.

In [7]:
y_samples = model["treeffuser"].sample(
    X_test, n_samples=1, n_parallel=100, n_steps=30, seed=0
)
preds["treeffuser"] = y_samples.mean(axis=1)

100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


Next, we run NGBoost with Gaussian likelihood.

In [8]:
model["ngb"] = NGBRegressor(early_stopping_rounds=50)

model["ngb"].fit(X_train, y_train)

y_dim = data["y"].shape[1]
preds["ngb"] = model["ngb"].predict(X_test).reshape((-1, y_dim))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[iter 0] loss=-3.4770 val_loss=-3.4645 scale=1.0000 norm=0.3850
[iter 100] loss=-3.6774 val_loss=-3.6735 scale=2.0000 norm=0.6649
[iter 200] loss=-3.8827 val_loss=-3.8809 scale=1.0000 norm=0.3309
[iter 300] loss=-4.0624 val_loss=-4.0606 scale=1.0000 norm=0.3301
[iter 400] loss=-4.2254 val_loss=-4.2170 scale=1.0000 norm=0.3349


## Metrics

In [9]:
metrics = compare_accuracy(preds, y_test, print_table=True)

+------------+-----------------------+-----------------------+-----------------------+----------------------+--------------------+--------------------+
|   Method   |          mae          |          rmse         |          mdae         |        marpd         |         r2         |        corr        |
+------------+-----------------------+-----------------------+-----------------------+----------------------+--------------------+--------------------+
| treeffuser | 0.0005198843989037958 | 0.0007821756917792708 | 0.0001617188752605081 | 0.052644136268068215 | 0.9892636196676616 | 0.9946185939246414 |
|    ngb     | 0.0030448647465668175 | 0.0038593207524945015 | 0.0024634541526422638 |  0.3084181684394628  | 0.7386208101861786 | 0.9257460083466912 |
+------------+-----------------------+-----------------------+-----------------------+----------------------+--------------------+--------------------+
