In [1]:
import numpy as np

from sklearn.model_selection import train_test_split

from data.utils import list_data(), get_data()
from metrics import compare_accuracy

from treeffuser import LightGBMTreeffuser
from ngboost import NGBRegressor

## Data

In [2]:
list_uci_data()

['naval', 'protein', 'wine', 'yacht']

In [3]:
data = get_uci_data("yacht", verbose=True)
print(data.keys())
print(f"Categorical variables: {data['categorical']}")

dataset: yacht
# of observations: 308, # of covariates: 6, dimension of outcome: 1
dict_keys(['x', 'y', 'categorical'])
Categorical variables: []


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data["x"], data["y"], test_size=0.2, random_state=42
)

## Models

In [5]:
model = {}
preds = {}

First, we fit treeffuser.

In [6]:
model["treeffuser"] = LightGBMTreeffuser(
    verbose=1,
    n_repeats=100,
    n_estimators=10000,
    sde_name="vesde",
    learning_rate=0.1,
    early_stopping_rounds=50,
)
temp = model["treeffuser"].fit(
    X_train, y_train
)  # "temp=" is as a temp fix for Issue #26, see github.com/blei-lab/tree-diffuser/issues/26

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 24600, number of used features: 8
[LightGBM] [Info] Start training from score 0.002828
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[718]	valid_0's l2: 0.0183032


We then sample from the fitted model.

In [7]:
y_samples = model["treeffuser"].sample(
    X_test, n_samples=1, n_parallel=100, denoise=False, n_steps=30, seed=0
)
preds["treeffuser"] = y_samples.mean(axis=1)

100it [00:00, 109.63it/s]            


Next, we run NGBoost with Gaussian likelihood.

In [8]:
model["ngb"] = NGBRegressor()

model["ngb"].fit(X_train, y_train)

y_dim = data["y"].shape[1]
preds["ngb"] = model["ngb"].predict(X_test).reshape((-1, y_dim))

  y = column_or_1d(y, warn=True)


[iter 0] loss=4.1709 val_loss=0.0000 scale=1.0000 norm=12.2006
[iter 100] loss=2.9329 val_loss=0.0000 scale=2.0000 norm=3.7028
[iter 200] loss=1.9562 val_loss=0.0000 scale=2.0000 norm=1.4216
[iter 300] loss=1.0421 val_loss=0.0000 scale=2.0000 norm=1.1655
[iter 400] loss=0.3063 val_loss=0.0000 scale=2.0000 norm=1.0317


## Metrics

In [9]:
metrics = compare_accuracy(preds, y_test, print_table=True)

+------------+---------------------+--------------------+---------------------+--------------------+--------------------+--------------------+
|   Method   |         mae         |        rmse        |         mdae        |       marpd        |         r2         |        corr        |
+------------+---------------------+--------------------+---------------------+--------------------+--------------------+--------------------+
| treeffuser | 0.40690423263030795 | 0.821200077340465  | 0.15212195630136444 | 20.380399866908114 | 0.9954614502325652 | 0.9977360419408013 |
|    ngb     |  0.2681759418723791 | 0.5737286821495425 | 0.12379010196374282 | 17.80290905044615  | 0.9977847014528798 | 0.9989865427029946 |
+------------+---------------------+--------------------+---------------------+--------------------+--------------------+--------------------+
