In [None]:
import numpy as np

from sklearn.model_selection import train_test_split

from data.utils import list_data, get_data
from metrics import compare_accuracy

from treeffuser import LightGBMTreeffuser
from treeffuser.sde.initialize import initialize_sde
from ngboost import NGBRegressor

## Data

In [None]:
# Bare call as the cell's last expression, so any return value is displayed.
# NOTE(review): presumably enumerates the datasets accepted by get_data — confirm in data.utils.
list_data()

In [None]:
# Load the "naval" dataset and show what the returned mapping contains.
# Later cells read the "x", "y" and "categorical" entries of `data`.
data = get_data("naval", verbose=True)
print(data.keys())
print("Categorical variables: {}".format(data["categorical"]))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data["x"],
    data["y"],
    random_state=42,
    test_size=0.2,
)

## Models

In [None]:
# Registries shared by the cells below: fitted estimators and their test-set
# predictions, both keyed by model name.
model, preds = {}, {}

# Scale of the training targets, one value per line: max, range (max - min), std.
print(y_train.max(), y_train.max() - y_train.min(), y_train.std(), sep="\n")

First, we fit Treeffuser, training one model per SDE variant (`vesde`, `vpsde`, `sub-vpsde`).

In [None]:
# Train one LightGBMTreeffuser per SDE variant and register it in `model`
# under the key "treeffuser-<sde>".
for sde in ("vesde", "vpsde", "sub-vpsde"):
    model_name = f"treeffuser-{sde}"
    treeffuser = LightGBMTreeffuser(
        sde_name=sde,
        sde_initialize_with_data=True,
        n_estimators=10**4,
        learning_rate=0.1,
        early_stopping_rounds=50,
        n_repeats=100,
        verbose=1,
    )
    # Register before fitting so the entry exists even if fit() fails.
    model[model_name] = treeffuser

    # Binding fit()'s return value is a temporary workaround for Issue #26,
    # see github.com/blei-lab/tree-diffuser/issues/26
    _unused_fit_result = treeffuser.fit(X_train, y_train)

    # Show the SDE object held by the fitted model's score config.
    print(treeffuser.score_config.sde)

We then sample from each of the fitted models.

In [None]:
# Draw samples on the test inputs from every fitted Treeffuser variant and
# reduce them over axis 1 to obtain the predictions stored in `preds`.
# NOTE(review): with n_samples=1 the "mean" is presumably a single stochastic
# draw rather than an averaged point estimate — confirm this is intended and
# that axis 1 is the sample axis of sample()'s output.
for sde in ("vesde", "vpsde", "sub-vpsde"):
    model_name = f"treeffuser-{sde}"
    fitted = model[model_name]
    y_samples = fitted.sample(
        X_test,
        seed=0,
        n_samples=1,
        n_steps=100,
        n_parallel=100,
    )
    preds[model_name] = y_samples.mean(axis=1)

Next, we run NGBoost with Gaussian likelihood.

In [None]:
# NGBoost baseline, registered in `model` under the key "ngb".
# random_state is pinned so a Restart-and-Run-All reproduces the same fit:
# with early_stopping_rounds set and no explicit validation set passed to
# fit(), NGBoost is expected to hold out its own validation split, which is
# random unless the estimator is seeded (see the NGBRegressor API reference).
model["ngb"] = NGBRegressor(
    n_estimators=10**4,
    early_stopping_rounds=50,
    random_state=42,  # same seed value as the train/test split
)

# Last expression of the cell, so the fitted estimator's repr is displayed.
model["ngb"].fit(X_train, y_train)

In [None]:
# NGBoost predictions on the test inputs, reshaped to (n_rows, y_dim), where
# y_dim is the size of the second axis of data["y"].
# NOTE(review): presumably done so the layout matches the Treeffuser entries
# in `preds` — confirm against compare_accuracy.
y_dim = data["y"].shape[1]
ngb_point_preds = model["ngb"].predict(X_test)
preds["ngb"] = ngb_point_preds.reshape((-1, y_dim))

## Metrics

In [None]:
# Score every entry of `preds` against the held-out targets and keep the result.
# NOTE(review): print_table=True presumably also prints a comparison table —
# confirm in metrics.compare_accuracy.
metrics = compare_accuracy(
    preds,
    y_test,
    print_table=True,
)