In [None]:
import numpy as np

from sklearn.model_selection import train_test_split

from data.utils import list_data, get_data
from metrics import compare_accuracy

from treeffuser import LightGBMTreeffuser
from ngboost import NGBRegressor

## Data

In [None]:
# Show the datasets exposed by data.utils (presumably the names accepted by get_data -- confirm).
list_data()

In [None]:
# Load the "naval" dataset. The result is indexed like a dict; this notebook reads
# its "x", "y" and "categorical" entries.
data = get_data("naval", verbose=True)
print(data.keys())  # inspect which fields the loader returned
print(f"Categorical variables: {data['categorical']}")

In [None]:
# Hold out 5% of the samples for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["x"], data["y"], test_size=0.05, random_state=42
)

## Models

In [None]:
# Two independent registries keyed by method name:
# `model` collects the estimators, `preds` collects their test-set predictions.
model, preds = {}, {}

First, we fit treeffuser.

In [None]:
# Configure the treeffuser estimator and register it under the "treeffuser" key.
model["treeffuser"] = LightGBMTreeffuser(
    # diffusion settings
    sde_name="vesde",
    n_repeats=100,
    # boosting settings
    n_estimators=10000,
    learning_rate=1,
    early_stopping_rounds=50,
    # logging
    verbose=1,
)

# Binding the return value of fit() to `temp` is a temporary workaround for Issue #26,
# see github.com/blei-lab/tree-diffuser/issues/26
temp = model["treeffuser"].fit(X_train, y_train)

We then sample from the fitted model.

In [None]:
# Predict on the held-out inputs and store the result next to the other methods.
# NOTE(review): ode=False presumably selects the SDE-based sampler rather than the
# ODE one, with tol as the solver tolerance -- confirm against the treeffuser API.
preds["treeffuser"] = model["treeffuser"].predict(X_test, ode=False, tol=1e-2, verbose=True)

In [None]:
# Evaluate compute_nll on the held-out set (NLL = negative log-likelihood, going by
# the method name; presumably estimated from n_samples=100 draws -- confirm).
nll = model["treeffuser"].compute_nll(X_test, y_test, ode=False, n_samples=100, verbose=True)
print(nll)

Next, we run NGBoost with a Gaussian likelihood.

In [None]:
# NGBoost baseline. Because early_stopping_rounds is set and no validation set is
# passed to fit(), NGBoost holds out part of the training data at random to decide
# when to stop. Pinning random_state (same seed as the train/test split) makes that
# hold-out, and therefore the fitted model, reproducible across runs.
model["ngb"] = NGBRegressor(n_estimators=10**4, early_stopping_rounds=50, random_state=42)

model["ngb"].fit(X_train, y_train)

# Reshape the predictions to (n_samples, y_dim) so they follow the 2-D layout of data["y"].
y_dim = data["y"].shape[1]
preds["ngb"] = model["ngb"].predict(X_test).reshape((-1, y_dim))

## Metrics

In [None]:
# Score every entry of `preds` against the held-out targets; print_table=True
# presumably also prints the comparison as a table -- confirm in metrics.compare_accuracy.
metrics = compare_accuracy(preds, y_test, print_table=True)