In [None]:
import torch
from autoemulate.emulators import GaussianProcess
from autoemulate.emulators.random_forest import RandomForest
from autoemulate.emulators.transformed.base import TransformedEmulator
from autoemulate.transforms import PCATransform, VAETransform, StandardizeTransform
from sklearn.datasets import make_regression
import torchmetrics
from autoemulate.core.model_selection import evaluate
from autoemulate.core.types import TensorLike
from autoemulate.core.compare import AutoEmulate

# Uncomment to enable logging for GPs
import logging
# logging.basicConfig(level=logging.INFO)

def make_data(
    random_state: int = 42,
    n_samples: int = 200,
    n_informative:int = 2,
    n_features: int = 5,
    noise: float = 0.2,
    n_targets: int = 500
):
    x, y, _ = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        noise=noise,
        random_state=random_state,
        n_informative=n_informative,
        n_targets=n_targets,
        coef=True,
    )
    x = torch.tensor(x, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    return x, y

# Train data
x, y = make_data(random_state=42, n_features=10, n_targets=2, noise=0.01)

# Test data
x2, y2 = make_data(random_state=43, n_features=10, n_targets=2, noise=0.01)


In [None]:
ae = AutoEmulate(
    x,
    y,
    models=[GaussianProcess, RandomForest],
    x_transforms_list=[[], [StandardizeTransform(), PCATransform(n_components=5)]],
    y_transforms_list=[[], [StandardizeTransform(), PCATransform(n_components=1)]]
)


In [3]:
outputs  = ae.compare(4)


100%|██████████| 4/4 [00:02<00:00,  1.89it/s]
100%|██████████| 4/4 [00:00<00:00, 13.84it/s]
  cov_orig = make_positive_definite(cov_orig)
100%|██████████| 4/4 [00:00<00:00,  6.73it/s]
100%|██████████| 4/4 [00:00<00:00, 12.16it/s]
100%|██████████| 4/4 [00:00<00:00,  5.95it/s]
100%|██████████| 4/4 [00:00<00:00,  8.52it/s]
  cov_orig = make_positive_definite(cov_orig)
100%|██████████| 4/4 [00:00<00:00,  6.79it/s]
100%|██████████| 4/4 [00:00<00:00,  7.03it/s]


In [4]:
outputs

[{'config': {'mean_module_fn': <function autoemulate.emulators.gaussian_process.constant_mean(n_features, n_outputs)>,
   'covar_module_fn': <function autoemulate.emulators.gaussian_process.rq_kernel(n_features, n_outputs)>,
   'epochs': 500,
   'batch_size': 32,
   'activation': torch.nn.modules.activation.GELU,
   'lr': 0.07543120063354615,
   'preprocessor_cls': None,
   'likelihood_cls': gpytorch.likelihoods.multitask_gaussian_likelihood.MultitaskGaussianLikelihood},
  'x_transforms': [],
  'y_transforms': [],
  'model_cls': autoemulate.emulators.gaussian_process.exact.GaussianProcessExact,
  'r2_score': 0.4029142737388611,
  'rmse_score': 17.389700325212118},
 {'config': {'n_estimators': 463,
   'min_samples_split': 10,
   'min_samples_leaf': 8,
   'max_features': 'sqrt',
   'bootstrap': True,
   'oob_score': False,
   'max_depth': 20,
   'max_samples': 0.7},
  'x_transforms': [],
  'y_transforms': [],
  'model_cls': autoemulate.emulators.random_forest.RandomForest,
  'r2_score': 

In [5]:
import pandas as pd
df = pd.DataFrame.from_records(outputs).sort_values(by=["r2_score", "rmse_score"], ascending=False, inplace=False)
best_model_idx = df.index[0]
df


Unnamed: 0,config,x_transforms,y_transforms,model_cls,r2_score,rmse_score
0,{'mean_module_fn': <function constant_mean at ...,[],[],<class 'autoemulate.emulators.gau...,0.402914,17.3897
1,"{'n_estimators': 463, 'min_samples_split': 10,...",[],[],<class 'autoemulate.emulators.ran...,0.0,39.538689
7,"{'n_estimators': 133, 'min_samples_split': 19,...","[StandardizeTransform(), PCATransform()]","[StandardizeTransform(), PCATransform()]",<class 'autoemulate.emulators.ran...,0.0,39.512795
4,{'mean_module_fn': <function constant_mean at ...,"[StandardizeTransform(), PCATransform()]",[],<class 'autoemulate.emulators.gau...,-0.107478,21.702006
5,"{'n_estimators': 50, 'min_samples_split': 11, ...","[StandardizeTransform(), PCATransform()]",[],<class 'autoemulate.emulators.ran...,-5.360586,33.049751
2,{'mean_module_fn': <function constant_mean at ...,[],"[StandardizeTransform(), PCATransform()]",<class 'autoemulate.emulators.gau...,-7.341877,28.84335
6,{'mean_module_fn': <function constant_mean at ...,"[StandardizeTransform(), PCATransform()]","[StandardizeTransform(), PCATransform()]",<class 'autoemulate.emulators.gau...,-14.007648,31.108872
3,"{'n_estimators': 362, 'min_samples_split': 10,...",[],"[StandardizeTransform(), PCATransform()]",<class 'autoemulate.emulators.ran...,-40.155426,35.436852


In [6]:
outputs[best_model_idx]

{'config': {'mean_module_fn': <function autoemulate.emulators.gaussian_process.constant_mean(n_features, n_outputs)>,
  'covar_module_fn': <function autoemulate.emulators.gaussian_process.rq_kernel(n_features, n_outputs)>,
  'epochs': 500,
  'batch_size': 32,
  'activation': torch.nn.modules.activation.GELU,
  'lr': 0.07543120063354615,
  'preprocessor_cls': None,
  'likelihood_cls': gpytorch.likelihoods.multitask_gaussian_likelihood.MultitaskGaussianLikelihood},
 'x_transforms': [],
 'y_transforms': [],
 'model_cls': autoemulate.emulators.gaussian_process.exact.GaussianProcessExact,
 'r2_score': 0.4029142737388611,
 'rmse_score': 17.389700325212118}

In [7]:
em = TransformedEmulator(
    x,
    y,
    model=outputs[best_model_idx]["model_cls"],
    x_transforms=outputs[best_model_idx]["x_transforms"],
    y_transforms=outputs[best_model_idx]["y_transforms"],
    **outputs[best_model_idx]["config"]
)
em.fit(x, y)

In [None]:
from autoemulate.model_selection import r2_metric

y_pred = em.predict(x[:100])
evaluate(y_pred.mean, y[:100], r2_metric())

0.9774684906005859

In [None]:
# Poor held-out performance
y_pred = em.predict(x2)
evaluate(y_pred.mean, y2, r2_metric())

-0.26360195875167847