#  **PART 2A**: The basics: Gaussian process regression in F3DASM

This step-by-step tutorial with exercises can be followed in order to gain an understanding of how regression works in F3DASM.

## 1. Import the necessary packages.

In [None]:
import f3dasm
import numpy as np
import matplotlib.pyplot as plt
import gpytorch

## 2. Define the hyperparameters

In [None]:
# Problem size: number of input dimensions and number of training samples.
dimensionality = 1
numsamples = 10

# GP covariance: an RBF kernel wrapped in a ScaleKernel.
rbf_kernel = gpytorch.kernels.RBFKernel()
kernel = gpytorch.kernels.ScaleKernel(base_kernel=rbf_kernel)

# Forwarded to the regressor as its `noise_fix` argument.
noise_fix = True

## 3. Specify the problem

In [None]:
# One [0, 1] row per input dimension, passed as the function's `scale_bounds`.
function_bounds = np.tile([0.0, 1.0], (dimensionality, 1))

# Levy benchmark function that the GP will be trained on.
fun = f3dasm.functions.Levy(
    dimensionality=dimensionality,
    scale_bounds=function_bounds,
)

Let's plot the function.

In [None]:
# 500 evenly spaced points on [0, 1], arranged as a single column.
# TODO: add 1D plot method for functions!
x_plot = np.linspace(0, 1, 500).reshape(-1, 1)
y_plot = fun(x_plot)

# Optional standardisation of the reference curve (disabled):
# y_plot = (y_plot - np.mean(y_plot)) / np.std(y_plot)

# Reference curve of the exact function.
plt.plot(x_plot, y_plot, 'b--', label='Exact')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

Add the design space, sampler and finally the training data.

In [None]:
# One [0, 1] row per input dimension.
design_bounds = np.tile([0.0, 1.0], (dimensionality, 1))

# Continuous design space built from those bounds.
parameter_DesignSpace: f3dasm.DesignSpace = f3dasm.make_nd_continuous_design(
    bounds=design_bounds,
    dimensionality=dimensionality,
)

# Sobol-sequence sampler defined on the design space.
sampler = f3dasm.sampling.SobolSequence(design=parameter_DesignSpace)

# Draw the training inputs, evaluate the function on them and attach the
# results as the outputs of the training data.
train_data: f3dasm.Data = sampler.get_samples(numsamples=numsamples)
train_outputs = fun(train_data)
train_data.add_output(output=train_outputs)

Let's see what the training data looks like.

In [None]:
# Display the stored training data (indexed by 'input' and 'output' in the
# plotting cells of this notebook).
train_data.data

In [None]:
# Raw training outputs as returned by the data object.
y_unscaled = train_data.get_output_data()

# Standardise: subtract the mean and divide by the standard deviation.
output_mean = np.mean(y_unscaled)
output_std = np.std(y_unscaled)
y_scaled = (y_unscaled - output_mean) / output_std

In [None]:
# Uncomment to display the raw (unscaled) training outputs.
# y_unscaled

In [None]:
# Uncomment to display the standardised training outputs.
# y_scaled

In [None]:
# Uncomment to overwrite the stored training outputs with the standardised
# values, so that the regression below is trained on `y_scaled`.
# train_data.data['output'] = y_scaled

In [None]:
# Exact function as a dashed blue reference curve.
plt.plot(x_plot, y_plot, 'b--', label='Exact')
# Training samples drawn on top of the reference curve.
plt.scatter(train_data.data['input'], train_data.data['output'], c='b', label='Training data')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

## 4. Regression and prediction

In [None]:
# Regressor parameters: only the kernel defined in section 2 is set here.
param = f3dasm.regression.gpr.Sogpr_Parameters(kernel=kernel)

# Regressor built from the training data, its design space, the parameters
# above and the `noise_fix` flag from section 2.
regressor = f3dasm.regression.gpr.Sogpr(
    train_data=train_data, 
    design=train_data.design,
    parameter=param,
    noise_fix=noise_fix,
)

# Training returns the surrogate that is used for prediction below.
surrogate = regressor.train()

In [None]:
# Optional experiment (disabled): replace the trained model's likelihood with a
# fixed-noise Gaussian likelihood using a noise value of 0.1.
# import torch
# surrogate.model.likelihood = gpytorch.likelihoods.FixedNoiseGaussianLikelihood(noise=torch.tensor([0.1]))

# Optional experiment (disabled): register a GreaterThan(1e-0) constraint on
# the "raw_noise" parameter of the likelihood's noise covariance.
# cons = gpytorch.constraints.constraints.GreaterThan(1e-0)
# surrogate.model.likelihood.noise_covar.register_constraint("raw_noise", cons)

Let's evaluate the mean and the variance of the Gaussian process posterior.

In [None]:
# Wrap the plotting grid in a Data object that shares the training design.
# NOTE(review): `output=x_plot` looks like a placeholder that the prediction
# does not use -- confirm against the f3dasm API.
x_plot_data = f3dasm.Data(design=train_data.design)
x_plot_data.add_numpy_arrays(input=x_plot, output=x_plot)

# Posterior mean and variance on the plotting grid.
mean, var = surrogate.predict(test_input_data=x_plot_data)

# Upper and lower bounds at mean +/- 2 standard deviations. The absolute value
# presumably guards against slightly negative variances before the square root.
std_pred = np.sqrt(np.abs(var))
ucb = mean + 2 * std_pred
lcb = mean - 2 * std_pred

Let's see what the prediction looks like.

In [None]:
# Exact function as a dashed blue reference curve.
plt.plot(x_plot, y_plot, 'b--', label='Exact') # add regression plot
# Training samples.
plt.scatter(train_data.data['input'], train_data.data['output'], c='b', label='Training data')
# Posterior mean of the GP.
plt.plot(x_plot, mean, color='purple', label='Prediction')
# Shaded band between the lower and upper bounds (mean -/+ 2 standard deviations).
plt.fill_between(x_plot.flatten(), lcb.flatten(), ucb.flatten(), color='purple', alpha=.25, label='Confidence')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

In [None]:
# Display the trained `raw_outputscale` parameter of the ScaleKernel.
# NOTE(review): in GPyTorch the `raw_*` parameters are the unconstrained
# values; the constrained value should be `covar_module.outputscale` -- confirm.
surrogate.model.covar_module.raw_outputscale

In [None]:
# Display the trained `raw_lengthscale` parameter of the RBF base kernel.
# NOTE(review): in GPyTorch the `raw_*` parameters are the unconstrained
# values; the constrained value should be `base_kernel.lengthscale` -- confirm.
surrogate.model.covar_module.base_kernel.raw_lengthscale

## Comparing GPR results with other packages

### scikit-learn

In [None]:
import sklearn.gaussian_process

# Product of a constant kernel and an RBF kernel.
sk_kernels = sklearn.gaussian_process.kernels
kernel_sklearn = sk_kernels.ConstantKernel() * sk_kernels.RBF()

# The optimiser is restarted 10 times during fitting.
regressor_sklearn = sklearn.gaussian_process.GaussianProcessRegressor(
    kernel=kernel_sklearn,
    n_restarts_optimizer=10,
)
surrogate_sklearn = regressor_sklearn.fit(
    X=train_data.get_input_data(),
    y=train_data.get_output_data(),
)

# scikit-learn returns the standard deviation directly (return_std=True).
mean_sklearn, std_sklearn = surrogate_sklearn.predict(X=x_plot, return_std=True)

# Upper and lower bounds at mean +/- 2 standard deviations.
ucb_sklearn = mean_sklearn + 2 * std_sklearn
lcb_sklearn = mean_sklearn - 2 * std_sklearn

# TODO: add regression plot
plt.plot(x_plot, y_plot, 'b--', label='Exact')
plt.scatter(train_data.data['input'], train_data.data['output'], c='b', label='Training data')
plt.plot(x_plot, mean_sklearn, color='purple', label='Prediction')
plt.fill_between(
    x_plot.flatten(),
    lcb_sklearn.flatten(),
    ucb_sklearn.flatten(),
    color='purple',
    alpha=.25,
    label='Confidence',
)
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

In [None]:
# Display the parameters of the fitted kernel (`kernel_` is the kernel stored
# on the estimator after `fit`).
surrogate_sklearn.kernel_.get_params()

### GPy

In [None]:
import GPy

# No kernel is passed explicitly (kernel=None), so GPy uses its default.
# Explicit alternative (disabled): kernel_gpy = GPy.kern.RBF()
regressor_gpy = GPy.models.GPRegression(
    X=train_data.get_input_data(),
    Y=train_data.get_output_data(),
    kernel=None,
)

# Fix the Gaussian noise variance at 0 so it is not optimised.
regressor_gpy.Gaussian_noise.variance.fix(0)

# Optimise the hyperparameters with 10 restarts.
# Single-run alternative (disabled): regressor_gpy.optimize()
regressor_gpy.optimize_restarts(num_restarts=10)
surrogate_gpy = regressor_gpy

# Posterior mean and variance on the plotting grid.
mean_gpy, var_gpy = surrogate_gpy.predict(Xnew=x_plot)

# Upper and lower bounds at mean +/- 2 standard deviations.
std_gpy = np.sqrt(np.abs(var_gpy))
ucb_gpy = mean_gpy + 2 * std_gpy
lcb_gpy = mean_gpy - 2 * std_gpy

# TODO: add regression plot
plt.plot(x_plot, y_plot, 'b--', label='Exact')
plt.scatter(train_data.data['input'], train_data.data['output'], c='b', label='Training data')
plt.plot(x_plot, mean_gpy, color='purple', label='Prediction')
plt.fill_between(
    x_plot.flatten(),
    lcb_gpy.flatten(),
    ucb_gpy.flatten(),
    color='purple',
    alpha=.25,
    label='Confidence',
)
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

In [None]:
# Display the optimised GPy model.
surrogate_gpy

## Exercises
1. Change the scaled RBF GP kernel into a plain RBF kernel (`gpytorch.kernels.RBFKernel()`). What do you notice?
2. Change the function to regress from the Levy function to the Schwefel function (`f3dasm.functions.Schwefel`). What do you notice?
3. Change the number of data points from `10` to a higher number $\le$`150`. Does the GP regress well?