# Finding best model with Gaussian Process Regression (GPR)

In [2]:
# Enable the IPython autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Select the interactive 'notebook' matplotlib backend for the figures below.
%matplotlib notebook

In [4]:
# import own modules
import run_gpr
from gpr_alg import plot_data

# import external modules
import warnings
import pandas as pd
import numpy as np
import sklearn.gaussian_process as gp

## Find best model

We start by performing a grid search to find the best model for the given dataset. The output is saved in a file.

In [5]:
# Load statistics from model fitting with different kernels.
# The store is opened read-only inside a context manager so that the HDF5 file
# handle is always released once the frame has been read, and so that a wrong
# path raises immediately instead of silently creating an empty file (which the
# default append mode would do). Re-open the file if further keys are needed.
with pd.HDFStore('grid_search_stats_2D_100_iterations.h5', mode='r') as store:
    model_stats = store['df']

model_stats

Unnamed: 0,kernel,stats,pred_mean,pred_cov
0,RBF: {'length_scale': 9.999999999999997e-06},0.100028,"[-0.421499462109478, 0.5101733594488376, 1.416...","[[1.4999997732800807e-07, 0.0, 0.0, 0.0, 0.0, ..."
6,RationalQuadratic: {'length_scale': 0.15913900...,0.000202,"[-0.4213950214655142, 0.5098295757879896, 1.41...","[[1.4942326864719746e-07, 2.401476795199642e-0..."
7,RationalQuadratic: {'length_scale': 0.15914036...,0.000202,"[-0.42139502280656416, 0.5098295804289137, 1.4...","[[1.494232841903198e-07, 2.4014039645692264e-0..."
8,RationalQuadratic: {'length_scale': 0.15913876...,0.000202,"[-0.4213950213575117, 0.5098295753150524, 1.41...","[[1.4942326753697444e-07, 2.4014855659615364e-..."
9,RationalQuadratic: {'length_scale': 0.15914291...,0.000202,"[-0.4213950194521203, 0.5098295730886093, 1.41...","[[1.4942326664879602e-07, 2.4014726873744507e-..."
10,RationalQuadratic: {'length_scale': 0.15913854...,0.000202,"[-0.4213950212824784, 0.5098295749553472, 1.41...","[[1.4942326653777371e-07, 2.4014918942327768e-..."
11,RationalQuadratic: {'length_scale': 0.15913864...,0.000202,"[-0.42139502124337014, 0.5098295749280624, 1.4...","[[1.494232660936845e-07, 2.4014904509428447e-0..."
12,RBF + WhiteKernel: {'k1': RBF(length_scale=0.1...,0.001074,"[-0.42018575507188416, 0.5068250047550009, 1.4...","[[1.5026264549522494e-05, 4.2079478379530855e-..."
13,RationalQuadratic + WhiteKernel: {'k1': Ration...,0.001069,"[-0.42018710489105615, 0.5068359056746203, 1.4...","[[1.5026467276246791e-05, 4.2004774869486283e-..."


In [6]:
# Order the models by their error statistic, smallest value first
model_stats = model_stats.sort_values('stats', ascending=True)
model_stats

Unnamed: 0,kernel,stats,pred_mean,pred_cov
7,RationalQuadratic: {'length_scale': 0.15914036...,0.000202,"[-0.42139502280656416, 0.5098295804289137, 1.4...","[[1.494232841903198e-07, 2.4014039645692264e-0..."
9,RationalQuadratic: {'length_scale': 0.15914291...,0.000202,"[-0.4213950194521203, 0.5098295730886093, 1.41...","[[1.4942326664879602e-07, 2.4014726873744507e-..."
6,RationalQuadratic: {'length_scale': 0.15913900...,0.000202,"[-0.4213950214655142, 0.5098295757879896, 1.41...","[[1.4942326864719746e-07, 2.401476795199642e-0..."
8,RationalQuadratic: {'length_scale': 0.15913876...,0.000202,"[-0.4213950213575117, 0.5098295753150524, 1.41...","[[1.4942326753697444e-07, 2.4014855659615364e-..."
11,RationalQuadratic: {'length_scale': 0.15913864...,0.000202,"[-0.42139502124337014, 0.5098295749280624, 1.4...","[[1.494232660936845e-07, 2.4014904509428447e-0..."
10,RationalQuadratic: {'length_scale': 0.15913854...,0.000202,"[-0.4213950212824784, 0.5098295749553472, 1.41...","[[1.4942326653777371e-07, 2.4014918942327768e-..."
13,RationalQuadratic + WhiteKernel: {'k1': Ration...,0.001069,"[-0.42018710489105615, 0.5068359056746203, 1.4...","[[1.5026467276246791e-05, 4.2004774869486283e-..."
12,RBF + WhiteKernel: {'k1': RBF(length_scale=0.1...,0.001074,"[-0.42018575507188416, 0.5068250047550009, 1.4...","[[1.5026264549522494e-05, 4.2079478379530855e-..."
0,RBF: {'length_scale': 9.999999999999997e-06},0.100028,"[-0.421499462109478, 0.5101733594488376, 1.416...","[[1.4999997732800807e-07, 0.0, 0.0, 0.0, 0.0, ..."


In [7]:
# Print each kernel description in full, following the current row order
for kernel_description in model_stats['kernel']:
    print(kernel_description)

RationalQuadratic: {'length_scale': 0.159140367693352, 'alpha': 152.34775531426777}
RationalQuadratic: {'length_scale': 0.15914291470979336, 'alpha': 152.22371436712567}
RationalQuadratic: {'length_scale': 0.15913900616767376, 'alpha': 152.49262497940862}
RationalQuadratic: {'length_scale': 0.15913876928554635, 'alpha': 152.5146072440485}
RationalQuadratic: {'length_scale': 0.15913864329007374, 'alpha': 152.52771338688737}
RationalQuadratic: {'length_scale': 0.15913854326350657, 'alpha': 152.5349390768122}
RationalQuadratic + WhiteKernel: {'k1': RationalQuadratic(alpha=622, length_scale=0.166), 'k2': WhiteKernel(noise_level=1e-05), 'k1__length_scale': 0.1660520247237648, 'k1__alpha': 622.3010890282484, 'k1__length_scale_bounds': (1e-05, 100000.0), 'k1__alpha_bounds': (1e-05, 100000.0), 'k2__noise_level': 9.999999999999997e-06, 'k2__noise_level_bounds': (1e-05, 100000.0)}
RBF + WhiteKernel: {'k1': RBF(length_scale=0.166), 'k2': WhiteKernel(noise_level=1e-05), 'k1__length_scale': 0.16567

We discard the first two options because for both kernels at least one parameter is very close to the specified bounds. Moreover, with the third kernel the mean absolute error is still extremely low.

In [8]:
# Take the first row of the sorted frame, i.e. the model with the smallest
# mean absolute error, and keep its predicted mean and covariance.
best_position = 0
mean_prediction = model_stats['pred_mean'].iloc[best_position]
cov_prediction = model_stats['pred_cov'].iloc[best_position]

# Report which kernel was selected and its error statistic
print('Kernel: ', model_stats['kernel'].iloc[best_position], sep='')
print('Mean absolut error: ', model_stats['stats'].iloc[best_position], sep='')

Kernel: RationalQuadratic: {'length_scale': 0.159140367693352, 'alpha': 152.34775531426777}
Mean absolut error: 0.00020198705556307429


In [9]:
# Build the 2D grid and the corresponding data (noise disabled)
(
    grid_x1,
    grid_x2,
    grid_train,
    full_grid,
    data_train,
    data,
) = run_gpr.create_2D_data(add_noise=False)

train set size: 0.79


In [10]:
# Surface plot of the original data, reshaped onto the (x1, x2) grid
n_rows = np.shape(grid_x1)[0]
n_cols = np.shape(grid_x1)[1]
plot_data.make_3D_surface_plot(
    x=grid_x1,
    y=grid_x2,
    z=data.reshape(n_rows, n_cols),
    file_name='original_data.png',
)

<IPython.core.display.Javascript object>

In [11]:
# Surface plot of the predicted mean, reshaped onto the (x1, x2) grid
n_rows = np.shape(grid_x1)[0]
n_cols = np.shape(grid_x1)[1]
plot_data.make_3D_surface_plot(
    x=grid_x1,
    y=grid_x2,
    z=mean_prediction.reshape(n_rows, n_cols),
    file_name='fitted_data.png',
)

<IPython.core.display.Javascript object>

In [21]:
# Absolute point-wise difference between the original data and the predicted
# mean, both reshaped onto the (x1, x2) grid
n_rows = np.shape(grid_x1)[0]
n_cols = np.shape(grid_x1)[1]
error = abs(data.reshape(n_rows, n_cols) - mean_prediction.reshape(n_rows, n_cols))

# Plot error, with the training points overlaid.
# The figure gets its own file name: this cell previously reused
# 'original_data.png', the same name passed to the surface plot of the original
# data, so the two figures would collide (assuming file_name is the output
# path - confirm in plot_data).
plot_data.make_3D_contour_plot(
    x=grid_x1,
    y=grid_x2,
    z=error,
    add_train=True,
    x_train=grid_train[:, 0],
    y_train=grid_train[:, 1],
    file_name='prediction_error.png',
)

<IPython.core.display.Javascript object>

In [19]:
# Plot posteriors: pass the original data on the grid together with the
# flattened predicted mean and the predicted covariance
data_on_grid = data.reshape(np.shape(grid_x1)[0], np.shape(grid_x1)[1])
posteriors = plot_data.plot_posteriors(
    x=grid_x1,
    y=grid_x2,
    z=data_on_grid,
    mean_pred=mean_prediction.flatten(),
    cov_pred=cov_prediction,
    posterior_nums=5,
    add_train_ind=False,
    x_train_val=grid_train,
)

<IPython.core.display.Javascript object>

## Fit specific model