In [None]:
import sklearn.gaussian_process as gp
import os
import typing

from sklearn.gaussian_process.kernels import *
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.kernel_approximation import  Nystroem, RBFSampler
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

In [None]:
#%cd /content/drive/My Drive/Task1PAI/Task1

In [None]:
# Cost function constants (read by cost_function)
# Level that separates the "missed threshold" case from ordinary underprediction
THRESHOLD = 35.5
# Weight on the squared error when neither special case below applies
COST_W_NORMAL = 1.0
# Weight on the squared error when the prediction is above the true value
COST_W_OVERPREDICT = 5.0
# Weight on the squared error when the true value is >= THRESHOLD but the prediction is below it
COST_W_THRESHOLD = 20.0

In [None]:
def cost_function(y_true: np.ndarray, y_predicted: np.ndarray) -> float:
    """
    Asymmetrically weighted mean squared error of a set of predictions.

    Every squared error is multiplied by
      * COST_W_OVERPREDICT when the prediction is above the true value,
      * COST_W_THRESHOLD when the true value is at or above THRESHOLD while
        the prediction is below THRESHOLD,
      * COST_W_NORMAL in all remaining cases,
    and the weighted errors are averaged.

    :param y_true: Ground truth pollution levels as a 1d NumPy float array
    :param y_predicted: Predicted pollution levels as a 1d NumPy float array
    :return: Mean weighted cost over all predictions as a single float
    """
    assert y_true.ndim == 1 and y_predicted.ndim == 1 and y_true.shape == y_predicted.shape

    squared_error = (y_true - y_predicted) ** 2

    overpredicted = y_predicted > y_true
    missed_threshold = (y_true >= THRESHOLD) & (y_predicted < THRESHOLD)

    # Begin with the default weight everywhere and overwrite the two special
    # cases. They cannot overlap: a missed threshold implies
    # y_predicted < THRESHOLD <= y_true, i.e. no overprediction.
    weights = np.full_like(squared_error, COST_W_NORMAL)
    weights[overpredicted] = COST_W_OVERPREDICT
    weights[missed_threshold] = COST_W_THRESHOLD

    return np.mean(squared_error * weights)

In [None]:
# Read training inputs, training targets and test inputs from comma-separated
# files; the first line of each file is skipped.
X_tr, y_tr, X_te = (
    np.loadtxt(csv_name, delimiter=',', skiprows=1)
    for csv_name in ('train_x.csv', 'train_y.csv', 'test_x.csv')
)

## Visualise data

In [None]:
# Plot the training data: second input column against the first.
# Title and axis labels make the figure readable on its own; the trailing
# semicolon keeps the cell from echoing the return value of ax.set.
fig, ax = plt.subplots()
ax.scatter(X_tr[:, 0], X_tr[:, 1])
ax.set(title="Training inputs", xlabel="X_tr[:, 0]", ylabel="X_tr[:, 1]");

In [None]:
# Plot the test data: second input column against the first.
# Title and axis labels make the figure readable on its own; the trailing
# semicolon keeps the cell from echoing the return value of ax.set.
fig, ax = plt.subplots()
ax.scatter(X_te[:, 0], X_te[:, 1])
ax.set(title="Test inputs", xlabel="X_te[:, 0]", ylabel="X_te[:, 1]");

## Sample the training data

In [None]:
def sample_training(X_tr, y_tr, n_samples=4000, test_size=0.3, random_state=None):
    """
    Draw a random subset of the training data and split it into train / validation parts.

    Rows are drawn without replacement and X and y are indexed with the same
    row indices, so every input row keeps its own target regardless of how
    many feature columns X has. The inputs are not modified.

    :param X_tr: Training inputs, array-like with one row per observation
    :param y_tr: Training targets, one value per row of X_tr (flattened to 1d)
    :param n_samples: Number of rows to draw; capped at the number of available rows
    :param test_size: Passed to train_test_split as the validation share
    :param random_state: Seed for the row draw. None (default) draws from
        NumPy's global RNG, as before; an int makes the draw repeatable.
    :return: Tuple (X_train, X_val, y_train, y_val)
    :raises ValueError: if X_tr and y_tr do not have the same number of rows
    """
    X_tr = np.asarray(X_tr)
    y_tr = np.asarray(y_tr).reshape(-1)
    n_rows = X_tr.shape[0]
    if y_tr.shape[0] != n_rows:
        raise ValueError(
            "X_tr and y_tr must have the same number of rows, got %d and %d"
            % (n_rows, y_tr.shape[0])
        )

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Never ask for more rows than exist (choice without replacement would raise)
    n_samples = min(n_samples, n_rows)
    index = rng.choice(n_rows, size=n_samples, replace=False)

    # The split itself keeps its fixed seed, so it is deterministic given the drawn rows
    X_train, X_val, y_train, y_val = train_test_split(
        X_tr[index], y_tr[index], test_size=test_size, random_state=0
    )

    return X_train, X_val, y_train, y_val

In [None]:
# Stack inputs and targets column-wise so each row carries its own target
# when the rows are shuffled.
train_data = np.concatenate((X_tr, y_tr.reshape(-1, 1)), axis = 1)

# Seed NumPy's global RNG before the first random operation, so this shuffle
# and the later np.random draws give the same result on every Restart & Run All.
np.random.seed(0)
np.random.shuffle(train_data)

In [None]:
# Draw 3000 distinct rows of the stacked training array.
index = np.random.choice(len(train_data), size=3000, replace=False)
train = train_data[index]

# Columns 0-1 are used as inputs and column 2 as the target; 10% of the
# drawn rows are held out for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train[:, :2],
    train[:, 2],
    test_size=0.1,
    random_state=0,
)

In [None]:
# Hyperparameters for the Nystroem + GP fit, stored as one-element candidate
# lists (the same layout the ParameterGrid searches in this notebook use).
params = dict(
    length_scale=[0.1],
    noise_level=[0.005],
    alpha=[0.001],
    n_components=[200],
)

In [None]:
# X_train = train[:, :2]
# y_train = train[:, 2].reshape(-1, 1)

# Fit the Nystroem + GP pipeline with a single hyperparameter setting.
# `params` holds a list of candidates per key, but the estimators need plain
# scalars (e.g. an int n_components), so take the first candidate of each list.
# Values that are already scalars are passed through unchanged.
fit_params = {
    name: (value[0] if isinstance(value, (list, tuple)) else value)
    for name, value in params.items()
}

kernel = RBF(length_scale=fit_params['length_scale']) + WhiteKernel(noise_level=fit_params['noise_level'])
gpr_p = pipeline.Pipeline([
                ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=fit_params['n_components'])),
                ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = fit_params['alpha'],
                                                 random_state=0, normalize_y=True))])
gpr_p.fit(X_train, y_train)

In [None]:
# Sanity check: shapes of the training and validation input arrays
X_train.shape, X_val.shape

## RBFSampler and Hyperopt

In [None]:
# Search grids from earlier runs, kept for reference:
#
# params = {'nu': [0.1, 0.5, 1],
#           'noise_level': [0.01, 0.05, 0.1, 0.5],
#           'alpha': [0.01, 0.05, 0.1, 0.5]}
#
# params = {'length_scale': [0.5, 0.1],
#           'noise_level': [0.1, 0.05],
#           'alpha': [0.1, 0.05, 0.001]}
#
# params = {'length_scale': [0.1, 0.01],
#           'noise_level': [0.1, 0.01],
#           'alpha': [0.1, 0.05, 0.01]}

# Current grid for the RBFSampler experiment: one candidate per hyperparameter.
params = dict(
    length_scale=[0.01],
    noise_level=[0.1],
    alpha=[0.05],
    offset=[-2],  # alternative candidates: [-1, -2]
)

# Expand into the list of concrete combinations and show them.
grid = ParameterGrid(params)
list(grid)

In [None]:
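# Weight applied to the squared error in cost_function (y = true value, y_hat = prediction):
#   5  -> overprediction:       y_hat > y
#   20 -> missed threshold:     y >= threshold > y_hat
#   1  -> everything else, i.e. no overprediction and both values on the same side of the threshold:
#           y >= y_hat >= threshold
#           threshold > y >= y_hat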

In [None]:
# Validation cost of every grid combination for an RBF + White kernel GP fitted
# on RBFSampler features of the inputs.
costs = []
# The loop variable is `combo`, not `params`, so the grid specification stored
# in `params` is not overwritten by the last combination.
for combo in tqdm(grid):
    kernel= RBF(length_scale=combo['length_scale']) + WhiteKernel(noise_level=combo['noise_level'])
    gpr_p = pipeline.Pipeline([
#                     ("scale", StandardScaler()),
                    ("sampler", RBFSampler(random_state=0, n_components=200)),
                    ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'],
                                                     random_state=0, normalize_y=True))])
    gpr_p.fit(X_train, y_train)
    pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)

    # Shift every prediction by the candidate offset.
    pred_mean += combo['offset']
    # Raise predictions lying in [THRESHOLD - 5, THRESHOLD] to THRESHOLD. After
    # this they are no longer below THRESHOLD, which the COST_W_THRESHOLD case
    # of cost_function requires.
    mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
#     mask2 = ((pred_mean - THRESHOLD >= 0) & (pred_mean - THRESHOLD <= 5))
    pred_mean[mask1] = THRESHOLD
#     pred_mean[mask2] = THRESHOLD
#     lower_mask = pred_mean <= np.percentile(pred_mean, 25)
#     upper_mask =pred_mean >= np.percentile(pred_mean, 75)
#     pred_mean[upper_mask] = pred_mean[upper_mask] - (THRESHOLD/2)
    costs.append(cost_function(y_val, pred_mean))

# All costs, and the combination with the lowest one
costs, grid[np.argmin(costs)]

## Nystrom approx and Hyperopt

In [None]:
# Search grids from earlier runs, kept for reference:
#
# params = {'nu': [0.1, 0.5, 1],
#           'noise_level': [0.01, 0.05, 0.1, 0.5],
#           'alpha': [0.01, 0.05, 0.1, 0.5]}
#
# params = {'length_scale': [0.5, 0.1],
#           'noise_level': [0.1, 0.05],
#           'alpha': [0.1, 0.05, 0.001]}
#
# params = {'length_scale': [0.05, 0.1],
#           'noise_level': [0.005, 0.001],
#           'alpha': [0.001, 0.005],
#           'n_components': [100, 200, 300]}

# Current grid for the Nystroem experiment: one candidate per hyperparameter.
params = dict(
    length_scale=[0.1],
    noise_level=[0.005],
    alpha=[0.001],
    n_components=[200],
    offset=[-2],
)

# Expand into the list of concrete combinations and show them.
grid = ParameterGrid(params)
list(grid)

In [None]:
# cost_iterations = []
# for i in tqdm(range(5)):
#     X_train, X_val, y_train, y_val = sample_training(X_tr, y_tr)
    
#     costs = []
#     for params in tqdm(grid):
#         kernel= RBF(length_scale=params['length_scale']) + WhiteKernel(noise_level=params['noise_level'])
#         gpr_p = pipeline.Pipeline([
#                         ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=params['n_components'])),
#                         ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = params['alpha'], n_restarts_optimizer=5,
#                                                          random_state=0, normalize_y=True))])
#         gpr_p.fit(X_train, y_train)
#         pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)
#         pred_mean += params['offset']
#         mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
#         pred_mean[mask1] = THRESHOLD
#         costs.append(cost_function(y_val, pred_mean))

#     costs, grid[np.argmin(costs)]
#     cost_iterations.append(costs)

In [None]:
# Validation cost of every grid combination for an RBF + White kernel GP fitted
# on Nystroem features of the inputs (n_restarts_optimizer=5 is not used here).
grid = ParameterGrid(params)
costs = []
# The loop variable is `combo`, not `params`: rebinding `params` to a single
# combination would make ParameterGrid(params) above fail when this cell is re-run.
for combo in tqdm(grid):
    print(combo)
    kernel= RBF(length_scale=combo['length_scale']) + WhiteKernel(noise_level=combo['noise_level'])
    gpr_p = pipeline.Pipeline([
                    ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=combo['n_components'])),
                    ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'],
                                                     random_state=0, normalize_y=True))])
    gpr_p.fit(X_train, y_train)
    pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)

    # Shift every prediction by the candidate offset.
    pred_mean += combo['offset']
    # Raise predictions lying in [THRESHOLD - 5, THRESHOLD] to THRESHOLD. After
    # this they are no longer below THRESHOLD, which the COST_W_THRESHOLD case
    # of cost_function requires.
    mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
    pred_mean[mask1] = THRESHOLD
    costs.append(cost_function(y_val, pred_mean))

# All costs, and the combination with the lowest one
costs, grid[np.argmin(costs)]

## RBF, Constant and White kernels

In [None]:
# Search grids from earlier runs, kept for reference:
#
# params = {'nu': [0.1, 0.5, 1],
#           'noise_level': [0.01, 0.05, 0.1, 0.5],
#           'alpha': [0.01, 0.05, 0.1, 0.5]}
#
# params = {'length_scale': [0.5, 0.1],
#           'noise_level': [0.1, 0.05],
#           'alpha': [0.1, 0.05, 0.001]}
#
# params = {'length_scale': [0.05, 0.1],
#           'noise_level': [0.005, 0.001],
#           'alpha': [0.001, 0.005],
#           'n_components': [100, 200, 300]}

# Current grid for the Constant * RBF + White experiment: one candidate per hyperparameter.
params = dict(
    length_scale=[0.1],
    noise_level=[0.005],
    alpha=[0.001],
    n_components=[200],
    offset=[-2],
)

# Expand into the list of concrete combinations and show them.
grid = ParameterGrid(params)
list(grid)

In [None]:
# Validation cost of every grid combination for a ConstantKernel * RBF + White
# kernel GP fitted on Nystroem features of the inputs.
costs = []
# The loop variable is `combo`, not `params`, so the grid specification stored
# in `params` is not overwritten by the last combination.
for combo in tqdm(grid):
    kernel= ConstantKernel(1) * RBF(length_scale=combo['length_scale']) + WhiteKernel(noise_level=combo['noise_level'])
    gpr_p = pipeline.Pipeline([
                    ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=combo['n_components'])),
                    ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'], n_restarts_optimizer=5,
                                                     random_state=0, normalize_y=True))])
    gpr_p.fit(X_train, y_train)
    pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)

    # Shift every prediction by the candidate offset.
    pred_mean += combo['offset']
    # Raise predictions lying in [THRESHOLD - 5, THRESHOLD] to THRESHOLD. After
    # this they are no longer below THRESHOLD, which the COST_W_THRESHOLD case
    # of cost_function requires.
    mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
    pred_mean[mask1] = THRESHOLD
    costs.append(cost_function(y_val, pred_mean))

# All costs, and the combination with the lowest one
costs, grid[np.argmin(costs)]

## Matern Kernel and White Kernel

In [None]:
# Search grid from an earlier run, kept for reference:
#
# params = {'nu': [1.5, 2.5],
#           'length_scale': [0.1, 0.5],
#           'noise_level': [0.01, 0.001],
#           'alpha': [0.01],
#           'n_components': [50, 100]}

# Current grid for the Matern + White experiment: one candidate per hyperparameter.
params = dict(
    nu=[2.5],
    length_scale=[0.5],
    noise_level=[0.001],
    alpha=[0.01],
    n_components=[100],
    offset=[-2],
)

# Expand into the list of concrete combinations and show them.
grid = ParameterGrid(params)
list(grid)

In [None]:
# Repeat the Matern + White grid evaluation on 5 independent resamples of the
# training data and collect one list of costs per resample.
cost_iterations = []
for i in tqdm(range(5)):
    # NOTE: this rebinds the notebook-level X_train / X_val / y_train / y_val,
    # so cells executed after this one work on the last resample drawn here.
    X_train, X_val, y_train, y_val = sample_training(X_tr, y_tr)

    costs = []
    # The loop variable is `combo`, not `params`, so the grid specification
    # stored in `params` is not overwritten by the last combination.
    for combo in tqdm(grid):
        kernel= Matern(length_scale=combo['length_scale'], nu=combo['nu']) + WhiteKernel(noise_level=combo['noise_level'])
        gpr_p = pipeline.Pipeline([
                        ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=combo['n_components'])),
                        ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'], n_restarts_optimizer=5,
                                                         random_state=0, normalize_y=True))])
        gpr_p.fit(X_train, y_train)
        pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)

        # Shift every prediction by the candidate offset.
        pred_mean += combo['offset']
        # Raise predictions lying in [THRESHOLD - 5, THRESHOLD] to THRESHOLD.
        # After this they are no longer below THRESHOLD, which the
        # COST_W_THRESHOLD case of cost_function requires.
        mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
        pred_mean[mask1] = THRESHOLD
        costs.append(cost_function(y_val, pred_mean))

    # (The bare `costs, grid[...]` expression that used to sit here had no
    # effect inside a loop and was removed.)
    cost_iterations.append(costs)

In [None]:
# Mean validation cost over all resamples and grid combinations, followed by the per-resample cost lists
np.mean(cost_iterations), cost_iterations

In [None]:
# Validation cost of every grid combination for a Matern + White kernel GP
# fitted on Nystroem features of the inputs, using the current X_train / X_val split.
costs = []
# The loop variable is `combo`, not `params`, so the grid specification stored
# in `params` is not overwritten by the last combination.
for combo in tqdm(grid):
    kernel= Matern(length_scale=combo['length_scale'], nu=combo['nu']) + WhiteKernel(noise_level=combo['noise_level'])
    gpr_p = pipeline.Pipeline([
                    ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=combo['n_components'])),
                    ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'], n_restarts_optimizer=5,
                                                     random_state=0, normalize_y=True))])
    gpr_p.fit(X_train, y_train)
    pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)

    # Shift every prediction by the candidate offset.
    pred_mean += combo['offset']
    # Raise predictions lying in [THRESHOLD - 5, THRESHOLD] to THRESHOLD. After
    # this they are no longer below THRESHOLD, which the COST_W_THRESHOLD case
    # of cost_function requires.
    mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
    pred_mean[mask1] = THRESHOLD
    costs.append(cost_function(y_val, pred_mean))

# All costs, and the combination with the lowest one
costs, grid[np.argmin(costs)]

## Combination of multiple Kernels

### RBF + Matern + WhiteNoise

In [None]:
# Search grids from earlier runs, kept for reference:
#
# params = {'nu': [1.5, 2.5],
#           'length_scale': [0.1, 0.5],
#           'noise_level': [0.01, 0.001],
#           'alpha': [0.01],
#           'n_components': [50, 100]}
#
# params = {'nu': [2.5, 1.5],
#           'length_scale': [0.5],
#           'noise_level': [0.001],
#           'alpha': [0.01, 0.01],
#           'n_components': [200]}
#           # 'offset': [-1, -2]

# Current grid for the RBF + Matern + White experiment: one candidate per
# hyperparameter, no 'offset' entry (disabled candidates: [-1, -2]).
params = dict(
    nu=[2.5],
    length_scale=[0.5],
    noise_level=[0.001],
    alpha=[0.01],
    n_components=[200],
)

# Expand into the list of concrete combinations and show them.
grid = ParameterGrid(params)
list(grid)

In [None]:
# Validation cost of every grid combination for an RBF + Matern + White kernel
# GP fitted on Nystroem features of the inputs. RBF and Matern share one length scale.
costs = []
# The loop variable is `combo`, not `params`, so the grid specification stored
# in `params` is not overwritten by the last combination.
for combo in tqdm(grid):
    kernel= RBF(length_scale=combo['length_scale']) + Matern(length_scale=combo['length_scale'], nu=combo['nu']) + WhiteKernel(noise_level=combo['noise_level'])
    gpr_p = pipeline.Pipeline([
                    ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=combo['n_components'])),
                    ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'], n_restarts_optimizer=5,
                                                     random_state=0, normalize_y=True))])
    gpr_p.fit(X_train, y_train)
    pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)
    # The raw predictive mean is scored; offset shift and threshold clamp are disabled:
#     pred_mean += combo['offset']
#     mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
#     pred_mean[mask1] = THRESHOLD
    costs.append(cost_function(y_val, pred_mean))

# All costs, and the combination with the lowest one
costs, grid[np.argmin(costs)]

### RationalQuadratic + Matern + WhiteNoise

In [None]:
# Search grid from an earlier run, kept for reference:
#
# params = {'nu': [1.5, 2.5],
#           'length_scale': [0.1, 0.5],
#           'noise_level': [0.01, 0.001],
#           'alpha': [0.01],
#           'n_components': [50, 100]}

# Current grid, several candidates per hyperparameter and no 'offset' entry
# (disabled candidates: [-1, -2]).
params = dict(
    nu=[2.5],
    length_scale=[0.5, 0.1],
    noise_level=[0.001],
    alpha=[0.001, 0.01],
    mixture=[0.5, 1, 1.5],
    n_components=[200, 100],
)

# Expand into the list of concrete combinations and show them.
grid = ParameterGrid(params)
list(grid)

In [None]:
# Validation cost of every grid combination for a RationalQuadratic + Matern +
# White kernel GP fitted on Nystroem features of the inputs. Both kernels share
# one length scale; 'mixture' is passed as RationalQuadratic's alpha.
costs = []
# The loop variable is `combo`, not `params`, so the grid specification stored
# in `params` is not overwritten by the last combination.
for combo in tqdm(grid):
    kernel= RationalQuadratic(length_scale=combo['length_scale'], alpha = combo['mixture']) + Matern(length_scale=combo['length_scale'], nu=combo['nu']) + WhiteKernel(noise_level=combo['noise_level'])
    gpr_p = pipeline.Pipeline([
                    ("nystrom", Nystroem(kernel = kernel, random_state=0, n_components=combo['n_components'])),
                    ("gpr", GaussianProcessRegressor(kernel=kernel, alpha = combo['alpha'], n_restarts_optimizer=5,
                                                     random_state=0, normalize_y=True))])
    gpr_p.fit(X_train, y_train)
    pred_mean, pred_std = gpr_p.predict(X_val, return_std=True)
    # The raw predictive mean is scored; offset shift and threshold clamp are disabled:
#     pred_mean += combo['offset']
#     mask1 = ((THRESHOLD - pred_mean >= 0) & (THRESHOLD - pred_mean <= 5))
#     pred_mean[mask1] = THRESHOLD
    costs.append(cost_function(y_val, pred_mean))

# All costs, and the combination with the lowest one
costs, grid[np.argmin(costs)]