In [54]:
## load in relevant packages
import numpy as np
import pandas as pd  
#import matplotlib.pyplot as pl
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

## seed NumPy's global random number generator so that any step drawing from it
## gives the same result on every run of the notebook
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)

In [120]:
##===================================================================================
## read in data
##
## Exactly one `data_path` assignment is active at a time; the alternatives are
## left commented out so the input file can be switched by hand.

## winter
## collapsed over time
#data_path = "../data/kcl_london_model_data_winter_collapsed.csv"
## aggregated over time
data_path = "../data/kcl_london_model_data_winter_agg_time.csv"

## not winter
## collapsed over time
#data_path = "../data/kcl_london_model_data_nowinter_collapsed.csv"
## aggregated over time
#data_path = "../data/kcl_london_model_data_nowinter_agg_time.csv"

## monthly data (2000-2019)
#data_path = "../data/kcl_london_model_data_monthly.csv"

## load the selected comma-separated file into a DataFrame
dta = pd.read_csv(data_path, sep=',')
##===================================================================================

In [121]:
## preview data: first rows and overall (rows, columns) shape
print(dta.head())
print(dta.shape)

## divide into features (the two coordinate columns) and target (the 'nox' column)
feature_cols = ['latitude', 'longitude']
X = dta.loc[:, feature_cols].to_numpy()
y = dta['nox'].to_numpy()

print(X.shape)
print(y.shape)


## print previews: the first ten targets, then feature rows 1 through 9
print(y[:10])
print(X[1:10])

                            site code   latitude  longitude   site_type  year  \
0               Heathrow Airport  LH2  51.479234  -0.440531  Industrial  2000   
1       Barnet - Tally Ho Corner  BN1  51.614675  -0.176607    Kerbside  2000   
2         Camden - Swiss Cottage  CD1  51.544219  -0.175284    Kerbside  2000   
3  Westminster - Marylebone Road  MY1  51.522540  -0.154590    Kerbside  2000   
4              Croydon - Norbury  CR5  51.411349  -0.123110    Kerbside  2000   

          nox  
0  148.090848  
1  199.026427  
2  217.054604  
3  457.892734  
4  235.985324  
(1981, 7)
(1981, 2)
(1981,)
[148.09084761 199.0264266  217.05460423 457.89273426 235.98532394
 433.92134367 239.16108482 167.38472428 184.71412974 161.47389498]
[[51.614675   -0.176607  ]
 [51.544219   -0.175284  ]
 [51.52254    -0.15459   ]
 [51.411349   -0.12311   ]
 [51.55834619  0.06999151]
 [51.593935    0.085516  ]
 [51.552264   -0.402779  ]
 [51.51074135 -0.37162352]
 [51.489321   -0.310002  ]]


In [126]:
## hold out 25% of the rows as a validation set
## (no separate test set since cross-validation is used on the training part);
## random_state is fixed so the split is the same on every run
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## report the shape of each piece, in the order: X_train, y_train, X_val, y_val
for split in (X_train, y_train, X_val, y_val):
    print(split.shape)

(1485, 2)
(1485,)
(496, 2)
(496,)


In [129]:
## Standardize the features (zero mean, unit variance per column).
## The RBF kernel used below has a single length scale shared by both inputs, so
## putting latitude and longitude on a common scale keeps that length scale meaningful.
## The scaler is fitted on the training split only and then applied unchanged to the
## validation split, so no validation statistics enter the fit.
## The target is not rescaled here; the baseline GP handles that itself through
## normalize_y=True.
## NOTE: this cell overwrites X_train / X_val; re-run the split cell before re-running it.
## (StandardScaler is imported with the other packages in the first cell.)
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_val = feature_scaler.transform(X_val)
print(X_train[1:5,:])
print(X_val[1:5,:])

(1485, 2)
[[ 1.16995822  1.93651833]
 [ 1.42255373 -1.51986064]
 [ 0.03918873 -0.82917594]
 [-2.09211187 -1.06546145]
 [-2.00818931  0.06906187]
 [-2.00818931  0.06906187]
 [ 0.20401868 -0.17958956]
 [ 0.28445892  0.02733805]
 [ 1.62651214 -0.30395794]]
[[ 0.69952956 -0.32300044]
 [ 1.89548528 -1.11041401]
 [ 0.23024601 -0.01530015]
 [ 1.16995822  1.93651833]
 [-0.36526168 -1.53115304]
 [-0.20738336 -2.02127878]
 [-0.19727365 -1.18250426]
 [-0.73650062 -0.42421379]
 [-2.22706774 -0.26339495]]


In [102]:
## baseline model: RBF kernel whose length scale is tuned by the GP's own optimizer
## within the bounds (1e-2, 1e2).
## The starting value has to lie inside those bounds (the previous value of 1000 did
## not); the features were standardized above, so a length scale of order 1 is a
## natural starting point.
kernel_0 = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel_0, normalize_y=True)
## despite its name this is a single number: the mean R^2 over the 5 CV folds
all_accuracies = cross_val_score(estimator=gp, X=X_train, y=y_train, cv=5, scoring='r2').mean()
print(all_accuracies)

0.18000194568260935


In [None]:
## search for best hyperparameters: sigma (the RBF length scale)
##
## optimizer=None keeps the kernel exactly as specified. With the default optimizer
## (and restarts) fit() re-tunes the length scale by maximizing the marginal
## likelihood, so the value swept here would only be a starting point and the
## grid search would not actually compare different length scales.
## normalize_y=True matches the baseline model, so the unit-amplitude kernel and
## alpha are on the same scale as the (standardized) target.
all_accuracies_mean = []   # mean R^2 over the CV folds, one entry per sigma
all_accuracies_std = []    # std of R^2 over the CV folds, one entry per sigma
sigmas = np.arange(start=0.1, stop=4.5, step=0.5)
for sigma in sigmas:
    kernel = RBF(length_scale=sigma)
    gp = GaussianProcessRegressor(kernel=kernel, alpha=1, optimizer=None, normalize_y=True)
    all_accuracies = cross_val_score(estimator=gp, X=X_train, y=y_train, cv=5, scoring='r2')
    all_accuracies_mean.append(all_accuracies.mean())
    all_accuracies_std.append(all_accuracies.std())
print(sigmas)
print(all_accuracies_mean)
print(all_accuracies_std) 

In [None]:
## search for best hyperparameters: alpha (value added to the kernel diagonal, i.e. noise level)
##
## best_sigma was never defined before: take the length scale with the highest
## mean cross-validated R^2 from the sigma search.
best_sigma = sigmas[int(np.argmax(all_accuracies_mean))]
print(best_sigma)

all_accuracies_al = []   # one array of per-fold R^2 scores per alpha
alphas = np.arange(start=0.1, stop=2, step=0.1)
for alpha in alphas:
    kernel = RBF(length_scale=best_sigma)
    ## optimizer=None keeps the length scale fixed at best_sigma instead of letting
    ## fit() re-tune it; normalize_y=True matches the baseline model
    gp = GaussianProcessRegressor(kernel=kernel, alpha=alpha, optimizer=None, normalize_y=True)
    all_accuracies_al.append(cross_val_score(estimator=gp, X=X_train, y=y_train, cv=5, scoring='r2'))
print(alphas)
## a plain list has no .mean()/.std(); reduce over the fold axis to get one value per alpha
print(np.mean(all_accuracies_al, axis=1))
print(np.std(all_accuracies_al, axis=1))

In [None]:
## Predict on the held-out validation split, with the predictive standard deviation.
## cross_val_score only fits clones of the estimator, so `gp` itself has not been
## fitted yet; calling predict() on it directly would return the GP prior.
## NOTE(review): `gp` is whatever estimator the last executed search cell left
## behind, not necessarily the best configuration — TODO rebuild it with the
## chosen hyperparameters before reporting results.
gp.fit(X_train, y_train)
## `x` was not defined anywhere; the scaled validation features are used instead
y_pred, sigma = gp.predict(X_val, return_std=True)