# Gaussian Process
#### Replication of Adrianna's methods using LAQN data retrieved through the LAQN API and applied to multiple pollutants

In [2]:
## load in relevant packages
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import gpflow

np.random.seed(1)

### Access & Process Data 
#### Read in data downloaded from LAQN API in download_data.py

In [1]:
import pandas as pd  

## read in hourly data of given pollutant
pollutant = "O3"
hourly_df = pd.read_csv("../data/" + pollutant + "_data.csv", sep=',', parse_dates=["Timestamp"]).set_index("Timestamp")

print("Hourly Data of ", pollutant)
hourly_df.head()
# convert CSV to DF

Hourly Data of  O3


Unnamed: 0_level_0,O3,SiteCode,Latitude,Longitude
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-12-01 00:00:00,71.8,TD0,51.424304,-0.345715
2007-12-01 01:00:00,71.2,TD0,51.424304,-0.345715
2007-12-01 02:00:00,67.9,TD0,51.424304,-0.345715
2007-12-01 03:00:00,65.4,TD0,51.424304,-0.345715
2007-12-01 04:00:00,61.9,TD0,51.424304,-0.345715


In [2]:
daily_df = hourly_df.groupby(by=["SiteCode"]).resample("D").mean().reset_index()
daily_df["date"] = daily_df["Timestamp"].apply(lambda x: x.strftime("%Y-%m-%d"))
daily_df.set_index("date", inplace=True)

print("Daily Data of ", pollutant)
daily_df.head()

Daily Data of  O3


Unnamed: 0_level_0,SiteCode,Timestamp,O3,Latitude,Longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2007-12-01,BL0,2007-12-01,49.979167,51.522287,-0.125848
2007-12-02,BL0,2007-12-02,51.729167,51.522287,-0.125848
2007-12-03,BL0,2007-12-03,25.225,51.522287,-0.125848
2007-12-04,BL0,2007-12-04,19.604167,51.522287,-0.125848
2007-12-05,BL0,2007-12-05,35.7875,51.522287,-0.125848


In [3]:
monthly_df = hourly_df.groupby(by=["SiteCode"]).resample("M", convention="start").mean().reset_index()
monthly_df["date"] = monthly_df["Timestamp"].apply(lambda x: x.strftime("%Y-%m"))
monthly_df.set_index("date", inplace=True)

print("Monthly Data of ", pollutant)
monthly_df.head()

Monthly Data of  O3


Unnamed: 0_level_0,SiteCode,Timestamp,O3,Latitude,Longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2007-12,BL0,2007-12-31,20.212113,51.522287,-0.125848
2008-01,BL0,2008-01-31,27.960894,51.522287,-0.125848
2008-02,BL0,2008-02-29,19.813362,51.522287,-0.125848
2008-03,BL0,2008-03-31,36.158199,51.522287,-0.125848
2008-04,BL0,2008-04-30,38.231154,51.522287,-0.125848


In [4]:
yearly_df = hourly_df.groupby(by=["SiteCode"]).resample("Y", convention="start").mean().reset_index()
yearly_df["date"] = yearly_df["Timestamp"].apply(lambda x: x.strftime("%Y"))
# yearly_df.set_index("date", inplace=True)

print("Yearly Data of ", pollutant)
yearly_df.head()

Yearly Data of  O3


Unnamed: 0,SiteCode,Timestamp,O3,Latitude,Longitude,date
0,BL0,2007-12-31,20.212113,51.522287,-0.125848,2007
1,BL0,2008-12-31,28.031154,51.522287,-0.125848,2008
2,BL0,2009-12-31,25.668282,51.522287,-0.125848,2009
3,BL0,2010-12-31,22.333717,51.522287,-0.125848,2010
4,BL0,2011-12-31,27.074341,51.522287,-0.125848,2011


In [5]:
# get usable data: 2015 and above
df = yearly_df.loc[yearly_df["date"] >= "2015"]
df.head()

Unnamed: 0,SiteCode,Timestamp,O3,Latitude,Longitude,date
8,BL0,2015-12-31,30.758933,51.522287,-0.125848,2015
9,BL0,2016-12-31,25.429808,51.522287,-0.125848,2016
10,BL0,2017-12-31,29.384156,51.522287,-0.125848,2017
11,BL0,2018-12-31,34.801808,51.522287,-0.125848,2018
12,BL0,2019-12-31,36.350386,51.522287,-0.125848,2019


#### Create Training & Testing Datasets

In [6]:
## divide into features and variable
params = ['date', 'Latitude', 'Longitude']

X = df[params].values
y = df.loc[:, pollutant].values
y = y.reshape(-1,1)

## print previews
print("Model Features:")
print(X.shape)
print(X[1:10,:])

print("--------------------")

print("Variables: ", pollutant)
print(y.shape)
print(y[0:10])


Model Features:
(160, 3)
[['2016' 51.522287 -0.125848]
 ['2017' 51.522287 -0.125848]
 ['2018' 51.522287 -0.125848]
 ['2019' 51.522287 -0.125848]
 ['2020' 51.522287 -0.125848]
 ['2021' 51.522287 -0.125848]
 ['2015' 51.4946486813055 0.137279111232178]
 ['2016' 51.4946486813055 0.137279111232178]
 ['2017' 51.4946486813055 0.137279111232178]]
--------------------
Variables:  O3
(160, 1)
[[30.75893312]
 [25.42980824]
 [29.38415635]
 [34.80180849]
 [36.35038615]
 [44.51012582]
 [42.41657855]
 [44.73705923]
 [37.0498008 ]
 [43.72014515]]


In [7]:
from sklearn.model_selection import train_test_split
## create validation dataset (no test set since using MLL)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=0) 

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(120, 3)
(120, 1)
(40, 3)
(40, 1)


In [8]:

from sklearn.preprocessing import StandardScaler  
feature_scaler = StandardScaler() 

## Normalize Y (after splitting into training and validation)

## standardize y-values
y_train = feature_scaler.fit_transform(y_train)
y_val = feature_scaler.fit_transform(y_val)

print(y_train.shape)
print(y_val.shape)

(120, 1)
(40, 1)


### Building GP model

In [9]:
import gpflow

In [1]:
print("hello")

k = gpflow.kernels.Constant()
gpflow.utilities.print_summary(k, fmt = "notebook")

hello


NameError: name 'gpflow' is not defined

In [2]:
import gpflow.kernels as kernels
from gpflow.utilities import print_summary

print("hello")
k = kernels.Constant() 
print_summary(k, fmt="notebook")

# k = SafeMatern52(lengthscales=1, active_dims=[0, 1]) + SafeMatern52(lengthscales=0.1, active_dims=[2])
#, lengthscales=[0.01, 0.01, 0.01])
# + gpflow.kernels.Periodic(gpflow.kernels.IsotropicStationary(), period=12)
#k = gpflow.kernels.Matern52(active_dims=[2]) + gpflow.kernels.Matern52(active_dims=[0, 1])
#k = SafeMatern52(active_dims=[2]) + SafeMatern52(active_dims=[0,1]) + gpflow.kernels.White(3)
#k = gpflow.kernels.White(3) +\
#            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) +\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#            gpflow.kernels.Periodic(gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0), period=12.0)
#k = gpflow.kernels.White(1)
#k = gpflow.kernels.White() + gpflow.kernels.Matern52(active_dims=[0]) + gpflow.kernels.Matern52(active_dims=[1]) \
#+ gpflow.kernels.Matern52(active_dims=[2])
#k = gpflow.kernels.SquaredExponential()
# k = gpflow.kernels.SquaredExponential() + gpflow.kernels.SquaredExponential()

# print("here")

In [1]:
##===================================================================================
import gpflow 
# define GP kernel parameters

## fullcovariance kernel(lat, long, time; lat', long', time') =
## RBF(lat, long; lat', long') + RBF(time; time') * periodic(time; time')


## kernel 0: White noise
## kernel 1: RBF space (spatial relationship)
## kernel 2: RBF time (short-term irregularities due to weather, error measurement)
## kernel 3: RBF time * Periodic time
# kernel = gpflow.kernels.White()
kernel = gpflow.kernels.White(1) +\
            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
            gpflow.kernels.Periodic(1, period=12., active_dims=[2])

## kernel 0: White noise
## kernel 1: RBF time (noise, should be shorter length-scale)
## kernel 1: RBF space
## kernel 2: RBF time (short)
## kernel 3: RBF time * Periodic time
# kernel1 = gpflow.kernels.White(1) +\
#             gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#             gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) +\
#             gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#             gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
#             gpflow.kernels.Periodic(1, period=12., active_dims=[2])

## OLD
## kernel 0: Constant
## kernel 1: RBF space * RBF time
## kernel 2: RBF time
## kernel 3: RBF time * Periodic time
#kernel1 = gpflow.kernels.Constant(1) +\
#            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) *\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
#            gpflow.kernels.Periodic(1, period=12., active_dims=[2])

## kernel 0: Constant
## kernel 1: RBF space * RBF time * Periodic time
## kernel 2: RBF space * time
# kernel2 = gpflow.kernels.Constant(1) +\
#             gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) *\
#             gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
#             gpflow.kernels.Periodic(1, period=12., active_dims=[2]) +\
#             gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) *\
#             gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0)

# if kernel_flag == 0:
#     selected_kernel = kernel0
# if kernel_flag == 1:
#     selected_kernel = kernel1
# if kernel_flag == 2:
#     selected_kernel = kernel2
print(kernel)
print_summary(kernel)

In [1]:
import gpflow
## build model
model = gpflow.models.GPR(X_train, y_train, kern=k)
model.likelihood.variance = 0.01

## view 
model.as_pandas_table()
print(model.as_pandas_table())
print("hello")

In [1]:
import gpflow

opt = gpflow.optimizers.Scipy()
opt_logs = opt.minimize(model.training_loss, model.trainable_variables, options=dict(maxiter=5))
