# Gaussian Process
#### Replication of Adrianna's methods using LAQN data retrieved through the LAQN API and applied to multiple pollutants

In [None]:
## load in relevant packages
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import gpflow



np.random.seed(1)

### Access & Process Data 
#### Read in data downloaded from LAQN API in download_data.py

In [8]:
import pandas as pd  

## read in hourly data of given pollutant
pollutant = "O3"
hourly_df = pd.read_csv("../data/" + pollutant + "_data.csv", sep=',', parse_dates=["Timestamp"]).set_index("Timestamp")

print("Hourly Data of ", pollutant)
hourly_df.head()
# convert CSV to DF

Unnamed: 0_level_0,O3,SiteCode,Latitude,Longitude
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-12-01 00:00:00,71.8,TD0,51.424304,-0.345715
2007-12-01 01:00:00,71.2,TD0,51.424304,-0.345715
2007-12-01 02:00:00,67.9,TD0,51.424304,-0.345715
2007-12-01 03:00:00,65.4,TD0,51.424304,-0.345715
2007-12-01 04:00:00,61.9,TD0,51.424304,-0.345715


In [10]:
daily_df = df.groupby(by=["SiteCode"]).resample("D").mean().reset_index()
daily_df["date"] = daily_df["Timestamp"].apply(lambda x: x.strftime("%Y-%m-%d"))
daily_df.set_index("date", inplace=True)
daily_df.head()

Unnamed: 0_level_0,SiteCode,Timestamp,O3,Latitude,Longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2007-12-01,BL0,2007-12-01,49.979167,51.522287,-0.125848
2007-12-02,BL0,2007-12-02,51.729167,51.522287,-0.125848
2007-12-03,BL0,2007-12-03,25.225,51.522287,-0.125848
2007-12-04,BL0,2007-12-04,19.604167,51.522287,-0.125848
2007-12-05,BL0,2007-12-05,35.7875,51.522287,-0.125848


#### Create Training & Testing Datasets

In [10]:
## divide into features and variable
params = ['Timestamp', 'SiteCode', 'Latitude', 'Longitude']

X = df[params].values
y = df.loc[:, pollutant].values
y = y.reshape(-1,1)

## print previews
print("Model Features:")
print(X.shape)
print(X[1:10,:])

print("--------------------")

print("Variables: ", pollutant)
print(y.shape)
print(y[0:10])


Model Features:
(5184309, 4)
[['2007-12-01 01:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 02:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 03:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 04:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 05:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 06:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 07:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 08:00' 'TD0' 51.4243043441456 -0.345714576446947]
 ['2007-12-01 09:00' 'TD0' 51.4243043441456 -0.345714576446947]]
--------------------
Variables:  O3
(5184309, 1)
[[71.8]
 [71.2]
 [67.9]
 [65.4]
 [61.9]
 [63.4]
 [62.9]
 [57.4]
 [58.4]
 [61.4]]


In [11]:
from sklearn.model_selection import train_test_split
## create validation dataset (no test set since using MLL)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=0) 

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(3888231, 4)
(3888231, 1)
(1296078, 4)
(1296078, 1)


In [12]:

from sklearn.preprocessing import StandardScaler  
feature_scaler = StandardScaler() 

## Normalize Y (after splitting into training and validation)

## standardize y-values
y_train = feature_scaler.fit_transform(y_train)
y_val = feature_scaler.fit_transform(y_val)

print(y_train.shape)
print(y_val.shape)

(3888231, 1)
(1296078, 1)


### Building GP model

In [14]:
##===================================================================================
import gpflow 
# define GP kernel parameters
kernel_flag = 0

## fullcovariance kernel(lat, long, time; lat', long', time') =
## RBF(lat, long; lat', long') + RBF(time; time') * periodic(time; time')

print('Model' + str(kernel_flag) + ' selected')

## kernel 0: White noise
## kernel 1: RBF space (spatial relationship)
## kernel 2: RBF time (short-term irregularities due to weather, error measurement)
## kernel 3: RBF time * Periodic time
kernel0 = gpflow.kernels.White(1) +\
            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
            gpflow.kernels.Periodic(1, period=12., active_dims=[2])

## kernel 0: White noise
## kernel 1: RBF time (noise, should be shorter length-scale)
## kernel 1: RBF space
## kernel 2: RBF time (short)
## kernel 3: RBF time * Periodic time
kernel1 = gpflow.kernels.White(1) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
            gpflow.kernels.Periodic(1, period=12., active_dims=[2])

## OLD
## kernel 0: Constant
## kernel 1: RBF space * RBF time
## kernel 2: RBF time
## kernel 3: RBF time * Periodic time
#kernel1 = gpflow.kernels.Constant(1) +\
#            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) *\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) +\
#            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
#            gpflow.kernels.Periodic(1, period=12., active_dims=[2])

## kernel 0: Constant
## kernel 1: RBF space * RBF time * Periodic time
## kernel 2: RBF space * time
kernel2 = gpflow.kernels.Constant(1) +\
            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) *\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0) *\
            gpflow.kernels.Periodic(1, period=12., active_dims=[2]) +\
            gpflow.kernels.RBF(2, active_dims=[0,1], lengthscales=1.0) *\
            gpflow.kernels.RBF(1, active_dims=[2], lengthscales=1.0)

if kernel_flag == 0:
    selected_kernel = kernel0
if kernel_flag == 1:
    selected_kernel = kernel1
if kernel_flag == 2:
    selected_kernel = kernel2

In [2]:
import gpflow
## build model
model = gpflow.models.GPR(X_train, y_train, kern=selected_kernel)
model.likelihood.variance = 0.01

## view 
model.as_pandas_table()
print(model.as_pandas_table())
print("hello")