In [1]:
import pandas as pd
import os
import math
import json
import copy
import numpy as np
import gpflow
import pickle
import calendar

import tensorflow as tf

from pathlib import Path
from datetime import datetime
from gpflow.utilities import print_summary

gpflow.config.set_default_summary_fmt("notebook")

# cleanair modules for scoot
from cleanair.scoot import (
    sample_n,
    ScootQuery,
    sample_intensity,
    plotly_results,
    choose_kernel,
    save_model_to_file,
    save_processed_data_to_file
)


## Setup filepaths

All data, results, figures and models are held in the `experiments/NAME` directories where `NAME` is the name of your experiment.

```
experiments/
    NAME/
        data/
            normal_scoot.csv
            lockdown_scoot.csv
        results/
            lockdown_SCOOT_ID.npy
            normal_SCOOT_ID.npy
            ...
        models/
            lockdown_SCOOT_ID.m5
            normal_SCOOT_ID.m5
            ...
        figures/
            lockdown_SCOOT_ID.png
            normal_SCOOT_ID.png
            ...
        settings/
            kernel_settings.json
            scoot_settings.json
```
Here I'm assuming each scoot detector is trained independently. If this changes, we may need to change the file structure (this should be OK through use of the `cleanair.scoot.util` helper functions).

In [2]:
# Name of the experiment: every artefact lives under <xpfp>/<name>/
name = "test"

# Database secrets file and the root directory shared by all experiments
secretfile = "../../terraform/.secrets/db_traffic.json"
xpfp = "./experiments"  # root to experiments filepaths directory

# One sub-directory per kind of artefact
data_dir = os.path.join(xpfp, name, "data")          # input data and processed training data
results_dir = os.path.join(xpfp, name, "results")    # predictions from model
models_dir = os.path.join(xpfp, name, "models")      # saving model status
settings_dir = os.path.join(xpfp, name, "settings")  # for storing parameters

# Create the experiment root (with any missing parents), then each sub-directory
Path(xpfp, name).mkdir(parents=True, exist_ok=True)
for _subdir in (data_dir, results_dir, models_dir, settings_dir):
    Path(_subdir).mkdir(exist_ok=True)

# Load the kernel and scoot settings; both JSON files must already exist in settings_dir
with Path(settings_dir, "kernel_settings.json").open() as kernel_file:
    kernel_settings = json.load(kernel_file)
with Path(settings_dir, "scoot_settings.json").open() as scoot_file:
    scoot_settings = json.load(scoot_file)

In [3]:
# Data-source switches: when read_data_from_file is true, all input data is read from a local file
read_data_from_file = True
save_data_to_file = False

# Start and end datetimes for querying the "normal traffic" period
normal_start, normal_end = "2020-02-10 00:00:00", "2020-02-24 00:00:00"

# Start and end datetimes for querying the "lockdown traffic" period
lockdown_start, lockdown_end = "2020-03-16 00:00:00", "2020-03-30 00:00:00"

# Columns to analyse
columns = [
    "n_vehicles_in_interval",
    "occupancy_percentage",
    "congestion_percentage",
    "saturation",
]

## Detector readings

You can read the scoot data either from the DB or from a local file. Make sure you have set `read_data_from_file` and `save_data_to_file` correctly before running the cell below.

In [4]:
# Local CSV locations for the two periods
normal_fp = os.path.join(data_dir, "normal_scoot.csv")
lockdown_fp = os.path.join(data_dir, "lockdown_scoot.csv")

if read_data_from_file:
    # read data from csv
    normal_df = pd.read_csv(normal_fp)
    lockdown_df = pd.read_csv(lockdown_fp)
else:
    # create an object for querying from DB
    SQ = ScootQuery(secretfile=secretfile)
    # read the data from DB
    normal_df = SQ.get_all_readings(
        start_datetime=normal_start,
        end_datetime=normal_end
    )
    lockdown_df = SQ.get_all_readings(
        start_datetime=lockdown_start,
        end_datetime=lockdown_end
    )
    # save the data to csv if required
    if save_data_to_file:
        # index=False: do not write the row index. Otherwise each save/read
        # round-trip through pd.read_csv adds another "Unnamed: 0" column.
        normal_df.to_csv(normal_fp, index=False)
        lockdown_df.to_csv(lockdown_fp, index=False)

## Data cleaning

- Convert datetime to epoch
- Add normalised/standardised columns
- Get a dataframe for only a subset of sensors and for a given time range.

In [5]:
def normalise(x):
    """
    Standardise all columns individually to zero mean and unit variance.

    Mean and standard deviation are taken along axis 0. A column whose
    standard deviation is zero (i.e. a constant column) is scaled by 1 instead,
    so it maps to zeros rather than NaN/inf; `denormalise` still recovers the
    original constant from such a column.
    """
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    if np.ndim(std) == 0:
        # 1-D input: a single scalar standard deviation
        scale = 1.0 if std == 0 else std
    else:
        # 2-D input: one standard deviation per column
        scale = np.where(np.asarray(std) == 0, 1.0, std)
    return (x - mean) / scale

def denormalise(x, wrt_y):
    """
    Undo a standardisation.

    `x` holds standardised values and `wrt_y` is the original data whose
    axis-0 mean and standard deviation were used to standardise them.
    """
    sigma = np.std(wrt_y, axis=0)
    mu = np.mean(wrt_y, axis=0)
    rescaled = x * sigma
    return rescaled + mu

def clean_and_normalise_df(df: pd.DataFrame):
    """
    Parse timestamps, derive an epoch column and add standardised columns.

    The frame is modified in place and also returned. `measurement_start_utc`
    is converted with `pd.to_datetime`, `epoch` is derived from it, and
    `epoch_norm`, `lat_norm` and `lon_norm` hold the `normalise`d versions of
    `epoch`, `lat` and `lon`.
    """
    timestamps = pd.to_datetime(df['measurement_start_utc'])
    df['measurement_start_utc'] = timestamps
    # Integer view of the timestamps floor-divided by 1e9; the result is float
    # because 1e9 is a float. (Seconds, assuming nanosecond resolution - TODO confirm.)
    df['epoch'] = timestamps.astype('int64') // 1e9
    for source_column in ('epoch', 'lat', 'lon'):
        df[source_column + '_norm'] = normalise(df[source_column])
    return df

def filter_df(df: pd.DataFrame, detector_list: list, start: str, end: str):
    """
    Select the rows of `df` for the given detectors and time window.

    A row is kept when its `detector_id` is in `detector_list` and its
    `measurement_start_utc` lies in the half-open interval [start, end).
    """
    wanted_detector = df['detector_id'].isin(detector_list)
    on_or_after_start = df["measurement_start_utc"] >= start
    before_end = df["measurement_start_utc"] < end
    keep = wanted_detector & on_or_after_start & before_end
    return df.loc[keep]

In [6]:
# Time window (start inclusive, end exclusive) used to filter each period
start_normal_interval, end_normal_interval = "2020-02-10 00:00:00", "2020-02-17 00:00:00"
start_lockdown_interval, end_lockdown_interval = "2020-03-16 00:00:00", "2020-03-23 00:00:00"

# Detectors to keep, taken from the scoot settings json file
detector_list = scoot_settings["scoot_ids"]
# detector_list = list(np.unique(normal_df['detector_id']))   # all scoot detectors

# Clean and normalise each period.
# TODO: IMPORTANT - normalisation should be same for normal and lockdown periods.
normal_df = clean_and_normalise_df(normal_df)
lockdown_df = clean_and_normalise_df(lockdown_df)

# Restrict both periods to the same detectors and to their respective intervals
normal_interval_df = filter_df(
    df=normal_df,
    detector_list=detector_list,
    start=start_normal_interval,
    end=end_normal_interval,
)
lockdown_interval_df = filter_df(
    df=lockdown_df,
    detector_list=detector_list,
    start=start_lockdown_interval,
    end=end_lockdown_interval,
)

# Show a few random rows as a sanity check
normal_interval_df.sample(3)

Unnamed: 0.1,Unnamed: 0,detector_id,lon,lat,measurement_start_utc,measurement_end_utc,n_vehicles_in_interval,occupancy_percentage,congestion_percentage,saturation,epoch,epoch_norm,lat_norm,lon_norm
618028,618028,N00/002e1,-0.107637,51.514252,2020-02-12 19:00:00,2020-02-12 20:00:00,300,10.026254,0.0,106.293103,1581534000.0,-1.113537,0.142769,0.114531
1451217,1451217,N00/002g1,-0.102058,51.513892,2020-02-16 07:00:00,2020-02-16 08:00:00,80,1.872896,0.0,21.083333,1581836000.0,-0.297813,0.136659,0.156069
955567,955567,N00/002g1,-0.102058,51.513892,2020-02-14 05:00:00,2020-02-14 06:00:00,144,9.918155,5.982143,43.589286,1581656000.0,-0.783363,0.136659,0.156069


## Split up into numpy arrays

In [7]:
def get_X(df):
    """Return the normalised epoch, lon and lat columns of `df` as an N x 3 numpy array."""
    feature_columns = ['epoch_norm', 'lon_norm', 'lat_norm']
    features = df[feature_columns]
    return np.array(features)

def get_Y(df):
    """Return the `n_vehicles_in_interval` column of `df` as an N x 1 numpy array."""
    target_columns = ['n_vehicles_in_interval']
    targets = df[target_columns]
    return np.array(targets)

In [8]:
# Group each period's readings by detector
normal_group = normal_interval_df.groupby("detector_id")
lockdown_group = lockdown_interval_df.groupby("detector_id")

# One dataframe per detector, in the same order as detector_list
normal_df_list = [normal_group.get_group(detector) for detector in detector_list]
lockdown_df_list = [lockdown_group.get_group(detector) for detector in detector_list]

# Numpy arrays for the normal period only
X_arr = list(map(get_X, normal_df_list))  # |Number of scoot sensors| x N_i x D
Y_arr = list(map(get_Y, normal_df_list))  # |Number of scoot sensors| x N_i x 1

## Fit LGCP model for each sensor

The input $X$ holds the normalised time epoch, lon and lat (although currently only the epoch column is used for training), and the output $Y$ is the integer `n_vehicles_in_interval`.

NOTE: for 2 days of scoot data there are approximately 400,000 observations.

In [9]:
## Set GPflow's default float type, then set the random seeds
# GPflow default float is set to float64
gpflow.config.set_default_float(np.float64)
# Seed numpy and tensorflow with the same fixed value
np.random.seed(0)
tf.random.set_seed(0)

In [10]:
## Optimization functions - train the model for the given epochs
optimizer = tf.keras.optimizers.Adam(0.001)
def optimization_step(model: gpflow.models.SVGP, X, Y):
    """
    Apply one Adam update to the model's trainable variables.

    The objective is the negative ELBO of `model` evaluated on (X, Y). The
    update uses the module-level `optimizer`.
    """
    variables = model.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        # Automatic watching is disabled, so watch the trainable variables explicitly
        tape.watch(variables)
        obj = -model.elbo(X, Y)
    # Take the gradient after leaving the recording context: calling it inside
    # is only needed for higher-order derivatives.
    grads = tape.gradient(obj, variables)
    optimizer.apply_gradients(zip(grads, variables))
    
def simple_training_loop(X, Y, model: gpflow.models.SVGP, epochs: int = 1, logging_epoch_freq: int = 10, num_batches_per_epoch: int = 10):
    """
    Train `model` on (X, Y) for `epochs` epochs.

    Each epoch runs `num_batches_per_epoch` optimisation steps, and every step
    is given the full X and Y. The training ELBO is printed every
    `logging_epoch_freq` epochs.
    """
    compiled_step = tf.function(optimization_step)
    for epoch_id in range(1, epochs + 1):
        for _ in range(num_batches_per_epoch):
            compiled_step(model, X, Y)

        # Only report on every `logging_epoch_freq`-th epoch
        if epoch_id % logging_epoch_freq != 0:
            continue
        tf.print(f"Epoch {epoch_id}: ELBO (train) {model.elbo(X,Y)}")


In [11]:
## Given the data for one sensor, build an SVGP model with a Poisson likelihood and optimise its ELBO
def train_sensor_model(X, Y, kernelsettings, epochs = 100, logging_epoch_freq = 10, plot=True):
    """
    Fit a model to the readings of a single sensor.

    Args:
        X: 2-D numpy array of inputs; only the first column is used for training.
        Y: numpy array of targets; cast to float64 before training.
        kernelsettings: settings passed to `choose_kernel` to build the kernel.
        epochs: number of training epochs.
        logging_epoch_freq: the ELBO is printed every this many epochs.
        plot: currently unused; kept so existing callers keep working.

    Returns:
        A tuple (model, X) where X is the single-column tensor used for training.
    """
    # Only the first feature column is used for now; newaxis keeps it 2-D (N x 1).
    # To remove newaxis when more features are used.
    X = tf.convert_to_tensor(X[:,0][:,np.newaxis])
    Y = tf.convert_to_tensor(Y.astype(np.float64))

    ## To pass it as a function arg
    k = choose_kernel(kernelsettings)

    lik = gpflow.likelihoods.Poisson()

    ## Every training input is passed as the inducing variable.
    ## TODO: add code for inducing inputs - needed when we run on the full data
    model = gpflow.models.SVGP(kernel = k, likelihood=lik, inducing_variable=X)

    ## Uncomment to see which variables are training and those that are not
    #print_summary(model)

    simple_training_loop(X, Y, model, epochs = epochs,
                         logging_epoch_freq = logging_epoch_freq)

    return model,X

# Run entire training routine

In [15]:
# Training parameters. The ELBO is only printed on epochs that are a multiple of
# logging_epoch_freq, so with these values no ELBO line is printed.
import logging
epochs = 10
logging_epoch_freq = 100

# Train one model per detector, then persist the model and the data it was trained on
for i, detector_id in enumerate(detector_list):
    model, Xtest = train_sensor_model(
        X=X_arr[i],
        Y=Y_arr[i],
        kernelsettings=kernel_settings,
        epochs=epochs,
        logging_epoch_freq=logging_epoch_freq,
    )

    save_model_to_file(model, name, detector_id)
    save_processed_data_to_file(X_arr[i], Y_arr[i], name, detector_id)


Using product of periodic and rbf kernels
Hyperparameters of periodic
{'period': 0.1}
Hyperparameters of rbf
{}
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected exactly one node node, found [<gast.gast.FunctionDef object at 0x7f7ef1b75510>, <gast.gast.Return object at 0x7f7ef1b75c50>]
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected exactly one node node, found [<gast.gast.FunctionDef object at 0x7f7efc98d710>, <gast.gast.Return object at 0x7f7efc98dd90>]
Using product of periodic and rbf kernels
Hyperparameters of periodic
{'period': 0.1}
Hyperparameters of rbf
{}
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expecte