# Modelling SCOOT

Load the data (clean + normalise) then run an SVGP on each sensor individually.

In [41]:
import pandas as pd
import os
import math
import json
import copy
import numpy as np
import gpflow
import pickle
import calendar
import tensorflow as tf

from pathlib import Path
from datetime import datetime
from gpflow.utilities import print_summary

gpflow.config.set_default_summary_fmt("notebook")

# cleanair modules for scoot
from cleanair.scoot import (
    ScootQuery,
    parse_kernel,
    save_model_to_file,
    save_processed_data_to_file
)


## Setup filepaths

All data, results, figures and models are held in the `EXPERIMENT_DIR/NAME` directories where `NAME` is the name of your experiment.

```
EXPERIMENT_DIR/
    NAME/
        data/
            normal_scoot.csv
            lockdown_scoot.csv
            SCOOT_ID.npy
        results/
            KERNEL_ID
                lockdown_SCOOT_ID.npy
                normal_SCOOT_ID.npy
            ...
        models/
            KERNEL_ID
                lockdown_SCOOT_ID.m5
                normal_SCOOT_ID.m5
                ...
        figures/
            KERNEL_ID
                lockdown_SCOOT_ID.png
                normal_SCOOT_ID.png
                ...
        settings/
            kernel_settings.json
            scoot_settings.json
```
 Here I'm assuming each scoot detector is trained independently. If this changes we may need to change file structure (should be ok through use of `cleanair.scoot.util` helper functions.)

In [42]:
# give your experiment a useful name
name = "monday"

# setup filepaths
user_settings_fp = os.path.join("..", "..", "terraform", ".secrets", "user_settings.json")
with open(user_settings_fp) as json_file:
    user_settings = json.load(json_file)
secretfile = user_settings["secretfile"]
xpfp = user_settings["experiment_dir"]  # root to experiments filepaths directory
data_dir = os.path.join(xpfp, name, "data")
results_dir = os.path.join(xpfp, name, "results")
models_dir = os.path.join(xpfp, name, "models")
settings_dir = os.path.join(xpfp, name, "settings")

# make directories
Path(os.path.join(xpfp, name)).mkdir(exist_ok=True, parents=True)
Path(data_dir).mkdir(exist_ok=True)         # input data and processed training data
Path(results_dir).mkdir(exist_ok=True)      # predictions from model
Path(models_dir).mkdir(exist_ok=True)       # saving model status
Path(settings_dir).mkdir(exist_ok=True)     # for storing parameters

In [43]:
# if true, all input data will be read from a local file
read_data_from_file = True
save_data_to_file = False

# choose a start and end date for querying "normal traffic" period
normal_start = "2020-02-10 00:00:00"
normal_end = "2020-02-24 00:00:00"

# choose a start and end date for querying "lockdown traffic" period
lockdown_start = "2020-03-16 00:00:00"
lockdown_end = "2020-03-30 00:00:00"

# columns to analyse
columns = ["n_vehicles_in_interval"]

# seeds
gpflow.config.set_default_float(np.float64)
np.random.seed(0)
tf.random.set_seed(0)

## Detector readings

You can read scoot either from the DB or from a local file. Make sure you have set `read_data_from_file` and `save_data_to_file` correctly before running the below cell.

In [44]:
normal_fp = os.path.join(data_dir, "normal_scoot.csv")
lockdown_fp = os.path.join(data_dir, "lockdown_scoot.csv")

if read_data_from_file:
    # read data from csv
    normal_df = pd.read_csv(normal_fp)
    lockdown_df = pd.read_csv(lockdown_fp)
else:
    # create an object for querying from DB
    SQ = ScootQuery(secretfile=secretfile)
    # read the data from DB
    normal_df = SQ.get_all_readings(
        start_datetime=normal_start,
        end_datetime=normal_end
    )
    lockdown_df = SQ.get_all_readings(
        start_datetime=lockdown_start,
        end_datetime=lockdown_end
    )
    # save the data to csv if required
    if save_data_to_file:
        normal_df.to_csv(normal_fp)
        lockdown_df.to_csv(lockdown_fp)

## Data cleaning

    - Convert Datetime to epoch
    - Add normalised/standardised columns
    - Get a dataframe for only a subset of sensors and for given time range.

In [45]:
def normalise(x):
    """Standardize all columns individually"""
    return (x - np.mean(x, axis=0)) / np.std(x, axis=0)

def denormalise(x, wrt_y):
    """Denormalize x given the original data it was standardized to"""
    return ( x * np.std(wrt_y, axis=0) ) + np.mean(wrt_y, axis=0)

def clean_and_normalise_df(df: pd.DataFrame):
    """Normalise lat, lon, epoch."""
    df['measurement_start_utc'] = pd.to_datetime(df['measurement_start_utc'])
    df['weekday'] = df['measurement_start_utc'].dt.dayofweek
    df['weekend'] = (df.weekday // 5 == 1).astype(float)
    df['epoch'] = df['measurement_start_utc'].astype('int64')//1e9 #convert to epoch
    df['epoch_norm'] = normalise(df['epoch'])
    df['lat_norm'] = normalise(df['lat'])
    df['lon_norm'] = normalise(df['lon'])
    return df

def filter_df(df: pd.DataFrame, detector_list: list, start: str, end: str):
    """
    Return a dataframe that only contains sensors in the list
    and only contains observations between the start and end datetime.
    """
    return df.loc[
        (df['detector_id'].isin(detector_list)) &
        (df["measurement_start_utc"] >= start) &
        (df["measurement_start_utc"] < end)
    ]

In [93]:
# choose a start and end datetime to filter by
start_normal_interval = "2020-02-10 00:00:00"
# end_normal_interval = "2020-02-17 00:00:00"
end_normal_interval = "2020-02-11 00:00:00"
start_lockdown_interval = "2020-03-16 00:00:00"
end_lockdown_interval = "2020-03-23 00:00:00"

# get list of detectors from json file to filter by
detector_list = ["N00/002e1","N00/002g1","N13/016a1"]
# detector_list = list(np.unique(normal_df['detector_id']))   # all scoot detectors

# clean data and normalise
# TODO: IMPORTANT - normalisation should be same for normal and lockdown periods.
normal_df = clean_and_normalise_df(normal_df)
lockdown_df = clean_and_normalise_df(lockdown_df)

# filter normal and lockdown dataframes by interval dates and same detectors
normal_interval_df = filter_df(normal_df, detector_list, start_normal_interval, end_normal_interval)
lockdown_interval_df = filter_df(lockdown_df, detector_list, start_lockdown_interval, end_lockdown_interval)

normal_interval_df.sample(3)

Unnamed: 0.1,Unnamed: 0,detector_id,lon,lat,measurement_start_utc,measurement_end_utc,n_vehicles_in_interval,occupancy_percentage,congestion_percentage,saturation,weekday,weekend,epoch,epoch_norm,lat_norm,lon_norm
57022,57022,N13/016a1,-0.002975,51.551367,2020-02-10 07:00:00,2020-02-10 08:00:00,62,1.285546,0.0,32.15,0,0.0,1581318000.0,-1.692983,0.773122,0.892927
140560,140560,N13/016a1,-0.002975,51.551367,2020-02-10 18:00:00,2020-02-10 19:00:00,87,2.191358,-0.037037,44.333333,0,0.0,1581358000.0,-1.578419,0.773122,0.892927
90750,90750,N00/002g1,-0.102058,51.513892,2020-02-10 12:00:00,2020-02-10 13:00:00,258,8.726597,1.533333,123.266667,0,0.0,1581336000.0,-1.640908,0.137691,0.155247


In [94]:
id = "N00/002e1"
day_df = normal_interval_df.loc[normal_df.detector_id == id]
day_df = day_df.drop(day_df.columns[0], axis=1)
day_df.to_csv(os.path.join("/Users","pohara", "Data", "scoot_profiles", "10Feb_"+id.replace("/","_")+".csv"))

## Save kernel and data settings

## Split up into numpy arrays

In [47]:
# def get_X(df):
#     return np.array(df[['epoch_norm', 'lon_norm', 'lat_norm', 'weekday', 'weekend']])

def get_X(df)
    return np.array(df[['epoch_norm', 'lon_norm', 'lat_norm']])

def get_Y(df):
    return np.array(df[['n_vehicles_in_interval']])

SyntaxError: invalid syntax (<ipython-input-47-7e1920ed2189>, line 4)

In [48]:
normal_group = normal_interval_df.groupby("detector_id")
lockdown_group = lockdown_interval_df.groupby("detector_id")

# list of dfs for all sensors
normal_df_list = [normal_group.get_group(id) for id in detector_list] 
lockdown_df_list = [lockdown_group.get_group(id) for id in detector_list]

# get list of numpy arrays for each dataframe
X_arr = [get_X(df) for df in normal_df_list] # |Number of scoot sensors| x N_i x D
Y_arr = [get_Y(df) for df in normal_df_list] # |Number of scoot sensors| x N_i x 1

## Fit LGCP model for each sensor

The input $X$ is time epoch, lat, lon and output $Y$ is the integer `n_vehicles_in_interval`.

NOTE for 2 days of scoot data there are approx 400000 observations

In [49]:
## Optimization functions - train the model for the given epochs
optimizer = tf.keras.optimizers.Adam(0.001)
def optimization_step(model: gpflow.models.SVGP, X, Y):
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(model.trainable_variables)
        obj = -model.elbo(X, Y)
        grads = tape.gradient(obj, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
def simple_training_loop(X, Y, model: gpflow.models.SVGP, epochs: int = 1, logging_epoch_freq: int = 10, num_batches_per_epoch: int = 10):
    tf_optimization_step = tf.function(optimization_step)
    for epoch in range(epochs):
        for _ in range(num_batches_per_epoch):
            tf_optimization_step(model, X, Y)

        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            tf.print(f"Epoch {epoch_id}: ELBO (train) {model.elbo(X,Y)}")


In [50]:
## Given the data and the specific sensor this function optimise the ELBO and plot the results 
def train_sensor_model(X, Y, kernel, epochs = 100, logging_epoch_freq = 10, M=10, inducing_point_method="random"):
    
    ## To remove newaxis when more features
    num_features = X[:,0][:,np.newaxis].shape[0]
    
    X = tf.convert_to_tensor(X[:,0][:,np.newaxis])
    Y = tf.convert_to_tensor(Y.astype(np.float64))

    # ToDo : number of rows
    if M == X.shape[0]:
        ind_points = X
    elif inducing_point_method == "random":
        # randomly select 
        ind_points = tf.random.shuffle(X)[:M]
    else:
        # select of regular grid
        ind_points = tf.expand_dims(
            tf.linspace(np.min(X_arr[0][:,0]), np.max(X_arr[0][:,0]), M),1
        )
    
    lik = gpflow.likelihoods.Poisson()
    
    ## Add code for inducing inputs - Needed when we run on the full data
    model = gpflow.models.SVGP(kernel=kernel, likelihood=lik, inducing_variable=ind_points)
    
    ## Uncomment to see which variables are training and those that are not
    #print_summary(model)
    
    simple_training_loop(X, Y, model, epochs = epochs, 
                         logging_epoch_freq = logging_epoch_freq)

    return model,X

In [71]:
scoot_settings = dict(
  scoot_ids=detector_list,
  lockdown_start=start_lockdown_interval,
  lockdown_end=end_lockdown_interval,
  normal_start=start_normal_interval,
  normal_end=end_normal_interval,
  columns=columns,
)

# periodic with 0.5, lengthscale ...
# periodic with rbf with params from virgi
# periodic x matern32/12 shouldn't work
# periodic with matern52 play
# periodic with 0.5 + rbf
kernel_settings = {  
    "matern52_ls=0.1_v=0.1": {
        "name":"matern52",
        "hyperparameters":{
            "lengthscale":0.1,
            "variance": 0.1
        }
    },
    "matern52_ls=0.1_v=1": {
        "name":"matern52",
        "hyperparameters":{
            "lengthscale":0.1,
            "variance": 1
        }
    },
    "matern52_ls=1_v=0.1": {
        "name":"matern52",
        "hyperparameters":{
            "lengthscale":1,
            "variance": 0.1
        }
    },
    "matern52_ls=1_v=1": {
        "name":"matern52",
        "hyperparameters":{
            "lengthscale":1,
            "variance": 1
        }
    }
    # "matern52": {
    #     "name": "matern52",
    #     "hyperparameters": {}
    # },
    # "periodic": {               # periodic with hand tuned params
    #     "name": "periodic",
    #     "hyperparameters": {
    #         "period": 0.5,
    #         "lengthscale": 0.7,
    #         "variance": 4.5
    #     }
    # },
    # "matern12": {
    #     "name": "matern12",
    #     "hyperparameters": {
            
    #     }
    # }
}

# get the settings for kernels and scoot data
try:
    with open(os.path.join(settings_dir, "kernel_settings.json"), "r+") as kernel_file:
        current_settings = json.load(kernel_file)
        current_settings.update(kernel_settings)
        kernel_file.seek(0)
        json.dump(current_settings, kernel_file)
except FileNotFoundError:
    with open(os.path.join(settings_dir, "kernel_settings.json"), "w") as kernel_file:
        json.dump(kernel_settings, kernel_file)
with open(os.path.join(settings_dir, "scoot_settings.json"), "w") as scoot_file:
    json.dump(scoot_settings, scoot_file)

## Run entire training routine

In [72]:
# setup parameters
epochs = 2000
logging_epoch_freq = 100
M = 24      # number of inducing points

# loop through list of sensor. train model for each sensor
for i in range(len(detector_list)):
    for kernel_id in kernel_settings:
        detector_id = detector_list[i]
        # get a kernel from json/dict/list
        kernel = parse_kernel(kernel_settings[kernel_id])

        # train model
        model, Xtest = train_sensor_model(
            X_arr[i], Y_arr[i], kernel, epochs, logging_epoch_freq, M=M
        )

        # save model and processed data to file
        save_model_to_file(model, name, kernel_id, detector_id, xp_root=xpfp)
        save_processed_data_to_file(X_arr[i], Y_arr[i], name, detector_id, xp_root=xpfp)


rain) -166.46951184268812
Epoch 700: ELBO (train) -162.78494177502927
Epoch 800: ELBO (train) -159.3672413178486
Epoch 900: ELBO (train) -156.0258902094483
Epoch 1000: ELBO (train) -152.95495978297325
Epoch 1100: ELBO (train) -150.3869083111983
Epoch 1200: ELBO (train) -148.54496892506842
Epoch 1300: ELBO (train) -147.1850314217212
Epoch 1400: ELBO (train) -146.26474928471086
Epoch 1500: ELBO (train) -145.6316260312551
Epoch 1600: ELBO (train) -145.16211414679702
Epoch 1700: ELBO (train) -144.82272117236727
Epoch 1800: ELBO (train) -144.60044002989312
Epoch 1900: ELBO (train) -144.27960767794937
Epoch 2000: ELBO (train) -144.07901144524402
matern32
{'lengthscale': 1, 'variance': 1}
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected exactly one node node, found [<gast.gast.FunctionDef object at 0x7fbc08a9d810>, <gast.gast.Return object at 0x7fbc08a9db90>]
Please r