In [1]:
%matplotlib inline
import pandas as pd
import os
import math
import json
import copy
import numpy as np
import matplotlib.pyplot as plt
import gpflow
import pickle
import calendar
import tensorflow as tf

from pathlib import Path
from datetime import datetime
from gpflow.utilities import print_summary

gpflow.config.set_default_summary_fmt("notebook")

# plotly viz - use matplotlib if you prefer
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# cleanair modules for scoot
from cleanair.scoot import (
    sample_n,
    ScootQuery,
    sample_intensity,
    plotly_results,
    choose_kernel,
    save_model_and_metadata
)


In [2]:
# setup filepaths
secretfile = "../../terraform/.secrets/db_traffic.json"
xpfp = "./experiments"  # root to experiments filepaths directory
data_dir = os.path.join(xpfp, "data")
results_dir = os.path.join(xpfp, "results")
models_dir = os.path.join(xpfp, "models")
settings_dir = os.path.join(xpfp, "settings")

# make directories
Path(xpfp).mkdir(exist_ok=True)
Path(data_dir).mkdir(exist_ok=True)         # input data and processed training data
Path(results_dir).mkdir(exist_ok=True)      # predictions from model
Path(models_dir).mkdir(exist_ok=True)       # saving model status
Path(settings_dir).mkdir(exist_ok=True)     # for storing parameters

In [3]:
# if true, all input data will be read from a local file
read_data_from_file = True
save_data_to_file = False

# choose a start and end date for querying "normal traffic" period
normal_start = "2020-02-10 00:00:00"
normal_end = "2020-02-24 00:00:00"

# choose a start and end date for querying "lockdown traffic" period
lockdown_start = "2020-03-16 00:00:00"
lockdown_end = "2020-03-30 00:00:00"

# columns to analyse
columns = ["n_vehicles_in_interval", "occupancy_percentage", "congestion_percentage", "saturation"]

## Detector readings

You can read scoot either from the DB or from a local file. Make sure you have set `read_data_from_file` and `save_data_to_file` correctly before running the below cell.

In [5]:
normal_fp = os.path.join(data_dir, "normal_scoot.csv")
lockdown_fp = os.path.join(data_dir, "lockdown_scoot.csv")

if read_data_from_file:
    # read data from csv
    normal_df = pd.read_csv(normal_fp)
    lockdown_df = pd.read_csv(lockdown_fp)
else:
    # create an object for querying from DB
    SQ = ScootQuery(secretfile=secretfile)
    # read the data from DB
    normal_df = SQ.get_all_readings(
        start_datetime=normal_start,
        end_datetime=normal_end
    )
    lockdown_df = SQ.get_all_readings(
        start_datetime=lockdown_start,
        end_datetime=lockdown_end
    )
    # save the data to csv if required
    if save_data_to_file:
        normal_df.to_csv(normal_fp)
        lockdown_df.to_csv(lockdown_fp)

2020-04-01 12:04:48     INFO: Database connection information loaded from<_io.TextIOWrapper name='../../terraform/.secrets/db_traffic.json' mode='r' encoding='UTF-8'>


In [10]:
with open(os.path.join(settings_dir, "kernel_settings.json")) as kernel_file:
    kernel_settings = json.load(kernel_file)
with open(os.path.join(settings_dir, "scoot_settings.json")) as scoot_file:
    scoot_settings = json.load(scoot_file)

# Simple Data cleaning

    - Convert Datetime to epoch
    - Add normalised/standardised columns

In [11]:
def normalise(x):
    """Standardize all columns individually"""
    return (x - np.mean(x, axis=0)) / np.std(x, axis=0)

def denormalise(x, wrt_y):
    """Denormalize x given the original data it was standardized to"""
    return ( x * np.std(wrt_y, axis=0) ) + np.mean(wrt_y, axis=0)

def clean_and_normalise_df(df: pd.DataFrame):
    """Normalise lat, lon, epoch."""
    df['measurement_start_utc'] = pd.to_datetime(df['measurement_start_utc'])
    df['epoch'] = df['measurement_start_utc'].astype('int64')//1e9 #convert to epoch
    df['epoch_norm'] = normalise(df['epoch'])
    df['lat_norm'] = normalise(df['lat'])
    df['lon_norm'] = normalise(df['lon'])
    return df

def filter_df(df: pd.DataFrame, detector_list: list, start: str, end: str):
    """
    Return a dataframe that only contains sensors in the list
    and only contains observations between the start and end datetime.
    """
    return df.loc[
        (df['detector_id'].isin(detector_list)) &
        (df["measurement_start_utc"] >= start) &
        (df["measurement_start_utc"] < end)
    ]

In [24]:
# choose a start and end datetime to filter by
start_normal_interval = "2020-02-10 00:00:00"
end_normal_interval = "2020-02-17 00:00:00"
start_lockdown_interval = "2020-03-16 00:00:00"
end_lockdown_interval = "2020-03-23 00:00:00"

# get list of detectors from json file to filter by
detector_list = scoot_settings["scoot_ids"]
# detector_list = list(np.unique(normal_df['detector_id']))   # all scoot detectors

# clean data and normalise
# TODO: IMPORTANT - normalisation should be same for normal and lockdown periods.
normal_df = clean_and_normalise_df(normal_df)
lockdown_df = clean_and_normalise_df(lockdown_df)

# filter normal and lockdown dataframes by interval dates and same detectors
normal_interval_df = filter_df(normal_df, detector_list, start_normal_interval, end_normal_interval)
lockdown_interval_df = filter_df(lockdown_df, detector_list, start_lockdown_interval, end_lockdown_interval)

normal_interval_df.sample(3)

Unnamed: 0,detector_id,lon,lat,measurement_start_utc,measurement_end_utc,n_vehicles_in_interval,occupancy_percentage,congestion_percentage,saturation,epoch,epoch_norm,lat_norm,lon_norm
67954,N00/002g1,-0.102058,51.513892,2020-02-10 09:00:00,2020-02-10 10:00:00,238,41.052026,25.050847,197.338983,1581325000.0,-1.676775,0.136659,0.156069
1055111,N00/002e1,-0.107637,51.514252,2020-02-14 15:00:00,2020-02-14 16:00:00,248,7.938712,0.0,76.637931,1581692000.0,-0.686253,0.142769,0.114531
98340,N00/002e1,-0.107637,51.514252,2020-02-10 13:00:00,2020-02-10 14:00:00,252,8.434961,0.0,102.9,1581340000.0,-1.637931,0.142769,0.114531


### Helper functions

In [25]:
def get_X(df):
    return np.array(df[['epoch_norm', 'lon_norm', 'lat_norm']])

def get_Y(df):
    return np.array(df[['n_vehicles_in_interval']])

## Train scoot individually as a time series

In [27]:
normal_group = normal_interval_df.groupby("detector_id")
lockdown_group = lockdown_interval_df.groupby("detector_id")

# list of dfs for all sensors
normal_df_list = [normal_group.get_group(id) for id in detector_list] 
lockdown_df_list = [lockdown_group.get_group(id) for id in detector_list]

# get list of numpy arrays for each dataframe
X_arr = [get_X(df) for df in normal_df_list] # |Number of scoot sensors| x N_i x D
Y_arr = [get_Y(df) for df in normal_df_list] # |Number of scoot sensors| x N_i x 1

## Simple Time series plot

In [28]:
color_counts = 'C0'
label_counts = 'N'
color_estimated_counts = 'red'
label_estimated_counts = '$\hat{N}$'

In [29]:
index = 0
sensor_df = normal_df_list[index]

fig = go.Figure()

fig.add_trace(go.Scatter(x=sensor_df['measurement_start_utc'], y=sensor_df['n_vehicles_in_interval'],
                    mode='lines+markers',
                    name='lines+markers')
)

fig.update_layout(title='Timeseries of sensor {scoot_id}'.format(scoot_id=detector_list[index]),
                xaxis_title="Datetime",
                yaxis_title="# of vechicles per hour",
                font=dict(
                    size=16)
)

fig.show()

# Fit LGCP model for each sensor

In [30]:
## Set random seed
gpflow.config.set_default_float(np.float64)
np.random.seed(0)
tf.random.set_seed(0)

In [31]:
## Optimization functions - train the model for the given epochs
optimizer = tf.keras.optimizers.Adam(0.001)
def optimization_step(model: gpflow.models.SVGP, X, Y):
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(model.trainable_variables)
        obj = -model.elbo(X, Y)
        grads = tape.gradient(obj, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
def simple_training_loop(X, Y, model: gpflow.models.SVGP, epochs: int = 1, logging_epoch_freq: int = 10, num_batches_per_epoch: int = 10):
    tf_optimization_step = tf.function(optimization_step)
    for epoch in range(epochs):
        for _ in range(num_batches_per_epoch):
            tf_optimization_step(model, X, Y)

        epoch_id = epoch + 1
        if epoch_id % logging_epoch_freq == 0:
            tf.print(f"Epoch {epoch_id}: ELBO (train) {model.elbo(X,Y)}")


In [32]:
## Given the data and the specific sensor this function optimise the ELBO and plot the results 
def train_sensor_model(scoot_id, X_arr, Y_arr, kernelsettings, epochs = 100, logging_epoch_freq = 10, plot=True):
    
    ## To remove newaxis when more features
    num_features = X_arr[scoot_id][:,0][:,np.newaxis].shape[0]
    
    X = tf.convert_to_tensor(X_arr[scoot_id][:,0][:,np.newaxis])
    Y = tf.convert_to_tensor(Y_arr[scoot_id].astype(np.float64))
    
    ## To pass it as a function arg
    k = choose_kernel(kernelsettings)
#     k = gpflow.kernels.RBF() * gpflow.kernels.Periodic(0.1)
    
    lik = gpflow.likelihoods.Poisson()
    
    ## Add code for inducing inputs - Needed when we run on the full data
    model = gpflow.models.SVGP(kernel = k, likelihood=lik, inducing_variable=X)
    
    ## Uncomment to see which variables are training and those that are not
    #print_summary(model)
    
    simple_training_loop(X, Y, model, epochs = epochs, 
                         logging_epoch_freq = logging_epoch_freq)

    return model,X

In [33]:
# THIS IS BUGGY
## Computes percentage cover (see Virginia's pdf for details)
def percentage_coverage(model,test_inputs,Ytest,quantile:int = 0.95, num_samples:int = 10,num_pertubations: int = 100):
    # Number of times total counts were within 90th percentile
    coverage_events = 0
    
    # Loop over pertubations
    for i in range(num_pertubations):

        # Change seed
        np.random.seed(i)
        
        # Sample from latent function (intensity)
        intensity_sample = np.exp(model.predict_f_samples(test_inputs,num_samples))
        # Compute emprical distribution of counts
        empirical_count_distribution = np.random.poisson(intensity_sample)
        
        # Total number of actual counts
        total_counts = np.sum(Ytest)
       
        # Compute upper and lower quantiles from the empirical distribution of counts
        upper_q = np.quantile(np.sum(samples[:,:,0],axis=1),quantile)
        lower_q = np.quantile(np.sum(samples[:,:,0],axis=1),1-quantile)
    
        # Add 1 - if total counts are within quantile, 0 - otherwise
        coverage_events += int((total_counts < upper_q) & (total_counts > lower_q))
        binary = int((total_counts < upper_q) & (total_counts > lower_q)) # this is kept for debugging (remove afterwards)

    return empirical_count_distribution, binary, total_counts, upper_q, lower_q # this is kept for debugging (remove afterwards)
    return coverage_events/num_pertubations # this should be the output after debugging



# Run entire training routine

In [34]:
epochs = 10
logging_epoch_freq = 100

In [35]:
model0,Xtest0 = train_sensor_model(0, X_arr, Y_arr, kernel_settings, epochs, logging_epoch_freq)

Using product of periodic and rbf kernels
Hyperparameters of periodic
{'period': 0.1}
Hyperparameters of rbf
{}
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected exactly one node node, found [<gast.gast.FunctionDef object at 0x7fd6fe41e810>, <gast.gast.Return object at 0x7fd6fe41e150>]
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected exactly one node node, found [<gast.gast.FunctionDef object at 0x7fd6fe41e810>, <gast.gast.Return object at 0x7fd6fe41e150>]
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: expected exactly one node node, found [<gast.gast.FunctionDef object at 0x7fd6fe41e810>, <gast.gast.Return object at 0x

In [41]:
# model1,Xtest1 = train_sensor_model(1, X_arr, Y_arr, kernel_settings, epochs, logging_epoch_freq)

# Save results

In [39]:
save_model_and_metadata(detector_list[index], model0, X_arr[index], Y_arr[index], start_normal_interval, end_lockdown_interval, kernel_settings, scoot_settings)

Saving data todata/models/N00_002e1/10Feb_23Mar
data/models/N00_002e1/10Feb_23Mar/Y.npy


# Jointly train all sensors

The input $X$ is time epoch, lat, lon and output $Y$ is the integer n_vehicles_in_interval

NOTE for 2 days of scoot data there are approx 400000 observations