# Metrics

Investigating the different metrics.

In [17]:
import os
import json
from datetime import datetime
import pandas as pd
import numpy as np
import tensorflow as tf

# plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# scoot functions
from cleanair.scoot import (
    generate_fp,
    load_model_from_file,
    load_processed_data_from_file,
    load_scoot_df,
    plotly_results,
    percentage_coverage,
    sample_intensity,
    sample_n
)

In [18]:
# setup global params
experiment = "daily"

user_settings_fp = os.path.join("..", "..", "terraform", ".secrets", "user_settings.json")
with open(user_settings_fp) as json_file:
    user_settings = json.load(json_file)
root = user_settings["root"]

# get the settings for kernels and scoot data
with open(os.path.join(root, experiment, "settings", "kernel_settings.json")) as kernel_file:
    kernel_settings = json.load(kernel_file)
with open(os.path.join(root, experiment, "settings", "data_settings.json")) as scoot_file:
    data_settings = json.load(scoot_file)

## Percentage of normal/lockdown

Provide change in traffic for normal vs previous day and lockdown vs previous day.

1. Take the total traffic on a normal/lockdown Monday.
2. Take the total traffic of the most recent Monday.
3. Remove outliers from this detector (e.g. $\mu \pm 3\sigma$).
4. Calculate the percentage change in total traffic from (1) to (3).

In [19]:
# lets look at just one day
normal_day = "2020-02-10T00:00:00"     # a normal day
lockdown_day = "2020-03-23T00:00:00"   # a normal lockdown day 
latest_day = "2020-03-30T00:00:00"     # the most recent day

# load data for these day
normal_df = load_scoot_df(
    root=root,
    experiment=experiment,
    timestamp=normal_day,
    filename="scoot"
)
lockdown_df = load_scoot_df(
    root=root,
    experiment=experiment,
    timestamp=lockdown_day,
    filename="scoot"
)
latest_df = load_scoot_df(
    root=root,
    experiment=experiment,
    timestamp=latest_day,
    filename="scoot"
)

In [33]:
col = "n_vehicles_in_interval"

# groupby detector
normal_gb = normal_df.groupby("detector_id")
normal_gb[col].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
detector_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N00/002e1,24.0,193.541667,77.388337,61.0,139.00,204.5,254.00,300.0
N00/002g1,24.0,200.458333,71.757429,51.0,164.25,232.0,246.50,282.0
N00/002p1,24.0,356.625000,132.671485,112.0,272.75,420.0,452.50,498.0
N00/003a1,24.0,138.750000,61.089564,29.0,107.25,154.5,185.25,211.0
N00/004b1,24.0,370.958333,146.772666,77.0,296.50,430.0,472.75,540.0
...,...,...,...,...,...,...,...,...
N32/208a2,3.0,794.333333,167.846160,614.0,718.50,823.0,884.50,946.0
N32/209a1,3.0,584.666667,92.316485,481.0,548.00,615.0,636.50,658.0
N32/209a2,3.0,730.000000,130.678996,583.0,678.50,774.0,803.50,833.0
N32/210a1,3.0,638.333333,125.556096,515.0,574.50,634.0,700.00,766.0


In [37]:
# remove outliers
to_remove = []   # list of indices to remove
detector_anomalies = []  # list of detectors with anomalies
num_sigma = 3

for detector_id, group in normal_gb:
    remove_in_group = group.index[abs(group[col] - group[col].mean()) > 3 * group[col].std()].tolist()
    if remove_in_group:
        detector_anomalies.append(detector_id)
    to_remove.extend(remove_in_group)
print("Number of anomalous detectors:", len(detector_anomalies))
print("Total number of detectors:", len(normal_df.detector_id.unique()))

Number of anomalous detectors: 75
Total number of detectors: 10077


In [None]:
# now remove detectors and re-run groupby/stats

## Percentage coverage

Given a confidence interval (90%) over the posterior distribution of our model, then coverage is the proportion of observations (true values) that are contained within the confidence interval.

In [10]:
# look at just one detector for now
detector_id = "N00/002e1"
kernel_id = "matern32_ls=0.1_v=0.1"
kwargs = dict(
    root=root,
    experiment=experiment,
    detector_id=detector_id,
    kernel_id=kernel_id
)
# get model, X, Y for normal
normal_model = load_model_from_file(timestamp=normal_day, **kwargs)
normal_x, normal_y = load_processed_data_from_file(timestamp=normal_day, **kwargs)

# get model, X, Y for lockdown
# lockdown_model = load_model_from_file(timestamp=lockdown_day, **kwargs)
lockdown_x, lockdown_y = load_processed_data_from_file(timestamp=lockdown_day, **kwargs)

# get X, Y for latest day
latest_x, latest_y = load_processed_data_from_file(timestamp=latest_day, **kwargs)

In [12]:
# coverage params
num_pertubations=1000
num_samples=1000
quantile=0.99

# calculate coverage of normal, normal vs lockdown, normal vs latest
normal_coverage = percentage_coverage(
    normal_model,
    normal_x[:,0][:,np.newaxis],
    normal_y,
    num_pertubations=num_pertubations,
    num_samples=num_samples,
    quantile=quantile
)
normal_to_lockdown_coverage = percentage_coverage(
    normal_model,
    lockdown_x[:,0][:,np.newaxis],
    lockdown_y,
    num_pertubations=num_pertubations,
    num_samples=num_samples,
    quantile=quantile
)
normal_to_latest_coverage = percentage_coverage(
    normal_model,
    latest_x[:,0][:,np.newaxis],
    latest_y,
    num_pertubations=num_pertubations,
    num_samples=num_samples,
    quantile=quantile
)

### Manipulate data

Hack the data to see how the percentage coverage is reacting.

1. Take the first purple outlier where the intensity is high. For all outliers after this, set purple points to be blue. Calculate percentage coverage with new data.
2. Take a purple point where the intensity is low and make it an outlier (e.g. increase it by $2\sigma$). Set all other purple outliers and set them to be blue.

### Binning

1. Split a day into 6 hour bins.
2. Calculate the percentage coverage for each bin
3. The percentage coverage will now be a vector of length 4.

> Question: should we do this per hour or for 6 hour bins (or both)?

## Negative log predicted likelihood (NLPL)

In [14]:
# ToDo: check this code from Virgi
np.sum(poisson.logpmf(true_counts, pred_counts))/pred_counts.shape[0]

NameError: name 'poisson' is not defined

## Collecting metrics into dataframe