# Getting started with Scoot data

First things first: **Scoot data is a big dataset**.

> 100 days of SCOOT data from all sensors is approx 8GB.

Whilst your are testing and developing please reduce the amount of data you are requesting from the DB to reduce the strain on the DB. You can do this by restricting your queries:

- 1 day (or even one hour) of data if you are looking at spatial information
- aggregation of data if you want temporal trends. E.g. Take the sum of vehicle counts over all scoot sensors for each day.
- for spatial-temporal analysis, choose a subset of scoot detectors and perhaps also aggregate your data. E.g. 100 randomly chosen sensors (Coming soon: not yet implemented) for daily aggregates of data.


In [1]:
import pandas as pd
import os
from datetime import datetime

# plotly viz - use matplotlib if you prefer
import plotly.graph_objects as go
import plotly.express as px

# cleanair modules
from cleanair.scoot import ScootQuery


In [3]:
secretfile = "../../terraform/.secrets/db_secrets.json"

SQ = ScootQuery(secretfile=secretfile)

2020-03-24 08:00:22     INFO: Database connection information loaded from <_io.TextIOWrapper name='../../terraform/.secrets/db_secrets.json' mode='r' encoding='UTF-8'>


In [9]:
# get an aggregate of all the scoot data for the last 2 weeks
df = SQ.groupby_datetime_df(start_datetime="2020-03-10", end_datetime="2020-03-24")


            SELECT measurement_start_utc, measurement_end_utc,
                SUM(n_vehicles_in_interval) AS sum_n_vehicles_in_interval,
                AVG(occupancy_percentage) as avg_occupancy_percentage,
                AVG(congestion_percentage) as avg_congestion_percentage,
                AVG(saturation_percentage) as avg_saturation_percentage,
                SUM(flow_raw_count) as sum_flow_raw_count,
                SUM(occupancy_raw_count) as sum_occupancy_raw_count,
                SUM(congestion_raw_count) as sum_congestion_raw_count,
                SUM(saturation_raw_count) as sum_saturation_raw_count
            FROM dynamic_data.scoot_reading
            WHERE measurement_start_utc >= '2020-02-01' AND measurement_start_utc < '2020-03-24'
            group by measurement_start_utc, measurement_end_utc
            order by measurement_start_utc;
        


In [10]:
df.sample(10)

Unnamed: 0,measurement_start_utc,measurement_end_utc,sum_n_vehicles_in_interval,avg_occupancy_percentage,avg_congestion_percentage,avg_saturation_percentage,sum_flow_raw_count,sum_occupancy_raw_count,sum_congestion_raw_count,sum_saturation_raw_count
873,2020-03-11 12:00:00,2020-03-11 13:00:00,4184824,16.049227,4.406874,53.869018,373399192,343611005,323984040,322680207
901,2020-03-12 16:00:00,2020-03-12 17:00:00,4600133,19.382986,6.043463,61.536105,371526494,341647778,322794071,324606706
312,2020-02-17 03:00:00,2020-02-17 04:00:00,679542,2.000145,0.443584,8.483926,357799548,325729846,308616597,222556471
199,2020-02-12 10:00:00,2020-02-12 11:00:00,4049967,15.647797,4.202455,51.672029,373306774,345192185,324868518,321592399
571,2020-02-27 22:00:00,2020-02-27 23:00:00,2922112,8.714346,1.79426,31.44791,371572103,342202069,324481515,291690614
447,2020-02-22 18:00:00,2020-02-22 19:00:00,4400676,16.173958,4.224833,54.39306,367703946,337605986,317808609,310959920
88,2020-02-07 19:00:00,2020-02-07 20:00:00,4408197,15.699694,3.991402,53.588973,372842977,344132918,325169696,320511784
309,2020-02-17 00:00:00,2020-02-17 01:00:00,1495280,4.110131,0.686306,15.544476,359203587,326963825,310069756,250741577
207,2020-02-12 18:00:00,2020-02-12 19:00:00,4600231,18.410223,5.535832,60.037713,371449391,342764027,323206528,324370947
832,2020-03-09 19:00:00,2020-03-09 20:00:00,4043695,13.489701,3.031274,47.279009,371574700,342490266,323070959,316076395


In [11]:
col = "sum_n_vehicles_in_interval"

all_sensors_fig = dict(
    data=[
        dict(
            x=df.measurement_start_utc,
            y=df[col],
            mode="lines",
        )
    ],
    layout=dict(
        title="{col} over time.".format(col=col),
    )
)

go.Figure(all_sensors_fig)

In [7]:
# get all data for the timeperiod
start_datetime = "2020-03-20 00:00:00"
end_datetime = "2020-03-24 00:00:00"
all_scoot_df = SQ.get_all_readings(
    start_datetime=start_datetime,
    end_datetime=end_datetime,
)

In [8]:
from datetime import timedelta, datetime

# set the mapbox token
secrets_dir = "../../terraform/.secrets"
mapbox_filepath = os.path.join(secrets_dir, ".mapbox_token")
mapbox_access_token = open(mapbox_filepath).read()
px.set_mapbox_access_token(mapbox_access_token)

col = "n_vehicles_in_interval"
lower_bound = 500
timestamp = "2020-03-11 12:00:00"

n_hours = 3   # number of hours to increase group by

all_scoot_df["measurement_start_utc"] = pd.to_datetime(all_scoot_df["measurement_start_utc"])

start = datetime.strptime(start_datetime, "%Y-%m-%d %H:%M:%S")
i = 0
all_scoot_df["label"] = -1
group_list = []
while start < datetime.strptime(end_datetime, "%Y-%m-%d %H:%M:%S"):
    end = start +timedelta(hours=n_hours)
    all_scoot_df.loc[
        (all_scoot_df["measurement_start_utc"] >= start) & (all_scoot_df["measurement_start_utc"] < end), "label"
    ] = i
    i += 1
    group_list.append(dict(
        start=start,
        end=end,
    )
    start = end
    
gb = all_scoot_df.groupby(["label", "detector_id"])
gb.get_group((1,"N00/002e1"))
    


SyntaxError: invalid syntax (<ipython-input-8-c91c34f699f5>, line 31)

In [None]:
# DECREASE NUMBER OF ROWS IN DATAFRAME
# otherwise we will have problems rendering cause too many scoot sensors
filtered_df = all_scoot_df.loc[
    (all_scoot_df[col] > lower_bound) & (all_scoot_df["measurement_start_utc"] == timestamp)
]

# get the mapbox figure
map_dict = px.scatter_mapbox(
    filtered_df,
    lat="lat",
    lon="lon",
    size=[2 for i in range(len(filtered_df.index))],
    color=col,
    zoom=10,
    mapbox_style="basic",
)
map_fig = go.FigureWidget(map_dict)
map_fig.show()