In [None]:
# TL;DR
#
# We should try the following for the y-axis of the line chart:
#
# It's 2 lines, and therefore 2 axes.
# Range for the gap line will always be 0-100%, where we plot (100% - Coverage)
#
# For bunches, let n = 70% of the route's peak vehicle count for the day (the
# most vehicles out at the same time), rounded down. The y-axis for the bunch
# line then runs from 0 to n.
#
# If that doesn't work, tweak the percentage. Maybe only 40% of the max is
# necessary; in the unlikely case where the line exceeds the range, the chart
# can simply stay pegged at the top.
#
#
#
#
#
#
# Methodology:
#
# The idea is you want a "bad day" or an intraday "bad peak" to fill up
# most of the scale, and a "good day" to run along the bottom of the chart, 
# such that you can intuit quickly what's going on.
#
# If you were to hardcode, say, "30" on the y-axis, but the route never has
# more than 6 vehicles out, "bunch" data will never fill up more than
# 1/5 of the scale. If you just allow peak bunching to dictate
# the scale, the chart will always appear full somewhere, even if you
# have an excellent day on the route, which is visually misleading.


In [None]:
# Pull a full day's worth of data from our API and store it in df

import requests
import pandas as pd
from time import time
import numpy as np
import json

In [None]:
# Endpoint queried below (with a `day` parameter) for the day's vehicle reports
url = (
    'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com'
    '/daily-general-json'
)

In [None]:
# Fetch the vehicle reports for one service day from the API.
# The timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() surfaces an HTTP error directly instead of letting it show
# up later as a confusing JSON decoding failure.
response = requests.get(url, params={'day': '2020-05-24'}, timeout=60)
response.raise_for_status()
json_data = response.json()

In [None]:
# Build a DataFrame from the decoded JSON, then order the rows by the
# 'timestamp' column
records = pd.DataFrame(json_data)
df = records.sort_values(by='timestamp')

In [8]:
# Sanity-check the load: (rows, columns), followed by the first five rows
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)

(213445, 9)


Unnamed: 0,timestamp,rid,vid,age,kph,heading,latitude,longitude,direction
0,2020-05-24 00:00:13,38,8641,55,43,270,37.7796,-122.502,38___O_F00
21,2020-05-24 00:00:13,14,8749,31,37,345,37.7512,-122.418,14___I_F00
22,2020-05-24 00:00:13,91,8861,55,0,218,37.7216,-122.475,91___O_N00
23,2020-05-24 00:00:13,91,8639,55,0,86,37.7989,-122.443,91___O_N00
24,2020-05-24 00:00:13,22,8884,7,0,218,37.7607,-122.389,22___I_F00


In [9]:
# Number of reports per route, most-frequently-reported route first

reports_per_route = df['rid'].value_counts()
reports_per_route

14       19414
NBUS     17186
49       15800
8        15765
1        15727
38       13360
TBUS     11997
24       11486
22       11458
9        10764
14R      10646
LBUS     10325
38R       8849
29        7444
5         7440
19        6620
44        6215
91        3115
25        2694
L_OWL     2331
N_OWL     1756
12        1702
90        1351
Name: rid, dtype: int64

In [10]:
# Rows reported per timestamp (one row per vehicle in that snapshot).
# The most buses out at any one time was 223.
vehicles_per_snapshot = df['timestamp'].value_counts()
vehicles_per_snapshot

2020-05-24 16:16:12    223
2020-05-24 16:19:12    223
2020-05-24 16:18:12    223
2020-05-24 16:15:12    223
2020-05-24 14:55:12    222
                      ... 
2020-05-24 03:34:13     35
2020-05-24 22:18:12     35
2020-05-24 01:51:12     35
2020-05-24 01:52:12     35
2020-05-24 23:28:12     35
Name: timestamp, Length: 1440, dtype: int64

In [11]:
# Routes 14, NBUS, 49, 8, and 1 are the heaviest routes.

# Narrow the day's data down to route 14 only for a closer look.

is_route_14 = df['rid'].eq('14')
df14 = df.loc[is_route_14]

print(df14.shape)
df14.head()

(19414, 9)


Unnamed: 0,timestamp,rid,vid,age,kph,heading,latitude,longitude,direction
21,2020-05-24 00:00:13,14,8749,31,37,345,37.7512,-122.418,14___I_F00
30,2020-05-24 00:00:13,14,8868,31,42,30,37.7896,-122.398,14___I_F00
35,2020-05-24 00:00:13,14,8848,31,0,225,37.7774,-122.413,14___O_F00
17,2020-05-24 00:00:13,14,8815,31,0,247,37.7089,-122.453,14___O_F00
1,2020-05-24 00:00:13,14,8738,7,0,220,37.7063,-122.46,14___O_F00


In [12]:
# Largest number of route-14 rows sharing a single timestamp, i.e. the most
# route-14 vehicles reported in one snapshot
peak_vehicles_14 = df14['timestamp'].value_counts().max()
print(peak_vehicles_14)

18


In [None]:
# Make a dictionary where each route has its own separate df.
#
# A single groupby pass splits the day's data by route ID, rather than creating
# placeholder frames and then re-slicing and re-filtering the full frame once
# per route. sort=False keeps the routes in order of first appearance, which is
# the same order df.rid.unique() reports them in.

uniqueroutes = df.rid.unique()

routedict = {
    route: route_df for route, route_df in df.groupby('rid', sort=False)
}

In [14]:
# Inspect the mapping of route ID -> that route's DataFrame.
# NOTE(review): this renders every per-route frame, so the output is very long.
routedict

{'1':                   timestamp rid   vid  ...  latitude  longitude   direction
 8119    2020-05-24 03:40:12   1  5805  ...   37.7906   -122.428  1____I_S00
 8157    2020-05-24 03:41:13   1  5805  ...   37.7908   -122.426  1____I_S00
 8195    2020-05-24 03:42:12   1  5805  ...   37.7919   -122.425  1____I_S00
 8233    2020-05-24 03:43:12   1  5805  ...   37.7924   -122.421  1____I_S00
 8271    2020-05-24 03:44:12   1  5805  ...   37.7929   -122.417  1____I_S00
 ...                     ...  ..   ...  ...       ...        ...         ...
 209068  2020-05-24 22:00:12   1  5799  ...   37.7885   -122.437  1____O_S00
 209058  2020-05-24 22:00:12   1  5871  ...   37.7864   -122.454  1____I_E00
 209108  2020-05-24 22:01:12   1  5799  ...   37.7882   -122.440  1____O_S00
 209098  2020-05-24 22:01:12   1  5871  ...   37.7871   -122.448  1____I_E00
 209147  2020-05-24 22:02:12   1  5799  ...   37.7875   -122.446  1____O_S00
 
 [15727 rows x 9 columns],
 '12':                   timestamp rid   v

In [15]:
# For every route, report its ID followed by the largest number of rows that
# share one timestamp (the max vehicles out at once for the day)

for route_id, route_df in routedict.items():
    peak_vehicles = route_df['timestamp'].value_counts().max()
    print("Route ID: ", route_id)
    print(peak_vehicles)

Route ID:  38
17
Route ID:  14
18
Route ID:  91
7
Route ID:  22
12
Route ID:  25
4
Route ID:  L_OWL
5
Route ID:  90
3
Route ID:  24
11
Route ID:  N_OWL
4
Route ID:  44
7
Route ID:  1
22
Route ID:  14R
16
Route ID:  29
9
Route ID:  NBUS
18
Route ID:  19
8
Route ID:  TBUS
15
Route ID:  49
20
Route ID:  38R
11
Route ID:  9
13
Route ID:  8
20
Route ID:  5
9
Route ID:  LBUS
13
Route ID:  12
2


In [None]:
# The highest per-route peak for this day was 22 vehicles, on Route 1.
#
# Once Covid is over, maybe you'll get 30-35 on a route.
#
# Remember that the line plots bunches at each snapshot, so the most there can 
# be is n - 1 bunches at a time (n = number of buses out).

# For Route 1 you're looking at 22 * 0.7 = 15.4 --> 15 (0-15 y-axis range)
# For Route 91 it would be 7 * 0.7 = 4.9 --> 4 (0-4 y-axis range)

# The ideal graph is one where a bad day has big spikes and uses up
# most of the scale, while a good day stays near the bottom.