In [1]:
import json
import requests
import pandas as pd
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# color-blind friendly color configuration based on ggplot
color_list = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2",  "#CC79A7"]

### Make MBTA API request

In [3]:
CLIENT_KEY = ''

fetched = []

offset = 0
page = 20
total = 200 # over 10000000 rows

while offset <= total - page:
    url = "https://api-v3.mbta.com/alerts?page%5Boffset%5D={}&page%5Blimit%5D={}&filter%5Bactivity%5D=BOARD%2CEXIT%2CRIDE&api_key={}".format(offset, page, CLIENT_KEY)
    try:
        results = requests.get(url).json()
        print('Fetched page {} with offset {}'.format(page, offset))

        fetched += [d['attributes'] for d in results['data']]
        offset += page
    except json.decoder.JSONDecodeError as err:
        print('API error: {}'.format(err))
        offset = total

Fetched page 20 with offset 0
Fetched page 20 with offset 20
Fetched page 20 with offset 40
Fetched page 20 with offset 60
Fetched page 20 with offset 80
Fetched page 20 with offset 100
Fetched page 20 with offset 120
Fetched page 20 with offset 140
Fetched page 20 with offset 160
Fetched page 20 with offset 180


In [42]:
df = pd.DataFrame(data=fetched, columns=['description', 'cause', 'created_at', 'effect', 'informed_entity', 'severity']) #, delim_whitespace=True)
df['route'] = df['informed_entity'].apply(lambda x: x[0]['route'])
df['route_type'] = df['informed_entity'].apply(lambda x: x[0]['route_type'])
df.drop('informed_entity', axis=1, inplace=True)
df

Unnamed: 0,description,cause,created_at,effect,severity,route,route_type
0,This change is part of the second round of Bet...,UNKNOWN_CAUSE,2019-11-15T14:41:38-05:00,SERVICE_CHANGE,10,225,3
1,"For Clarendon Hill, connections can be made at...",UNKNOWN_CAUSE,2019-05-03T12:48:53-04:00,SERVICE_CHANGE,7,89,3
2,Affected stops:\r\nBow St @ Warren Ave\r\n51 B...,UNKNOWN_CAUSE,2019-12-27T11:45:12-05:00,DETOUR,1,85,3
3,Signs will be placed around the facility to di...,UNKNOWN_CAUSE,2019-10-02T18:58:29-04:00,STATION_ISSUE,1,Red,1
4,Affecting:\r\n120,UNKNOWN_CAUSE,2019-12-06T22:18:22-05:00,SERVICE_CHANGE,10,120,3
...,...,...,...,...,...,...,...
85,Affecting:\r\n216,UNKNOWN_CAUSE,2019-12-06T22:32:56-05:00,SERVICE_CHANGE,10,214216,3
86,This change is part of the second round of Bet...,UNKNOWN_CAUSE,2019-11-15T14:57:33-05:00,SERVICE_CHANGE,10,71,3
87,Learn more at MBTA.com/betterbus\r\n\r\nAffect...,UNKNOWN_CAUSE,2019-12-06T16:28:46-05:00,STOP_MOVE,1,52,3
88,,UNKNOWN_CAUSE,2019-12-27T15:06:29-05:00,SERVICE_CHANGE,10,CR-Fairmount,2


In [47]:
df['route'].value_counts(dropna=False)

CR-Worcester    9
64              5
52              4
87              3
201             2
               ..
747             1
350             1
CR-Fairmount    1
42              1
29              1
Name: route, Length: 61, dtype: int64

In [49]:
# df.set_index('', drop=True)
df.dtypes

description    object
cause          object
created_at     object
effect         object
severity        int64
route          object
route_type      int64
dtype: object

In [None]:
# cleaned_df['service_time_int'] = cleaned_df.service_time.astype(np.int64)
cleaned_df.loc[:,'service_time_int'] = cleaned_df.service_time.astype(np.int64)

fig, ax = plt.subplots(figsize=(10,6))
cleaned_df.plot(kind='scatter', x='service_time_int', y='station_entries', ax=ax)
ax.set_xticklabels([datetime.fromtimestamp(ts / 1e9).strftime('%D %H:%M:%S') for ts in ax.get_xticks()])
ax.set_xlabel('Service time')
ax.set_ylabel('Num entries')
plt.show()