In [1]:
import pandas as pd

Let's grab some vehicle updates for the MBTA. Go to http://mbta-history.apptic.xyz and download a zipped CSV file. Unzip and untar it, then read it into a dataframe:

In [14]:
# Replace the path below
df = pd.read_csv("~/Downloads/2017-09-12.csv")
df.head()

Unnamed: 0,trip_id,trip_start,stop_id,stop_sequence,vehicle_id,status,timestamp,lat,lon
0,35271979,2017-09-12,6407,50,y1629,STOPPED_AT,2017-09-13 06:02:19,42.24548,-71.12101
1,35241143,2017-09-12,11148,28,y1879,INCOMING_AT,2017-09-13 06:01:45,42.328892,-71.084808
2,34874718,2017-09-12,41031,33,y0769,IN_TRANSIT_TO,2017-09-13 06:02:16,42.24229,-71.004929
3,100_0,2017-09-12,70107,310,10078,IN_TRANSIT_TO,2017-09-13 06:02:13,42.340118,-71.165123
4,35122583,2017-09-12,382,17,y1251,STOPPED_AT,2017-09-13 06:01:59,42.300446,-71.086067


CSV doesn't have any special handling for dates, so we need to convert the timestamp to an actual datetime, which can be used in comparisons.

In [59]:
df.timestamp = pd.to_datetime(df.timestamp).dt.tz_localize("UTC").dt.tz_convert("US/Eastern")

# TODO: Could group by the trip_id and take the maximum timestamp

In [71]:
# df["max_timestamp"] = df.groupby(["trip_id", "timestamp"]).max().timestamp
# df.head(10)

In [62]:
stops = df.query("status == 'STOPPED_AT'").reset_index()
stops.head()

Unnamed: 0,index,trip_id,trip_start,stop_id,stop_sequence,vehicle_id,status,timestamp,lat,lon,max_timestamp
0,0,35271979,2017-09-12,6407,50,y1629,STOPPED_AT,2017-09-13 02:02:19-04:00,42.24548,-71.12101,NaT
1,4,35122583,2017-09-12,382,17,y1251,STOPPED_AT,2017-09-13 02:01:59-04:00,42.300446,-71.086067,NaT
2,6,35122583,2017-09-12,1736,14,y1251,STOPPED_AT,2017-09-13 02:00:59-04:00,42.294285,-71.088036,NaT
3,9,35241886,2017-09-12,150,36,y1753,STOPPED_AT,2017-09-13 02:00:24-04:00,42.342514,-71.057388,NaT
4,12,35086428,2017-09-12,2320,9,y4123,STOPPED_AT,2017-09-13 02:00:15-04:00,42.394157,-71.12645,NaT


Let's define some functions for pulling in information from the MBTA's GTFS manifest.

In [11]:
from zipfile import ZipFile
from io import BytesIO, TextIOWrapper
from urllib.request import urlopen

def get_manifest(url="http://www.mbta.com/uploadedfiles/MBTA_GTFS.zip"):
    with urlopen(url) as u:
        return ZipFile(BytesIO(u.read()))
    
def get_manifest_item(manifest, name):
    data = TextIOWrapper(BytesIO(manifest.read(name + ".txt")), 
                         encoding="utf-8", line_buffering=True)
    return pd.read_csv(data)

In [12]:
manifest = get_manifest()

In [63]:
# Stop times for each trip and stop
stop_times = get_manifest_item(manifest, "stop_times")
trips = get_manifest_item(manifest, "trips")

# Add the route_ids and trip_headsigns
stop_times = pd.merge(stop_times[["trip_id", "stop_sequence", "arrival_time"]],
                      trips[["trip_id", "route_id", "trip_headsign"]],
                      on="trip_id")
stop_times

  if self.run_code(code, result):
  if self.run_code(code, result):


Unnamed: 0,trip_id,stop_sequence,arrival_time,route_id,trip_headsign
0,Logan-22-Weekday-trip,1,08:00:00,Logan-22,Loop
1,Logan-22-Weekday-trip,2,08:04:00,Logan-22,Loop
2,Logan-22-Weekday-trip,3,08:09:00,Logan-22,Loop
3,Logan-22-Weekday-trip,4,08:12:00,Logan-22,Loop
4,Logan-22-Weekday-trip,5,08:17:00,Logan-22,Loop
5,Logan-22-Weekday-trip,6,08:21:00,Logan-22,Loop
6,Logan-22-Weekday-trip,7,08:26:00,Logan-22,Loop
7,Logan-22-Weekend-trip,1,12:00:00,Logan-22,Loop
8,Logan-22-Weekend-trip,2,12:04:00,Logan-22,Loop
9,Logan-22-Weekend-trip,3,12:09:00,Logan-22,Loop


Let's merge the schedule information with the observed data. Join the rows together wherever the trip_id and stop_sequence are the same.

In [64]:
joined = pd.merge(stops, stop_times, how="inner", on=["trip_id", "stop_sequence"])
joined.head()

Unnamed: 0,index,trip_id,trip_start,stop_id,stop_sequence,vehicle_id,status,timestamp,lat,lon,max_timestamp,arrival_time,route_id,trip_headsign
0,432,35291700,2017-09-12,70091,160,5450709B,STOPPED_AT,2017-09-13 01:40:37-04:00,42.293201,-71.065857,NaT,24:59:00,Red,Ashmont
1,644,35291700,2017-09-12,70085,130,5450709B,STOPPED_AT,2017-09-13 01:30:02-04:00,42.320572,-71.052589,NaT,24:51:00,Red,Ashmont
2,741,35291704,2017-09-12,70066,200,545070BD,STOPPED_AT,2017-09-13 01:27:14-04:00,42.388329,-71.118942,NaT,25:04:00,Red,Alewife
3,1045,35291700,2017-09-12,70087,140,5450709B,STOPPED_AT,2017-09-13 01:33:02-04:00,42.310558,-71.053726,NaT,24:53:00,Red,Ashmont
4,1094,35291700,2017-09-12,70083,110,5450709B,STOPPED_AT,2017-09-13 01:25:24-04:00,42.330219,-71.056999,NaT,24:48:00,Red,Ashmont


Since stops are recurring, scheduled arrival times are recorded as wallclock times in the format hh:mm:ss. Since they are anchored to the start of the day, the hour can be greater than 24.

In [54]:
from datetime import datetime, timedelta
import pytz

timezone = pytz.timezone("US/Eastern")

def convert_clock_time(row):
    y, M, d = map(int, row.trip_start.split("-"))
    dt = timezone.localize(datetime(y, M, d))
    h, m, s = map(int, row.arrival_time.split(":", 2))
    # This is here to avoid DST issues
    if h >= 24:
        dt += timedelta(days=1)
        h %= 24
    return dt.replace(hour=h, minute=m, second=s)

Now let's use the function to convert the scheduled arrival time to a timestamp, using the trip start date.

In [65]:
joined["scheduled_arrival_time"] = joined.apply(convert_clock_time, axis=1)
joined

Unnamed: 0,index,trip_id,trip_start,stop_id,stop_sequence,vehicle_id,status,timestamp,lat,lon,max_timestamp,arrival_time,route_id,trip_headsign,scheduled_arrival_time
0,432,35291700,2017-09-12,70091,160,5450709B,STOPPED_AT,2017-09-13 01:40:37-04:00,42.293201,-71.065857,NaT,24:59:00,Red,Ashmont,2017-09-13 00:59:00-04:00
1,644,35291700,2017-09-12,70085,130,5450709B,STOPPED_AT,2017-09-13 01:30:02-04:00,42.320572,-71.052589,NaT,24:51:00,Red,Ashmont,2017-09-13 00:51:00-04:00
2,741,35291704,2017-09-12,70066,200,545070BD,STOPPED_AT,2017-09-13 01:27:14-04:00,42.388329,-71.118942,NaT,25:04:00,Red,Alewife,2017-09-13 01:04:00-04:00
3,1045,35291700,2017-09-12,70087,140,5450709B,STOPPED_AT,2017-09-13 01:33:02-04:00,42.310558,-71.053726,NaT,24:53:00,Red,Ashmont,2017-09-13 00:53:00-04:00
4,1094,35291700,2017-09-12,70083,110,5450709B,STOPPED_AT,2017-09-13 01:25:24-04:00,42.330219,-71.056999,NaT,24:48:00,Red,Ashmont,2017-09-13 00:48:00-04:00
5,1143,35291704,2017-09-12,70068,190,545070BD,STOPPED_AT,2017-09-13 01:24:07-04:00,42.374069,-71.118759,NaT,25:02:00,Red,Alewife,2017-09-13 01:02:00-04:00
6,1193,35291700,2017-09-12,70081,100,5450709B,STOPPED_AT,2017-09-13 01:23:13-04:00,42.342560,-71.057121,NaT,24:46:00,Red,Ashmont,2017-09-13 00:46:00-04:00
7,1380,35291704,2017-09-12,70064,210,545070BD,STOPPED_AT,2017-09-13 01:36:28-04:00,42.396759,-71.122414,NaT,25:06:00,Red,Alewife,2017-09-13 01:06:00-04:00
8,1381,35291700,2017-09-12,70089,150,5450709B,STOPPED_AT,2017-09-13 01:36:09-04:00,42.300018,-71.061829,NaT,24:56:00,Red,Ashmont,2017-09-13 00:56:00-04:00
9,1498,35291704,2017-09-12,70070,180,545070BD,STOPPED_AT,2017-09-13 01:17:08-04:00,42.365540,-71.104012,NaT,24:57:00,Red,Alewife,2017-09-13 00:57:00-04:00


Let's add a column with the delay:

In [72]:
joined["delay"] = joined.timestamp - joined.scheduled_arrival_time

What was the average delay for the Red line?

In [73]:
joined.query('route_id == "Red"').delay.mean()

Timedelta('0 days 00:07:10.492668')

In [88]:
joined.query('route_id == "Blue"').trip_id.nunique()

0