In [1]:
import geoplotlib as gpl
import numpy as np
import pandas as pd
import re
import datetime as dt

%matplotlib inline

In [2]:
def time_of_day_from_unix_ts(ts):
  """unix timestamp to time of day"""
  return dt.datetime.fromtimestamp(ts).time()

def time_of_day_from_minutes(mins):
  """minutes after midnight to standard time of day"""
  return (dt.datetime(1,1,1) + dt.timedelta(minutes = mins)).time()

def day_of_week_from_datestamp(datestamp):
  """Datestamps like '20180902' -> 6 (for Sunday)"""
  if not(len(datestamp) == 8 and datestamp.isdigit()):
    raise ValueError(f"invalid datestamp: {datestamp}")
  year = int(datestamp[:4])
  month = int(datestamp[4:6])
  day = int(datestamp[6:])
  return dt.date(year, month, day).weekday()

day1 = dt.date(1,1,1)
day2 = dt.date(1,1,2)

def timediff(time1, time2):
  """finds the minimum time between two time-of-day-stamps"""
  if time1 > time2:
    time1, time2 = time2, time1
  diff1 = dt.datetime.combine(day1, time2) - dt.datetime.combine(day1, time1)
  diff2 = dt.datetime.combine(day2, time1) - dt.datetime.combine(day1, time2)
  return min(diff1, diff2)

Instructions for using this notebook:
1. Create a folder to put data in.  Set `datapath` below to that folder's path
2. Download the GTFS schedule data (link below), unzip to a folder called `<datapath>/gtfs_schedule`
3. Download the GTFS-RT archive data (link in relevant section) and unzip into a folder called `<datapath>/gtfs_realtime/august_2018`.
4. Unzip `<datapath>/gtfs_realtime/august_2018/20180801.zip` where it is

In [3]:
datapath = '/home/tanner/Documents/transit/'

# GTFS schedule data
All this is static, i.e. a new version is published on the order of every few months.  It can be downloaded straight from the MTA [here](http://web.mta.info/developers/data/nyct/subway/google_transit.zip).

It consists of a dozen or so tables, all with a standard structure.  Together, these tables define everything a third party might need to know about how a transit system runs.  Additional documentation can be found [here](https://developers.google.com/transit/gtfs/reference/).

In [4]:
agency = pd.read_csv(datapath + 'gtfs_schedule/agency.txt')
agency

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone
0,MTA NYCT,MTA New York City Transit,http://www.mta.info,America/New_York,en,718-330-1234


## Routes

In [5]:
routes = pd.read_csv(datapath + 'gtfs_schedule/routes.txt')
routes.sort_index()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1,MTA NYCT,1,Broadway - 7 Avenue Local,Trains operate between 242 St in the Bronx and...,1,http://web.mta.info/nyct/service/pdf/t1cur.pdf,EE352E,
1,2,MTA NYCT,2,7 Avenue Express,"Trains operate between Wakefield-241 St, Bronx...",1,http://web.mta.info/nyct/service/pdf/t2cur.pdf,EE352E,
2,3,MTA NYCT,3,7 Avenue Express,"Trains operate between 148 St, 7 Av, Manhattan...",1,http://web.mta.info/nyct/service/pdf/t3cur.pdf,EE352E,
3,4,MTA NYCT,4,Lexington Avenue Express,Trains operate daily between Woodlawn/Jerome A...,1,http://web.mta.info/nyct/service/pdf/t4cur.pdf,00933C,
4,5,MTA NYCT,5,Lexington Avenue Express,"Weekdays daytime, most trains operate between ...",1,http://web.mta.info/nyct/service/pdf/t5cur.pdf,00933C,
5,5X,MTA NYCT,5X,Lexington Avenue Express,"Weekdays daytime, most trains operate between ...",1,http://web.mta.info/nyct/service/pdf/t5cur.pdf,00933C,
6,6,MTA NYCT,6,Lexington Avenue Local,Local trains operate between Pelham Bay Park/B...,1,http://web.mta.info/nyct/service/pdf/t6cur.pdf,00933C,
7,6X,MTA NYCT,6X,Lexington Avenue Express,Express trains operate between Pelham Bay Park...,1,http://web.mta.info/nyct/service/pdf/t6cur.pdf,00A65C,
8,7,MTA NYCT,7,Flushing Local,"Trains operate between Main St-Flushing, Queen...",1,http://web.mta.info/nyct/service/pdf/t7cur.pdf,B933AD,
9,7X,MTA NYCT,7X,Flushing Express,"Trains operate between Main St-Flushing, Queen...",1,http://web.mta.info/nyct/service/pdf/t7cur.pdf,B933AD,


## Trips
A trip is a single journey by a train from one end of a line to the other.

In [6]:
trips = pd.read_csv(datapath + 'gtfs_schedule/trips.txt').dropna(how='all', axis=1)
trips.query('route_id == "1"').sample(5).sort_index() #[trips.trip_id == '016900_A..S']

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,shape_id
220,1,ASP18GEN-1037-Sunday-00,ASP18GEN-1037-Sunday-00_119750_1..S03R,South Ferry,1,1..S03R
369,1,ASP18GEN-1038-Saturday-00,ASP18GEN-1038-Saturday-00_061500_1..N03R,Van Cortlandt Park - 242 St,0,1..N03R
550,1,ASP18GEN-1038-Saturday-00,ASP18GEN-1038-Saturday-00_133550_1..S03R,South Ferry,1,1..S03R
777,1,ASP18GEN-1087-Weekday-00,ASP18GEN-1087-Weekday-00_075150_1..S03R,South Ferry,1,1..S03R
794,1,ASP18GEN-1087-Weekday-00,ASP18GEN-1087-Weekday-00_080150_1..N03R,Van Cortlandt Park - 242 St,0,1..N03R


Lots of information is encoded in the trip ID, so here I parse it all out.

In [7]:
trip_id_re = re.compile(r'(A|B|(SIR-))((SP)|(FA))(20)?\d{2}(GEN)?-([0-9A-Z]\w?)\d{3}-(\w+)-\d{2}_(\d+)_([0-9A-Z]\w?)\.\.?([NS])(\d{2}(R|X)(\d{3})?)')

def parse_trip_id(trip_id):
  m = re.match(trip_id_re, trip_id)
  if m is None:
    raise ValueError(f'no match: {trip_id}')
  groups = m.groups()
  if groups[7] != groups[10]:
    raise ValueError(f"incongruent id: {trip_id}")
  return {'trip_id': trip_id,
          'division': groups[0],
          'line': groups[7],
          'day_of_week': groups[8],
          'start_time': groups[9],
          'direction': groups[11],
          'trip_path': groups[12]}
    

In [8]:
trip_ids = set(trips.trip_id)
parsed_trips = pd.DataFrame(parse_trip_id(trip_id) for trip_id in trip_ids)
parsed_trips = parsed_trips.sort_values(['day_of_week','division','line','direction','start_time']).reset_index(drop=True)
parsed_trips['start_time'] = parsed_trips.start_time.apply(lambda x: time_of_day_from_minutes(int(x)/100))

In [9]:
parsed_trips.head()

Unnamed: 0,day_of_week,direction,division,line,start_time,trip_id,trip_path
0,Saturday,N,A,1,01:12:00,ASP18GEN-1038-Saturday-00_007200_1..N03R,03R
1,Saturday,N,A,1,01:32:00,ASP18GEN-1038-Saturday-00_009200_1..N03R,03R
2,Saturday,N,A,1,01:52:00,ASP18GEN-1038-Saturday-00_011200_1..N03R,03R
3,Saturday,N,A,1,02:12:00,ASP18GEN-1038-Saturday-00_013200_1..N03R,03R
4,Saturday,N,A,1,02:32:00,ASP18GEN-1038-Saturday-00_015200_1..N03R,03R


And here I join the new, parsed data with the published data.

In [10]:
trips_full = trips.set_index('trip_id').join(parsed_trips.set_index('trip_id'))
trips_full.query("day_of_week == 'Weekday'").sample(10).sort_values('route_id')

Unnamed: 0_level_0,route_id,service_id,trip_headsign,direction_id,shape_id,day_of_week,direction,division,line,start_time,trip_path
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ASP18GEN-2097-Weekday-00_074300_2..N01R,2,ASP18GEN-2097-Weekday-00,Wakefield - 241 St,0,2..N01R,Weekday,N,A,2,12:23:00,01R
ASP18GEN-6085-Weekday-00_103300_6..S03R,6,ASP18GEN-6085-Weekday-00,Brooklyn Bridge - City Hall,1,6..S03R,Weekday,S,A,6,17:13:00,03R
ASP18GEN-6085-Weekday-00_032600_6..N01R,6,ASP18GEN-6085-Weekday-00,Pelham Bay Park,0,6..N01R,Weekday,N,A,6,05:26:00,01R
ASP18GEN-6085-Weekday-00_106700_6..N03R,6,ASP18GEN-6085-Weekday-00,Parkchester,0,6..N03R,Weekday,N,A,6,17:47:00,03R
ASP18GEN-6085-Weekday-00_113000_6..S01R,6,ASP18GEN-6085-Weekday-00,Brooklyn Bridge - City Hall,1,6..S01R,Weekday,S,A,6,18:50:00,01R
ASP18GEN-6085-Weekday-00_115100_6..S01R,6,ASP18GEN-6085-Weekday-00,Brooklyn Bridge - City Hall,1,6..S01R,Weekday,S,A,6,19:11:00,01R
ASP18GEN-7058-Weekday-00_054000_7..N97R,7,ASP18GEN-7058-Weekday-00,Flushing - Main St,0,7..N97R,Weekday,N,A,7,09:00:00,97R
BSP18GEN-B080-Weekday-00_036750_B..N45R,B,BSP18GEN-B080-Weekday-00,Bedford Park Blvd,0,B..N45R,Weekday,N,B,B,06:07:30,45R
BSP18GEN-C049-Weekday-00_041350_C..N04R,C,BSP18GEN-C049-Weekday-00,168 St,0,C..N04R,Weekday,N,B,C,06:53:30,04R
ASP18GEN-GS019-Weekday-00_069550_GS.S01R,GS,ASP18GEN-GS019-Weekday-00,Grand Central - 42 St,1,GS.S01R,Weekday,S,A,GS,11:35:30,01R


## Stop Times
Each trip has published arrive and depart times for every stop, which are in this table.

In [11]:
stop_times = pd.read_csv(datapath + 'gtfs_schedule/stop_times.txt').dropna(how='all', axis=1)
stop_times['route_id'] = stop_times.trip_id.str.extract('_([^_]*)\.\.')
stop_times.query('trip_id == "ASP18GEN-1087-Weekday-00_090300_1..N03R"').head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,route_id
34477,ASP18GEN-1087-Weekday-00_090300_1..N03R,15:03:00,15:03:00,142N,1,0,0,1
34478,ASP18GEN-1087-Weekday-00_090300_1..N03R,15:05:30,15:05:30,139N,2,0,0,1
34479,ASP18GEN-1087-Weekday-00_090300_1..N03R,15:06:30,15:06:30,138N,3,0,0,1
34480,ASP18GEN-1087-Weekday-00_090300_1..N03R,15:07:30,15:07:30,137N,4,0,0,1
34481,ASP18GEN-1087-Weekday-00_090300_1..N03R,15:08:30,15:08:30,136N,5,0,0,1


## Stops
Each stop has additional metadata, like lat/long, which can be found here.

In [12]:
stops = pd.read_csv(datapath + 'gtfs_schedule/stops.txt').dropna(how='all', axis=1)
stops.query('stop_id in ["G06N","G07N"]').sort_index(ascending=False)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
967,G07N,Jamaica - Van Wyck,40.702566,-73.816859,0,G07
964,G06N,Sutphin Blvd - Archer Av - JFK Airport,40.700486,-73.807969,0,G06


In [13]:
stops[stops.stop_id == 'A59S']

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
677,A59S,80 St,40.679371,-73.858992,0,A59


## Other tables 

In [14]:
calendar_dates = pd.read_csv(datapath + 'gtfs_schedule/calendar_dates.txt')
calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,ASP18GEN-1037-Sunday-00,20180903,1
1,ASP18GEN-2048-Sunday-00,20180903,1
2,ASP18GEN-3041-Sunday-00,20180903,1
3,ASP18GEN-4049-Sunday-00,20180903,1
4,ASP18GEN-5048-Sunday-00,20180903,1


In [15]:
calendar = pd.read_csv(datapath + 'gtfs_schedule/calendar.txt')
calendar.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,ASP18GEN-1037-Sunday-00,0,0,0,0,0,0,1,20180624,20181028
1,ASP18GEN-2048-Sunday-00,0,0,0,0,0,0,1,20180624,20181028
2,ASP18GEN-3041-Sunday-00,0,0,0,0,0,0,1,20180624,20181028
3,ASP18GEN-4049-Sunday-00,0,0,0,0,0,0,1,20180624,20181028
4,ASP18GEN-5048-Sunday-00,0,0,0,0,0,0,1,20180624,20181028


In [16]:
transfers = pd.read_csv(datapath + 'gtfs_schedule/transfers.txt')
transfers.head()

Unnamed: 0,from_stop_id,to_stop_id,transfer_type,min_transfer_time
0,101,101,2,180
1,103,103,2,180
2,104,104,2,180
3,106,106,2,180
4,107,107,2,180


In [17]:
shapes = pd.read_csv(datapath + 'gtfs_schedule/shapes.txt').dropna(how='all', axis=1)
shapes.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence
0,1..N03R,40.702068,-74.013664,0
1,1..N03R,40.703199,-74.014792,1
2,1..N03R,40.703226,-74.01482,2
3,1..N03R,40.703253,-74.014846,3
4,1..N03R,40.70328,-74.01487,4


# historical GTFS realtime data
Realtime data is delivered via a protocol called GTFS-RT, which looks almost nothing like the GTFS protocol above.  It is delivered via a Google-developed format called protobuf, which is essentially a more efficient but non-human-readable alternative to JSON.

In [18]:
from google.transit import gtfs_realtime_pb2
from python_src import nyct_subway_pb2
import os
import re
import pandas as pd
import datetime
import time
from protobuf_to_dict import protobuf_to_dict

In [19]:
basepath = datapath + 'gtfs_realtime/august_2018/20180801/'
filename = 'gtfs_ace_20180801_041946.gtfs'
with open(basepath + filename, 'rb') as f:
  content = f.read()

feed = gtfs_realtime_pb2.FeedMessage()
feed.ParseFromString(content)

33908

### enumerated types

In [20]:
vehicle_stop_statuses = {v.number: v.name for v in gtfs_realtime_pb2._VEHICLEPOSITION_VEHICLESTOPSTATUS.values}
vehicle_stop_statuses

{0: 'INCOMING_AT', 1: 'STOPPED_AT', 2: 'IN_TRANSIT_TO'}

In [21]:
directions = {v.number: v.name for v in nyct_subway_pb2._NYCTTRIPDESCRIPTOR_DIRECTION.values}
directions

{1: 'NORTH', 2: 'EAST', 3: 'SOUTH', 4: 'WEST'}

In [22]:
schedule_relationships = {v.number: v.name for v in gtfs_realtime_pb2._TRIPDESCRIPTOR_SCHEDULERELATIONSHIP.values}
schedule_relationships

{0: 'SCHEDULED', 1: 'ADDED', 2: 'UNSCHEDULED', 3: 'CANCELED'}

### tracing one trip across lots of trip_updates

In [23]:
def grouped_entities(feed_message):
  if len(feed_message.entity) %2 != 0:
    raise ValueError('odd number of entities')
  for e1, e2 in zip(feed_message.entity[::2], feed_message.entity[1::2]):
    yield e1.trip_update, e2.vehicle
  return

In [24]:
l = []
trip_id = '025700_E..N' # started 257 minutes after midnight, or at 04:17 - E train - heading "north" (toward Queens)
for filename in sorted(os.listdir(basepath)):
  if filename.split('_')[1] != 'ace':
    continue
  with open(basepath + filename, 'rb') as f:
    content = f.read()
  feed_message = gtfs_realtime_pb2.FeedMessage()
  try:
    feed_message.ParseFromString(content)
  except:
    continue
  
  d = {}
  for e in feed_message.entity: # only one of these will match the trip_id (in theory)
    if e.trip_update.trip.trip_id == trip_id:
      d = {
          'timestamp': time_of_day_from_unix_ts(feed_message.header.timestamp),
          **{u.stop_id: time_of_day_from_unix_ts(u.arrival.time) for u in e.trip_update.stop_time_update}
      }
    
  if d:
    l.append(d)  

In [25]:
scheds = pd.DataFrame(l).set_index('timestamp')
timetable = scheds.drop_duplicates().sort_values(scheds.index[0], axis=1)
timetable.columns = [stops.set_index('stop_id').loc[stop_id].stop_name for stop_id in timetable.columns]
timetable.head(15)  

Unnamed: 0_level_0,Canal St,Spring St,W 4 St,14 St,23 St,34 St - Penn Station,42 St - Port Authority Bus Terminal,50 St,7 Av,5 Av/53 St,Lexington Av/53 St,Court Sq,Queens Plaza,Jackson Hts - Roosevelt Av,Forest Hills - 71 Av,Kew Gardens - Union Tpke,Jamaica - Van Wyck,Sutphin Blvd - Archer Av - JFK Airport,Jamaica Center - Parsons/Archer
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
04:19:46,04:19:22,04:20:30,04:22:30,04:24:30,04:26:00,04:28:00,04:29:00,04:30:30,04:32:00,04:33:00,04:34:30,04:37:30,04:39:00,04:45:15,04:54:00,04:58:15,05:04:15,05:08:45,05:13:00
04:20:01,04:19:37,04:20:42,04:22:42,04:24:42,04:26:12,04:28:12,04:29:12,04:30:42,04:32:12,04:33:12,04:34:42,04:37:42,04:39:12,04:45:27,04:54:12,04:58:27,05:04:27,05:08:57,05:13:12
04:20:16,,04:19:50,04:21:50,04:23:50,04:25:20,04:27:20,04:28:20,04:29:50,04:31:20,04:32:20,04:33:50,04:36:50,04:38:20,04:44:35,04:53:20,04:57:35,05:03:35,05:08:05,05:12:20
04:20:31,,04:20:12,04:21:50,04:23:50,04:25:20,04:27:20,04:28:20,04:29:50,04:31:20,04:32:20,04:33:50,04:36:50,04:38:20,04:44:35,04:53:20,04:57:35,05:03:35,05:08:05,05:12:20
04:20:46,,04:20:22,04:21:50,04:23:50,04:25:20,04:27:20,04:28:20,04:29:50,04:31:20,04:32:20,04:33:50,04:36:50,04:38:20,04:44:35,04:53:20,04:57:35,05:03:35,05:08:05,05:12:20
04:21:01,,04:20:37,04:21:50,04:23:50,04:25:20,04:27:20,04:28:20,04:29:50,04:31:20,04:32:20,04:33:50,04:36:50,04:38:20,04:44:35,04:53:20,04:57:35,05:03:35,05:08:05,05:12:20
04:21:16,,04:20:47,04:21:54,04:23:54,04:25:24,04:27:24,04:28:24,04:29:54,04:31:24,04:32:24,04:33:54,04:36:54,04:38:24,04:44:39,04:53:24,04:57:39,05:03:39,05:08:09,05:12:24
04:21:31,,04:21:12,04:22:18,04:24:18,04:25:48,04:27:48,04:28:48,04:30:18,04:31:48,04:32:48,04:34:18,04:37:18,04:38:48,04:45:03,04:53:48,04:58:03,05:04:03,05:08:33,05:12:48
04:21:46,,04:21:22,04:22:30,04:24:30,04:26:00,04:28:00,04:29:00,04:30:30,04:32:00,04:33:00,04:34:30,04:37:30,04:39:00,04:45:15,04:54:00,04:58:15,05:04:15,05:08:45,05:13:00
04:22:01,,04:21:37,04:22:42,04:24:42,04:26:12,04:28:12,04:29:12,04:30:42,04:32:12,04:33:12,04:34:42,04:37:42,04:39:12,04:45:27,04:54:12,04:58:27,05:04:27,05:08:57,05:13:12


### Examples of protobuf objects

In [26]:
feed.entity[44].trip_update.stop_time_update[0]

arrival {
  time: 1533111562
}
departure {
  time: 1533111562
}
stop_id: "A34N"
schedule_relationship: SCHEDULED
[nyct_stop_time_update] {
  scheduled_track: "A2"
  actual_track: "A2"
}

In [28]:
time_of_day_from_unix_ts(feed.entity[44].trip_update.stop_time_update[0].arrival.time)

datetime.time(4, 19, 22)

In [29]:
feed.entity[45].vehicle

trip {
  trip_id: "025700_E..N"
  start_date: "20180801"
  route_id: "E"
  [nyct_trip_descriptor] {
    train_id: "1E 0408+ WTC/P-A"
    is_assigned: true
    direction: NORTH
  }
}
current_stop_sequence: 1
current_status: STOPPED_AT
timestamp: 1533111569

In [31]:
time_of_day_from_unix_ts(feed.entity[45].vehicle.timestamp)

datetime.time(4, 19, 29)

### Finding closest scheduled trip
Because the trip_ids in the realtime data feed don't match up with the ones in the schedule, we have to make an educated guess.  I do this by searching the schedule for trips on the same day, on the same line, and in the same direction, and then picking the one with the start time that is closest to the actual start time.

In [32]:
short_trip_id_re = re.compile(r'(\d{6})_(\w)..(\w)')
def parse_short_trip_id(trip_id):
  m = re.match(short_trip_id_re, trip_id)
  if m is None:
    raise ValueError(f'no match: {trip_id}')
  groups = m.groups()
  return {'timestamp': groups[0],
          'line': groups[1],
          'direction': groups[2]}

In [33]:
feed.entity[45].vehicle

trip {
  trip_id: "025700_E..N"
  start_date: "20180801"
  route_id: "E"
  [nyct_trip_descriptor] {
    train_id: "1E 0408+ WTC/P-A"
    is_assigned: true
    direction: NORTH
  }
}
current_stop_sequence: 1
current_status: STOPPED_AT
timestamp: 1533111569

In [36]:
start_date = feed.entity[45].vehicle.trip.start_date
day_of_week_from_datestamp(start_date)

days =  {0: 'Weekday',
         1: 'Weekday',
         2: 'Weekday',
         3: 'Weekday',
         4: 'Weekday',
         5: 'Saturday',
         6: 'Sunday'}

days[day_of_week_from_datestamp(start_date)]

'Weekday'

In [37]:
tid = feed.entity[45].vehicle.trip.trip_id
short_trip_id_params = parse_short_trip_id(tid)
short_trip_id_params

{'timestamp': '025700', 'line': 'E', 'direction': 'N'}

In [39]:
timestamp = short_trip_id_params['timestamp']
line = short_trip_id_params['line']
direction = short_trip_id_params['direction']

day_of_week = days[day_of_week_from_datestamp(start_date)]
time_of_day = time_of_day_from_minutes(int(timestamp)/100)

relevant_trips = trips_full[(trips_full.line == line) & (trips_full.direction == direction) & (trips_full.day_of_week == day_of_week)]
closest_trip_id = relevant_trips.start_time.apply(lambda ts: timediff(ts, time_of_day)).idxmin()
closest_trip = relevant_trips.loc[closest_trip_id]
closest_trip

route_id                                       E
service_id              BSP18GEN-E070-Weekday-00
trip_headsign    Jamaica Center - Parsons/Archer
direction_id                                   0
shape_id                                 E..N05R
day_of_week                              Weekday
direction                                      N
division                                       B
line                                           E
start_time                              04:08:30
trip_path                                    05R
Name: BSP18GEN-E070-Weekday-00_024850_E..N05R, dtype: object

In [40]:
time_of_day

datetime.time(4, 17)

### upcoming stop

In [41]:
schedule = stop_times.set_index('trip_id').loc[closest_trip_id]
schedule.head()

Unnamed: 0_level_0,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,route_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BSP18GEN-E070-Weekday-00_024850_E..N05R,04:08:30,04:08:30,E01N,1,0,0,E
BSP18GEN-E070-Weekday-00_024850_E..N05R,04:10:30,04:10:30,A34N,2,0,0,E
BSP18GEN-E070-Weekday-00_024850_E..N05R,04:12:00,04:12:00,A33N,3,0,0,E
BSP18GEN-E070-Weekday-00_024850_E..N05R,04:14:00,04:14:00,A32N,4,0,0,E
BSP18GEN-E070-Weekday-00_024850_E..N05R,04:16:00,04:16:00,A31N,5,0,0,E


In [42]:
feed.entity[45].vehicle

trip {
  trip_id: "025700_E..N"
  start_date: "20180801"
  route_id: "E"
  [nyct_trip_descriptor] {
    train_id: "1E 0408+ WTC/P-A"
    is_assigned: true
    direction: NORTH
  }
}
current_stop_sequence: 1
current_status: STOPPED_AT
timestamp: 1533111569

In [43]:
schedule[schedule.stop_sequence == feed.entity[45].vehicle.current_stop_sequence]

Unnamed: 0_level_0,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,route_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BSP18GEN-E070-Weekday-00_024850_E..N05R,04:08:30,04:08:30,E01N,1,0,0,E


### all extant trips at a given time

In [45]:
trips = pd.DataFrame([(tu.trip_update.trip.trip_id, v.vehicle.current_stop_sequence) for tu, v in zip(feed.entity[::2], feed.entity[1::2])], columns=['trip_id', 'current_stop_sequence'])
trips['start_times'] = (trips.trip_id.str[:6].astype(int)/100).apply(time_of_day_from_minutes)

In [46]:
trips.sort_values('start_times')

Unnamed: 0,trip_id,current_stop_sequence,start_times
1,016500_A..N,52,02:45:00
0,016900_A..S,47,02:49:00
2,018500_A..N,38,03:05:00
3,018950_A..S,43,03:09:30
19,019200_E..S,30,03:12:00
4,020500_A..N,26,03:25:00
5,020900_A..S,31,03:29:00
21,022050_E..N,26,03:40:30
18,022426_E..S,18,03:44:15.600000
6,022500_A..N,15,03:45:00


### ~~ Everything after this is unmaintained ~~

In [47]:
# checking for stop_ids in vehicle updates - there are none 
line_groups = []
basepath = '/home/tanner/Documents/transit/gtfs_realtime/august_2018/20180801/'
for filename in sorted(os.listdir(basepath)):
  line_group = filename.split('_')[1]
  if line_group not in line_groups:
    print('parsing group', line_group)
    line_groups.append(line_group)
    
  with open(basepath + filename, 'rb') as f:
    content = f.read()
  feed = gtfs_realtime_pb2.FeedMessage()
  try:
    feed.ParseFromString(content)
  except:
    print('error parsing', filename)
    continue
  
  if any(e.vehicle.stop_id for e in feed.entity):
    print(filename, 'has stop_ids')
  

parsing group ace
error parsing gtfs_ace_20180801_085717.gtfs
error parsing gtfs_ace_20180801_181116.gtfs


KeyboardInterrupt: 

In [507]:
basepath = '/home/tanner/Documents/transit/gtfs_realtime/august_2018/20180801/'
for filename in sorted(os.listdir(basepath)):
  with open(basepath + filename, 'rb') as f:
    content = f.read()
  feed = gtfs_realtime_pb2.FeedMessage()
  try:
    feed.ParseFromString(content)
  except:
    print('error parsing', filename)
    continue
 
  for e in feed.entity:
    if str(e.alert):
      print(e.alert)
      raise StopIteration()
    

error parsing gtfs_bdfm_20180801_052206.gtfs
error parsing gtfs_bdfm_20180801_054951.gtfs
error parsing gtfs_bdfm_20180801_060906.gtfs
error parsing gtfs_bdfm_20180801_210708.gtfs
error parsing gtfs_bdfm_20180801_213453.gtfs
error parsing gtfs_bdfm_20180801_213753.gtfs
error parsing gtfs_bdfm_20180801_213823.gtfs
error parsing gtfs_bdfm_20180801_220053.gtfs
error parsing gtfs_bdfm_20180801_231853.gtfs
error parsing gtfs_nqrw_20180801_054424.gtfs
error parsing gtfs_nqrw_20180801_055654.gtfs
error parsing gtfs_nqrw_20180801_060009.gtfs
error parsing gtfs_nqrw_20180801_060854.gtfs
error parsing gtfs_nqrw_20180801_065155.gtfs
error parsing gtfs_nqrw_20180801_070124.gtfs
error parsing gtfs_nqrw_20180801_071654.gtfs
error parsing gtfs_nqrw_20180801_090124.gtfs
error parsing gtfs_nqrw_20180801_093340.gtfs
error parsing gtfs_nqrw_20180801_093409.gtfs
error parsing gtfs_nqrw_20180801_093924.gtfs
error parsing gtfs_nqrw_20180801_094824.gtfs
error parsing gtfs_nqrw_20180801_094854.gtfs
error pars

In [435]:
vehicles = []
for e in feed.entity:
  if str(e.vehicle):
    vehicles.append({
      'id': e.id,
      'trip_id': e.vehicle.trip.trip_id,
      'start_date': e.vehicle.trip.start_date,
      'route_id': e.vehicle.trip.route_id,
      'direction': e.vehicle.trip.ListFields()[-1][-1].direction,
      'current_stop_sequence': e.vehicle.current_stop_sequence,
      'current_status': e.vehicle.current_status,
      'timestamp': e.vehicle.timestamp
    })
df = pd.DataFrame(vehicles)

In [443]:
df.sort_values(['route_id','direction','current_stop_sequence'])

Unnamed: 0,current_status,current_stop_sequence,direction,id,route_id,start_date,timestamp,trip_id
10,2,0,1,26000022,A,20180801,1533111583,026500_A..N
11,2,0,1,26000024,A,20180801,1533111583,026500_A..N
15,2,0,1,26000032,A,20180801,1533111583,028500_A..N
16,2,0,1,26000034,A,20180801,1533111583,028000_A..N
8,1,5,1,26000018,A,20180801,1533111577,024500_A..N
6,1,15,1,26000014,A,20180801,1533111537,022500_A..N
4,1,26,1,26000010,A,20180801,1533111517,020500_A..N
2,1,38,1,26000006,A,20180801,1533111577,018500_A..N
1,1,52,1,26000004,A,20180801,1533111577,016500_A..N
9,0,0,3,26000020,A,20180801,1533111583,026000_A..S


In [429]:
df.sort_values('timestamp')

Unnamed: 0,current_status,current_stop_sequence,id,route_id,start_date,timestamp,trip_id
4,1,26,26000010,A,20180801,1533111517,020500_A..N
20,1,13,28000006,E,20180801,1533111537,023200_E..N
6,1,15,26000014,A,20180801,1533111537,022500_A..N
19,1,30,28000004,E,20180801,1533111547,019200_E..S
21,1,26,28000008,E,20180801,1533111562,022050_E..N
25,1,4,28000016,E,20180801,1533111562,025100_E..S
27,1,5,56000002,H,20180801,1533111562,025200_H..S
18,1,18,28000002,E,20180801,1533111562,022426_E..S
28,1,3,56000004,H,20180801,1533111562,025500_H..N
22,1,1,28000010,E,20180801,1533111569,025700_E..N


In [406]:
[(e.vehicle.current_stop_sequence, e.vehicle.trip.route_id) for e in feed.entity if str(e.vehicle)]

[(47, 'A'),
 (52, 'A'),
 (38, 'A'),
 (43, 'A'),
 (26, 'A'),
 (31, 'A'),
 (15, 'A'),
 (19, 'A'),
 (5, 'A'),
 (0, 'A'),
 (0, 'A'),
 (0, 'A'),
 (7, 'A'),
 (0, 'A'),
 (0, 'A'),
 (0, 'A'),
 (0, 'A'),
 (0, 'A'),
 (18, 'E'),
 (30, 'E'),
 (13, 'E'),
 (26, 'E'),
 (1, 'E'),
 (0, 'E'),
 (0, 'E'),
 (4, 'E'),
 (0, 'E'),
 (5, 'H'),
 (3, 'H'),
 (0, 'H'),
 (0, 'H'),
 (0, 'H'),
 (3, 'FS'),
 (0, 'FS'),
 (0, 'FS'),
 (0, 'FS'),
 (0, 'FS')]

In [279]:
[p.replacement_period.end - feed.header.timestamp for p in feed.header.ListFields()[-1][-1].trip_replacement_period]

[1800, 1786, 1788, 1791, 1792]

In [60]:
desc = entity.ListFields()[0][0]

In [69]:
[x[0].name for x in entity.ListFields()]

['id', 'trip_update']

### protobuf to dict

In [291]:
from protobuf_to_dict import protobuf_to_dict

In [295]:
protobuf_to_dict(feed)['header']['___X']['1001'][]

{'nyct_subway_version': '1.0',
 'trip_replacement_period': [{'route_id': 'A',
   'replacement_period': {'end': 1533113386}},
  {'route_id': 'C', 'replacement_period': {'end': 1533113372}},
  {'route_id': 'E', 'replacement_period': {'end': 1533113374}},
  {'route_id': 'H', 'replacement_period': {'end': 1533113377}},
  {'route_id': 'FS', 'replacement_period': {'end': 1533113378}}]}

### Exploration

In [84]:
s = shapes[shapes.shape_id == '1..N03R'].copy(deep=True)

# geoplotlib.dot(shapes.rename({'shape_pt_lat':'lat', 'shape_pt_lon':'lon'}, axis=1))
# geoplotlib.show()

In [74]:
g = pd.DataFrame()
g['src_lat'] = s.shape_pt_lat.shift(1)
g['src_lon'] = s.shape_pt_lon.shift(1)
g['dest_lat'] = s.shape_pt_lat
g['dest_lon'] = s.shape_pt_lon

In [99]:
gpl.graph(g[1:],'src_lat','src_lon','dest_lat','dest_lon', linewidth=10, color='Blues')

In [100]:
gpl.show()

In [181]:
scheds.columns

Index(['A25N', 'A27N', 'A28N', 'A30N', 'A31N', 'A32N', 'A33N', 'A34N', 'D14N',
       'F06N', 'F09N', 'F11N', 'F12N', 'G05N', 'G06N', 'G07N', 'G08N', 'G14N',
       'G21N'],
      dtype='object')

In [187]:
stops[stops.stop_id.]

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [186]:
stops.set_index('stop_id')

Unnamed: 0_level_0,stop_name,stop_lat,stop_lon,location_type,parent_station
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
101,Van Cortlandt Park - 242 St,40.889248,-73.898583,1,
101N,Van Cortlandt Park - 242 St,40.889248,-73.898583,0,101
101S,Van Cortlandt Park - 242 St,40.889248,-73.898583,0,101
103,238 St,40.884667,-73.900870,1,
103N,238 St,40.884667,-73.900870,0,103
103S,238 St,40.884667,-73.900870,0,103
104,231 St,40.878856,-73.904834,1,
104N,231 St,40.878856,-73.904834,0,104
104S,231 St,40.878856,-73.904834,0,104
106,Marble Hill - 225 St,40.874561,-73.909831,1,
