In [1]:
import pandas as pd
import numpy as np
import json
import requests

# real-time data

**Data Source**
- google gtfs-realtime-bindings: https://developers.google.com/transit/gtfs-realtime/examples/python-sample
- gtfs.py: https://pypi.org/project/gtfs.py/
- pygtfs: https://pypi.org/project/pygtfs/
    https://pygtfs.readthedocs.io/en/latest/

**Reference**
- https://www.residentmar.io/2018/01/29/gtfs-tripify.html
- https://rstudio-pubs-static.s3.amazonaws.com/368765_99afb255f92341d5932e585a6889b83e.html

In [2]:
# install dependent library first
# ! pip install --upgrade gtfs-realtime-bindings

In [13]:
from google.transit import gtfs_realtime_pb2
import urllib.request

# Live MTA feed (requires a personal API key -- keep it out of the notebook,
# e.g. read it from an environment variable):
# url = 'http://datamine.mta.info/mta_esi.php?key=<YOUR_MTA_API_KEY>&feed_id=26'
# Archived snapshot used here so the cell is reproducible without a key.
url = 'https://datamine-history.s3.amazonaws.com/gtfs-2014-09-17-09-31'
feed = gtfs_realtime_pb2.FeedMessage()
# The context manager closes the HTTP connection even if parsing fails;
# the timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(url, timeout=30) as response:
    feed.ParseFromString(response.read())

# Show the first two entities (slicing avoids an IndexError on a short feed).
for entity in feed.entity[:2]:
    print(entity)

id: "000001"
trip_update {
  trip {
    trip_id: "050400_1..S02R"
    start_date: "20140917"
    route_id: "1"
  }
  stop_time_update {
    arrival {
      time: 1410960713
    }
    stop_id: "140S"
  }
}

id: "000002"
vehicle {
  trip {
    trip_id: "050400_1..S02R"
    start_date: "20140917"
    route_id: "1"
  }
  current_stop_sequence: 38
  current_status: IN_TRANSIT_TO
  timestamp: 1410960574
  stop_id: "140S"
}



# historical GTFS-realtime

- AmazonAWS DATA: https://datamine-history.s3.amazonaws.com/index.html
    - http://datamine-history.s3.amazonaws.com/gtfs-2014-09-17.tgz
- Read local GTFS files: https://stackoverflow.com/questions/38958751/parsing-nyc-transit-mta-historical-gtfs-data-not-realtime?answertab=votes#tab-top


## Integrate all GTFS feeds of a single day

In [4]:
def parsegtfs(gtfsFilePath, records=None):
    """Parse one GTFS-realtime snapshot file and record the arrivals it shows.

    For every ``trip_update`` entity in the feed, the first stop-time update is
    combined with the ``vehicle`` data of the entity that immediately follows
    it. NOTE(review): this assumes the feed lists each trip_update directly
    before the vehicle entity of the same trip -- confirm for the feeds used.

    A record is kept only when it looks like an actual arrival: the arrival
    time is earlier than the feed header timestamp, or the vehicle status is
    STOPPED_AT.

    Parameters
    ----------
    gtfsFilePath : str
        Path to a file holding one serialized GTFS-realtime ``FeedMessage``.
    records : dict, optional
        Dictionary to add the records to. When omitted, the notebook-level
        global ``dict1`` is used, so existing ``parsegtfs(path)`` calls behave
        as before.

    Returns
    -------
    dict
        The dictionary that was updated. Keys are
        ``'<trip_id>-<gtfs_timestamp>'``; an existing key is overwritten.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    Exception
        Whatever ``FeedMessage.ParseFromString`` raises for malformed input.
    """
    if records is None:
        records = dict1

    # Read the raw bytes; ``with`` closes the handle even if reading fails.
    with open(gtfsFilePath, 'rb') as f:
        raw_str = f.read()
    msg = gtfs_realtime_pb2.FeedMessage()
    msg.ParseFromString(raw_str)

    gtfs_timestamp = msg.header.timestamp
    stopped_at = gtfs_realtime_pb2.VehiclePosition.STOPPED_AT  # enum value 1
    entity_count = len(msg.entity)

    for i, entity in enumerate(msg.entity):
        if not entity.HasField('trip_update'):
            continue

        trip_update = entity.trip_update
        # A trip update without stop-time updates carries no arrival to record;
        # skip it instead of raising and losing the rest of the file.
        if len(trip_update.stop_time_update) == 0:
            continue
        first_update = trip_update.stop_time_update[0]
        arrival_time = first_update.arrival.time
        stop_id = first_update.stop_id
        trip_id = trip_update.trip.trip_id
        route_id = trip_update.trip.route_id

        # Vehicle data comes from the following entity. If the trip update is
        # the last entity, fall back to an empty VehiclePosition: the same
        # default values that reading ``.vehicle`` on a non-vehicle entity gives.
        if i + 1 < entity_count:
            vehicle = msg.entity[i + 1].vehicle
        else:
            vehicle = gtfs_realtime_pb2.VehiclePosition()
        current_stop_sequence = vehicle.current_stop_sequence
        current_status = vehicle.current_status
        vehicle_timestamp = vehicle.timestamp
        vehicle_stop_id = vehicle.stop_id

        # Stopped at the stop, or an arrival time that precedes the feed
        # timestamp, is treated as an actual arrival.
        if arrival_time < gtfs_timestamp or current_status == stopped_at:
            records[trip_id + '-' + str(gtfs_timestamp)] = {
                'trip_id': trip_id,
                'gtfs_timestamp': gtfs_timestamp,
                'arrival_time': arrival_time,
                'stop_id': stop_id,
                'route_id': route_id,
                'current_stop_sequence': current_stop_sequence,
                'current_status': current_status,
                'vehicle_timestamp': vehicle_timestamp,
                'vehicle_stop_id': vehicle_stop_id,
            }

    return records

In [5]:
import os
# Folder holding the GTFS-realtime snapshot files of one day.
FolderPath = "20190630"
# os.listdir returns names in arbitrary order; sorting makes the run
# reproducible (including which entry wins if two files produce the same key).
gtfsFileNames = sorted(os.listdir(FolderPath))
dict1 = {}          # accumulator filled by parsegtfs
failedFiles = {}    # file name -> repr of the error that made us skip it
for gtfsFileName in gtfsFileNames:
    gtfsFilePath = os.path.join(FolderPath, gtfsFileName)
    try:
        parsegtfs(gtfsFilePath)
    except Exception as err:
        # Best effort: keep going, but remember what was skipped and why
        # instead of discarding the error. Unlike a bare ``except``, this
        # does not swallow KeyboardInterrupt.
        failedFiles[gtfsFileName] = repr(err)

if failedFiles:
    print('Skipped %d of %d files; see failedFiles for details'
          % (len(failedFiles), len(gtfsFileNames)))

  


In [6]:
# Number of arrival records collected, then one sample record for inspection.
record_count = len(dict1)
print(record_count)
sample_key = '023580_7..N-1561882795'
dict1[sample_key]

281552


{'trip_id': '023580_7..N',
 'gtfs_timestamp': 1561882795,
 'arrival_time': 1561882785,
 'stop_id': '711N',
 'route_id': '7',
 'current_stop_sequence': 13,
 'current_status': 2,
 'vehicle_timestamp': 1561882785,
 'vehicle_stop_id': '711'}

In [7]:
# One row per recorded arrival, indexed by the dict1 key.
# orient='index' builds the rows directly, which avoids creating a frame with
# one column per record and transposing it, and lets pandas infer a dtype per
# column instead of leaving every column as object.
df = pd.DataFrame.from_dict(dict1, orient='index')
print(df.shape)
df.head()

(281552, 9)


Unnamed: 0,arrival_time,current_status,current_stop_sequence,gtfs_timestamp,route_id,stop_id,trip_id,vehicle_stop_id,vehicle_timestamp
023580_7..N-1561882795,1561882785,2,13,1561882795,7,711N,023580_7..N,711,1561882785
023950_7..S-1561882795,1561882775,2,13,1561882795,7,716S,023950_7..S,716,1561882775
025685_7..N-1561882795,1561882785,2,2,1561882795,7,724N,025685_7..N,724,1561882785
026000_7..S-1561882795,1561882800,1,0,1561882795,7,701S,026000_7..S,701,1561882800
023580_7..N-1561882815,1561882810,2,13,1561882815,7,711N,023580_7..N,711,1561882810


## Drop duplicates

## Match the schedules

## Calculate delays