### Parse and convert protocol buffers

Inspired by
https://stackoverflow.com/questions/38958751/parsing-nyc-transit-mta-historical-gtfs-data-not-realtime
Data Source

This extracts data from the protobufs manually downloaded from [MTA Alert Archive](http://web.mta.info/developers/data/archives.html)the latest source suggested at:
https://groups.google.com/d/msg/mtadeveloperresources/Whm5XTVINcE/z-LO12ANAAAJ

Additional feeds are listed here:
http://web.mta.info/developers/developer-data-terms.html

Note that the above historical datasource is outdated, and the above MTA Alert Archive is correct

NOTE: This assumes that the protobufs have already been downloaded to <code>MTADelayPredict/data/raw/status</code> e.g. <code>MTADelayPredict/data/raw/status/201901.zip</code>

In [7]:
import os
data_dir = '../data/raw/status'

In [2]:
proto_file = os.path.join(os.path.join(data_dir), 'gtfs-realtime.proto')
! wget -O $proto_file https://developers.google.com/transit/gtfs-realtime/gtfs-realtime.proto

--2020-04-27 23:32:52--  https://developers.google.com/transit/gtfs-realtime/gtfs-realtime.proto
Resolving developers.google.com (developers.google.com)... 172.217.10.142, 2607:f8b0:4006:812::200e
Connecting to developers.google.com (developers.google.com)|172.217.10.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27065 (26K) [None]
Saving to: ‘../data/raw/status/gtfs-realtime.proto’


2020-04-27 23:32:52 (1.89 MB/s) - ‘../data/raw/status/gtfs-realtime.proto’ saved [27065/27065]



In [3]:
mta_proto_file = os.path.join(os.path.join(data_dir), 'nyct-subway.proto')
! wget -O $mta_proto_file https://api.mta.info/nyct-subway.proto.txt

--2020-04-27 23:32:53--  https://api.mta.info/nyct-subway.proto.txt
Resolving api.mta.info (api.mta.info)... 13.224.215.82, 13.224.215.125, 13.224.215.111, ...
Connecting to api.mta.info (api.mta.info)|13.224.215.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5387 (5.3K) [text/plain]
Saving to: ‘../data/raw/status/nyct-subway.proto’


2020-04-27 23:32:53 (238 MB/s) - ‘../data/raw/status/nyct-subway.proto’ saved [5387/5387]



In [10]:
! protoc -I $data_dir --python_out=$data_dir $data_dir/nyct-subway.proto $data_dir/gtfs-realtime.proto



### Download data to test parsing

Download data as per http://web.mta.info/developers/resources/nyct/MTA-Bus-Time-documentation.htm

Unfortunately we have to fetch the minutely ones, as the daily batches no longer seem available

These should be fetched using <code>wget https://datamine-history.s3.amazonaws.com/gtfs-2014-09-17-09-31<\code> for the timestamp of <code>2014-09-17-09-31<\code>
    
Historical alert data can be found at: https://m.mymtaalerts.com/archive
Realtime service status can be avialable here: http://web.mta.info/status/serviceStatus.txt
    
Through experimentation, it looks like historical data is only available up until 2018-10-14.


In [8]:
import pandas as pd
import urllib3
import sys

In [9]:
sys.path.append(os.path.join(data_dir))
import nyct_subway_pb2
import gtfs_realtime_pb2

In [8]:
import glob
protobuf_paths = glob.glob('{}/[0-9]*.zip'.format(data_dir))

if len(protobuf_paths) == 0:
    raise ValueError('No matching protbufs found in {}, please download from https://m.mymtaalerts.com/archive')
    
print(protobuf_paths)

['../data/raw/status/201811.zip', '../data/raw/status/201812.zip', '../data/raw/status/201901.zip', '../data/raw/status/201902.zip']


In [10]:
import zipfile
import shutil
import progressbar
import io

msg = gtfs_realtime_pb2.FeedMessage()

# Keep a list of files with failed conversions
failed_files = os.path.join(data_dir, 'failures.txt')

force = False

# unzip monthly rollups, then unzip the daily files inside
# This code is largely copied from: https://stackoverflow.com/questions/36285502/how-to-extract-zip-file-recursively-in-python
# The daily zipfiles are ~1GB, so there are big speed gains from unzipping in memory
#for monthly_file in protobuf_paths[-1:]:
for monthly_file in ['../data/raw/status/201903.zip',]:
    widgets = [progressbar.Percentage(), progressbar.Bar(), progressbar.Variable('failures')]    

    
    print("Extracting: " + monthly_file)
    z = zipfile.ZipFile(monthly_file)
    for i,f in enumerate(z.namelist()):
        print("{}/{}".format(i+1, len(z.namelist())))
        # get directory name from file
        dirname = os.path.join(data_dir, os.path.splitext(f)[0])
        # create new directory
        os.makedirs(dirname, exist_ok=True)
        # read inner zip file into bytes buffer 
        content = io.BytesIO(z.read(f))
        zip_file = zipfile.ZipFile(content)
        
        # Skip if already unzipped
        if not force:
            if len(glob.glob(dirname+'/*')) == len(zip_file.namelist()):
                print("Skipping " + os.path.basename(dirname))
                continue
         
        # Iterate through in-memory zipfile, decoding protobuf into json
        bar = progressbar.ProgressBar(widgets=widgets, max_value=len(zip_file.namelist()), min_poll_interval=.5).start()
        failures = 0
        for j,f2 in enumerate(zip_file.namelist()):
            
            try:
                zip_file.extract(f2, dirname)
                
                # add message handler
                # Something like:
                
                #msg.ParseFromString(zip_file.read(f2)) 
            except Exception as e:
                # At the moment, some messages a sporadically unable to parse
                with io.open(failed_files, 'a') as fh:
                    fh.write(f2+'\n')
                    
                failures += 1
            
            # For now, just bail in order to examine the msg object
            # raise Exception("Debug Exception")
            
            sys.stdout.flush()
            bar.update(j+1, failures=failures)
        zip_file.close()
        
        bar.finish()
    
    

Extracting: ../data/raw/status/201903.zip
1/31


NameError: name 'glob' is not defined

In [12]:
%debug

> [0;32m/miniconda3/lib/python3.7/zipfile.py[0m(1325)[0;36m_RealGetContents[0;34m()[0m
[0;32m   1323 [0;31m            [0;32mraise[0m [0mBadZipFile[0m[0;34m([0m[0;34m"File is not a zip file"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1324 [0;31m        [0;32mif[0m [0;32mnot[0m [0mendrec[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1325 [0;31m            [0;32mraise[0m [0mBadZipFile[0m[0;34m([0m[0;34m"File is not a zip file"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1326 [0;31m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mdebug[0m [0;34m>[0m [0;36m1[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1327 [0;31m            [0mprint[0m[0;34m([0m[0mendrec[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  up


> [0;32m/miniconda3/lib/python3.7/zipfile.py[0m(1258)[0;36m__init__[0;34m()[0m
[0;32m   1256 [0;31m        [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1257 [0;31m            [0;32mif[0m [0mmode[0m [0;34m==[0m [0;34m'r'[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1258 [0;31m                [0mself[0m[0;34m.[0m[0m_RealGetContents[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1259 [0;31m            [0;32melif[0m [0mmode[0m [0;32min[0m [0;34m([0m[0;34m'w'[0m[0;34m,[0m [0;34m'x'[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1260 [0;31m                [0;31m# set the modified flag so central directory gets written[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  up


> [0;32m<ipython-input-11-2770aeb685b3>[0m(30)[0;36m<module>[0;34m()[0m
[0;32m     28 [0;31m        [0;31m# read inner zip file into bytes buffer[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     29 [0;31m        [0mcontent[0m [0;34m=[0m [0mio[0m[0;34m.[0m[0mBytesIO[0m[0;34m([0m[0mz[0m[0;34m.[0m[0mread[0m[0;34m([0m[0mf[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 30 [0;31m        [0mzip_file[0m [0;34m=[0m [0mzipfile[0m[0;34m.[0m[0mZipFile[0m[0;34m([0m[0mcontent[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     31 [0;31m[0;34m[0m[0m
[0m[0;32m     32 [0;31m        [0;31m# Skip if already unzipped[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  print f


*** SyntaxError: Missing parentheses in call to 'print'. Did you mean print(f)?


ipdb>  f


'201812.7z'


ipdb>  monthly_file


'../data/raw/status/201812.zip'


ipdb>  f


'201812.7z'


ipdb>  q


In [79]:
# What trains are available here
train_set = set()
for e in entity:
    train_set.append

header {
  gtfs_realtime_version: "1.0"
  incrementality: FULL_DATASET
  timestamp: 1541060371
  [nyct_feed_header] {
    nyct_subway_version: "1.0"
    trip_replacement_period {
      route_id: "J"
      replacement_period {
        end: 1541062171
      }
    }
    trip_replacement_period {
      route_id: "Z"
      replacement_period {
        end: 1541062171
      }
    }
  }
}
entity {
  id: "46000001"
  trip_update {
    trip {
      trip_id: "021500_J..N"
      start_date: "20181101"
      route_id: "J"
      [nyct_trip_descriptor] {
        train_id: "1J 0335 BRD/P-A"
        is_assigned: true
        direction: NORTH
      }
    }
    stop_time_update {
      arrival {
        time: 1541060354
      }
      departure {
        time: 1541060354
      }
      stop_id: "J17N"
      schedule_relationship: SCHEDULED
      [nyct_stop_time_update] {
        scheduled_track: "J1"
        actual_track: "J1"
      }
    }
    stop_time_update {
      arrival {
        time: 1541060444
 