In [1]:
import gcsfs
import pandas as pd
import json

from siuba.dply.vector import row_number
from siuba import *

# Google Cloud Storage filesystem handle for the cal-itp-data-infra project;
# the cells below use it to list folders and open objects.
fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")

# Root folder of a single timestamped schedule extraction in the gtfs-data bucket.
BUCKET_URL = "gs://gtfs-data/schedule/2021-04-05T00:00:00+00:00"
# Per-feed folder template; the {itp_id} and {url_number} placeholders are
# filled later via str.format.
DATA_URL_TMPL = "/".join([BUCKET_URL, "{itp_id}", "{url_number}"])

In [2]:
def _gtfs_url_for_row(row_df):
    """Return the bucket folder URL for the feed described by a one-row frame.

    The row is squeezed to a Series and its fields are splatted into
    DATA_URL_TMPL, which picks out the itp_id and url_number fields.
    """
    return DATA_URL_TMPL.format(**row_df.squeeze())


def _list_gtfs_files(row_df):
    """List the bucket folder named by the row's gtfs_url.

    The listing is wrapped in a one-element list so that the whole listing is
    stored as a single cell value for the (single) row.
    """
    return [fs.listdir(row_df.squeeze()["gtfs_url"])]


# Per-feed extraction status; columns whose name starts with "Unnamed" are dropped.
status = select(pd.read_csv("/".join([BUCKET_URL, "status.csv"])), -_.startswith("Unnamed"))

# Keep only the feeds whose status is "success".
status_success = filter(status, _.status == "success")

# Grouping on row_number puts every row in its own group, so each mutate
# callable below receives a one-row frame. This is a stand-in for rowwise()
# (an issue has been opened in siuba to implement it); df.apply(..., axis=1)
# would be another option.
_status_by_row = status_success >> group_by(tmp = row_number(_))

# Add the folder URL and its file listing to each row, then drop the grouping.
_status_with_listing = (
    _status_by_row
    >> mutate(gtfs_url = _gtfs_url_for_row, gtfs_files = _list_gtfs_files)
    >> ungroup()
)

# One row per listed file, plus the file's "name" entry as its own column.
tidy_gtfs_files = (
    _status_with_listing.explode("gtfs_files")
    >> mutate(gtfs_file_name = _.gtfs_files.apply(lambda entry: entry["name"]))
)

In [3]:
def _read_validation_json(path):
    """Load and parse one JSON object from the bucket.

    The object is opened through the shared ``fs`` handle inside a ``with``
    block so the remote file handle is closed as soon as parsing finishes,
    instead of being left open for every row processed.

    Parameters
    ----------
    path : str
        Object path as stored in the ``gtfs_file_name`` column.

    Returns
    -------
    The parsed JSON content (a dict for validation.json).
    """
    with fs.open(path) as handle:
        return json.load(handle)


# Keep only the validation.json entry of each feed, then work row by row:
# grouping on row_number gives every mutate callable a one-row frame.
validation_files = (tidy_gtfs_files
  >> filter(_.gtfs_file_name.str.contains("validation\\.json"))
  >> group_by(tmp = row_number(_))
  >> mutate(
      # parsed report, wrapped in a list so it is stored as one cell value
      validation = lambda d: [_read_validation_json(d.squeeze().gtfs_file_name)],
      # top-level "notices" list of the report: one entry per violated code
      notices = lambda d: [d["validation"].iloc[0]["data"]["report"]["notices"]],
      # number of distinct notice codes reported for the feed
      n_codes = lambda d: len(d["notices"].iloc[0])
  )
  >> ungroup()
)

### Sanity check that each successful unzip has validation.json

In [4]:
# Number of feeds for which a validation.json file was found and parsed.
count(validation_files)

Unnamed: 0,n
0,171


In [5]:
# Number of feeds with status "success"; the sanity check passes when this
# equals the number of rows in validation_files.
count(status_success)

Unnamed: 0,n
0,171


In [6]:
# Peek at the raw parsed validation report stored under index label 0.
validation_files["validation"][0]

{'version': 'v2.0.0',
 'data': {'report': {'notices': [{'code': 'duplicate_fare_rule_zone_id_fields',
     'severity': 'ERROR',
     'totalNotices': 8,
     'notices': [{'csvRowNumber': 61,
       'fareId': 'L17_E',
       'previousCsvRowNumber': 59,
       'previousFareId': 'TL25'},
      {'csvRowNumber': 62,
       'fareId': 'L17_W',
       'previousCsvRowNumber': 60,
       'previousFareId': 'TL25'},
      {'csvRowNumber': 79,
       'fareId': 'L17_E',
       'previousCsvRowNumber': 77,
       'previousFareId': 'TL25'},
      {'csvRowNumber': 80,
       'fareId': 'L17_W',
       'previousCsvRowNumber': 78,
       'previousFareId': 'TL25'},
      {'csvRowNumber': 87,
       'fareId': 'L17_E',
       'previousCsvRowNumber': 85,
       'previousFareId': 'TL25'},
      {'csvRowNumber': 88,
       'fareId': 'L17_W',
       'previousCsvRowNumber': 86,
       'previousFareId': 'TL25'},
      {'csvRowNumber': 94,
       'fareId': 'L17_E',
       'previousCsvRowNumber': 92,
       'previousF

## Tidy notices

Note that validation.json has two levels of "notices":

* codes - the violated rule
* details - each specific case that violated the rule

Both are called "notices" in the JSON data. For example:

```python
{
  data:
    report:
      # note the code, versus nested notices data
      notices: [{code: "some_code", notices: [...]}]
      
    # hopefully should be empty
    system_errors: {...}
}
```

### Notice codes

In [7]:
# One row per (feed, notice code): explode the per-feed list of code entries,
# then turn each entry (a dict holding the code-level fields plus a nested
# "notices" list) into its own small DataFrame.
_codes_exploded = validation_files.explode("notices")
notice_codes = _codes_exploded.assign(
    notices = lambda frame: frame["notices"].transform(pd.DataFrame)
)

# note that siuba unnest currently requires resetting index
# (explode repeats the original index labels for every exploded row)
_codes_reindexed = notice_codes.reset_index(drop=True)

# Keep the identifying columns and flatten the nested frames into long form.
tidy_notice_codes = (
    _codes_reindexed
    >> select(_.agency_name, _.itp_id, _.url_number, _.notices)
    >> unnest("notices")
)

In [8]:
# First five rows of the long-form notice-code table.
tidy_notice_codes.head(n=5)

Unnamed: 0,agency_name,itp_id,url_number,code,severity,totalNotices,notices
0,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 61, 'fareId': 'L17_E', 'previ..."
1,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 62, 'fareId': 'L17_W', 'previ..."
2,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 79, 'fareId': 'L17_E', 'previ..."
3,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 80, 'fareId': 'L17_W', 'previ..."
4,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 87, 'fareId': 'L17_E', 'previ..."


In [9]:
# Spot-check the detail dicts of two adjacent rows, selected by position.
tidy_notice_codes["notices"].iloc[425951:425953].tolist()

[{'csvRowNumber': 6745,
  'tripId': '426',
  'stopSequence': 3,
  'specifiedField': 'arrival_time'},
 {'csvRowNumber': 6745,
  'tripId': '426',
  'stopSequence': 3,
  'specifiedField': 'departure_time'}]

### Notice details

In [10]:
# Build one wide table of notice details.
# It has the same number of rows as tidy_notice_codes, but each "notices" dict
# of details is unpacked into its own columns. Many cells are NA because
# different notices carry different fields; the wide table makes it easier to
# survey all the forms the details can take.
_detail_records = tidy_notice_codes["notices"].tolist()
_df_notice_details = pd.DataFrame(_detail_records)

# Index-aligned left join: both frames are matched on their row index.
tidy_notice_details = tidy_notice_codes.join(_df_notice_details, how="left")
tidy_notice_details.head(n=5)

Unnamed: 0,agency_name,itp_id,url_number,code,severity,totalNotices,notices,csvRowNumber,fareId,previousCsvRowNumber,...,routeLongName,prevStopTimeDistTraveled,prevStopSequence,routeDesc,speedkmh,firstStopSequence,lastStopSequence,stopId,stopName,stopShapeThresholdMeters
0,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 61, 'fareId': 'L17_E', 'previ...",61.0,L17_E,59.0,...,,,,,,,,,,
1,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 62, 'fareId': 'L17_W', 'previ...",62.0,L17_W,60.0,...,,,,,,,,,,
2,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 79, 'fareId': 'L17_E', 'previ...",79.0,L17_E,77.0,...,,,,,,,,,,
3,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 80, 'fareId': 'L17_W', 'previ...",80.0,L17_W,78.0,...,,,,,,,,,,
4,AC Transit,4,0,duplicate_fare_rule_zone_id_fields,ERROR,8,"{'csvRowNumber': 87, 'fareId': 'L17_E', 'previ...",87.0,L17_E,85.0,...,,,,,,,,,,
