## Capturing all of the D4 Validator results

* Capture all the D4 (Caltrans District 4) agencies, plus the agencies with ITP IDs 122 and 81
* Siuba code copy/pasted from `loading_gtfs_schedules`

In [46]:
import gcsfs
import pandas as pd
import json

from siuba.dply.vector import row_number
from siuba import *

# Root folder of the schedule extraction used throughout this notebook; the
# folder name carries the timestamp 2021-04-05T00:00:00+00:00.
BUCKET_URL = "gs://gtfs-data/schedule/2021-04-05T00:00:00+00:00"
# Per-feed folder template below the root; the {itp_id} and {url_number}
# placeholders are filled later with str.format().
DATA_URL_TMPL = BUCKET_URL + "/{itp_id}/{url_number}"

# Google Cloud Storage filesystem handle for the cal-itp-data-infra project,
# used to list and open objects under BUCKET_URL.
fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")

In [47]:
# Read the status table stored at the root of the extraction folder and drop
# every column whose name starts with "Unnamed".
# NOTE(review): those columns are presumably a saved pandas index — confirm.
status = pd.read_csv(BUCKET_URL + "/status.csv") >> select(-_.startswith("Unnamed"))

# Keep only the rows whose status is "success".
status_success = status >> filter(_.status == "success")

# Note that I've opened an issue in siuba to implement rowwise(),
# to replace some cumbersome parts of this group_by -> mutate
# could also use df.apply(lambda x: ..., axis = 1)
#
# Build one row per file found in each successful feed's folder:
#   1. group on the row number so the mutate lambdas run once per row;
#   2. fill DATA_URL_TMPL from the row's own fields (the template needs
#      `itp_id` and `url_number`);
#   3. list that folder, wrapping the listing in a one-element list so the
#      whole listing is assigned to the row;
#   4. explode the listings to one row per entry and pull out each entry's
#      'name'.
# NOTE(review): the helper column `tmp` created by group_by is not dropped
# afterwards — confirm nothing downstream is affected by it.
tidy_gtfs_files = (status_success
    >> group_by(tmp = row_number(_))
    >> mutate(
        gtfs_url = lambda d: DATA_URL_TMPL.format(**d.squeeze()),
        gtfs_files = lambda d: [fs.listdir(d.squeeze()["gtfs_url"])]
    )
    >> ungroup()
    >> pipe(_.explode("gtfs_files"))
    >> mutate(gtfs_file_name = _.gtfs_files.apply(lambda x: x['name']))
)

In [48]:
def load_validation_report(path):
    """Parse one validation JSON report read through the GCS filesystem.

    Parameters
    ----------
    path : str
        Object path of a validation JSON file (a `gtfs_file_name` value).

    Returns
    -------
    The decoded JSON document.
    """
    # Open inside a context manager so the remote file handle is closed as
    # soon as the report is parsed, rather than leaking one handle per report.
    with fs.open(path) as report_file:
        return json.load(report_file)


# One row per validation report among the listed GTFS files.
# Grouping on the row number makes the mutate lambdas run once per report.
validation_files = (tidy_gtfs_files
  >> filter(_.gtfs_file_name.str.contains("validation\\.json"))
  >> group_by(tmp = row_number(_))
  >> mutate(
      # Parsed report, wrapped in a list so the whole document lands in one cell.
      validation = lambda d: [load_validation_report(d.squeeze().gtfs_file_name)],
      # The list found under data -> report -> notices of the parsed report.
      notices = lambda d: [d["validation"].iloc[0]["data"]["report"]["notices"]],
      # Number of entries in that notices list.
      n_codes = lambda d: len(d["notices"].iloc[0])
  )
  >> ungroup()
)
# NOTE(review): only the last expression of a cell is rendered, so the first
# count below is computed but never shown; wrap both in display(...) if the
# two counts are meant to be compared side by side.
validation_files >> count()
status_success >> count()

Unnamed: 0,n
0,171


In [49]:
# Turn each row's notices value into its own DataFrame so it can be unnested.
notice_frames = validation_files["notices"].transform(pd.DataFrame)
notice_codes = validation_files.assign(notices = notice_frames)

# Keep the identifying columns plus the nested frames, then expand the nested
# frames into regular rows and columns.
# note that siuba unnest currently requires resetting index
notice_codes_reindexed = notice_codes.reset_index(drop=True)
tidy_notice_codes = (
    notice_codes_reindexed
    >> select(_.agency_name, _.itp_id, _.url_number, _.notices)
    >> unnest("notices")
)

In [50]:
# CSV export of the "Data" sheet of the Google spreadsheet; the next cell reads
# its 'Agency Name', 'ITP_ID' and 'Caltrans District (int)' columns.
AGENCY_SHEET_CSV_URL = 'https://docs.google.com/spreadsheets/d/1qr49azk6p30mp96_7myKoO-Bb_bXMMn5ZzgbL-uPiPw/gviz/tq?tqx=out:csv&sheet=Data'
df = pd.read_csv(AGENCY_SHEET_CSV_URL)

In [51]:
# Agencies in scope: every Caltrans District 4 agency, plus ITP IDs 122 and 81.
in_district_4 = df['Caltrans District (int)'] == 4
is_extra_agency = (df.ITP_ID == 122) | (df.ITP_ID == 81)

# One {'Agency Name': ..., 'ITP_ID': ...} record per selected agency.
agency_list = (
    df.loc[in_district_4 | is_extra_agency, ['Agency Name', 'ITP_ID']]
    .to_dict(orient='records')
)

In [58]:
# Restrict the notice codes to the selected agencies and save them, without the
# row index, to d4_notice_codes.csv.
d4_itp_ids = [x['ITP_ID'] for x in agency_list]
d4_notice_codes = tidy_notice_codes[tidy_notice_codes.itp_id.isin(d4_itp_ids)]
d4_notice_codes.to_csv('d4_notice_codes.csv', index=False)

In [59]:
# One row per element of each `notices` value, with a fresh 0..n-1 index so the
# positional join below lines up.
notice_codes_long = tidy_notice_codes.explode("notices").reset_index(drop = True)

# Build a frame from the exploded `notices` values and attach its columns
# alongside the originals.
# NOTE(review): presumably each value is a dict of notice fields — confirm.
notice_detail_columns = pd.DataFrame(notice_codes_long.notices.tolist())
tidy_notice_details = notice_codes_long.join(notice_detail_columns)

In [62]:
# Restrict the notice details to the selected agencies and save them to
# d4_notice_details.csv.
# index=False matches the d4_notice_codes.csv export: the index here is just a
# positional counter, and writing it would add an unnamed column to the file.
in_selected_agencies = tidy_notice_details.itp_id.isin([x['ITP_ID'] for x in agency_list])
tidy_notice_details[in_selected_agencies].to_csv('d4_notice_details.csv', index=False)

In [67]:
# Report every selected agency whose ITP ID never appears in the notice codes.
itp_ids_with_notices = tidy_notice_codes['itp_id'].values
for x in agency_list:
    if x['ITP_ID'] not in itp_ids_with_notices:
        print(f'missing {x["Agency Name"]}')

missing Glenn Transit Service
missing SamTrans
missing Santa Rosa CityBus
missing Fairfield and Suisun Transit
missing Tri-Valley Wheels
missing Union City Transit
missing Tri Delta Transit
missing San Francisco Bay Ferry
missing SolTrans
missing Dixon Readi-Ride Transit Services
missing USDA Forest Service Southwest Region
missing Pleasanton Paratransit Service
missing Fremont Paratransit Program
missing Oakland International Airport
missing Emery Go-Round
missing Dumbarton Express
missing Berkeley Lab Bus System
missing Hill Hopper Shuttle
missing San Leandro LINKS
missing West Berkeley Shuttle
missing CSUEB Shuttle
missing Monument Community Shuttle
missing Angel Island-Tiburon Ferry Company
missing Marin Airporter
missing Blue and Gold Fleet
missing Metropolitan Transportation Commission
missing Mission Bay TMA
missing PresidioGo Shuttle
missing Commute.org Shuttles
missing Foster City Senior Express Shuttle
missing San Francisco International Airport
missing Menlo Park Shuttles
mi