In [1]:
# This notebook continues from loading-gtfs-schedules.ipynb: %run executes that
# notebook in this kernel so the names it defines become available here.
# NOTE(review): the cells below use `status`, `tidy_notice_codes` and
# `tidy_notice_details` without defining them -- presumably they come from this
# %run; confirm.
%run loading-gtfs-schedules.ipynb

from siuba import *
from siuba.dply.vector import n
import gcsfs

# Which agencies can we easily load GTFS data for?

This notebook investigates which agencies pass validation. These agencies are a good place to start for analyzing data, and figuring out how to store it in the warehouse.

**tl;dr** -

* Validator codes come in three levels of severity: error, warning, and info.
* There are 16 agencies that don't trigger validator error codes.
* Early win: we can try loading data for these agencies into the warehouse, while we figure out which error codes are less serious / can be worked around.

## What are the different error codes?

In [2]:
# Rows of `status` whose `status` column equals "success", reduced to the three
# columns used further down as the join key for an agency feed.
agencies_successful = (
  status
  >> filter(_.status == "success")
  >> select(_.agency_name, _.itp_id, _.url_number)
)

In [3]:
# Unique (code, severity) pairs, ordered by severity and then by code.
code_severity = (
  tidy_notice_codes
  >> distinct(_.code, _.severity)
  >> arrange(_.severity, _.code)
)

code_severity

Unnamed: 0,code,severity
4,decreasing_or_equal_shape_distance,ERROR
14,decreasing_or_equal_stop_time_distance,ERROR
0,duplicate_fare_rule_zone_id_fields,ERROR
1,invalid_phone_number,ERROR
11,invalid_url,ERROR
9,leading_or_trailing_whitespaces,ERROR
10,missing_required_field,ERROR
21,missing_required_file,ERROR
15,same_name_and_description_for_route,ERROR
2,unknown_column,INFO


## How many error codes does each agency have?

In [4]:
# Reusable pipeline with no data bound yet: pipe a notices table into it with
# `>>` to get one summary row per (agency_name, itp_id, url_number) group,
# ordered by ascending n_codes.
tally_codes = (
  group_by(_.agency_name, _.itp_id, _.url_number)
  >> summarize(
      # number of rows in the group
      n_codes = n(_),
      # the group's codes as a list; the outer list is presumably there so the
      # whole list is stored as a single cell value
      codes = lambda grp: [grp.code.tolist()]
  )
  >> arrange(_.n_codes)
)

In [5]:
# Tally of notice codes per agency feed, without filtering on severity.
# The result is intentionally not displayed in this cell.
ttl_agency_codes = (
  tidy_notice_codes
  >> tally_codes
)

In [6]:
# Step 1: tally only the notices whose severity is "ERROR".
error_code_tally = (
  tidy_notice_codes
  >> filter(_.severity == "ERROR")
  >> tally_codes
)

# Step 2: right-join onto agencies_successful so every row of that table is
# kept; feeds with no matching tally get a missing n_codes, which is turned
# into 0 before sorting by the count.
ttl_agency_error_codes = (
  error_code_tally
  >> right_join(_, agencies_successful, ["agency_name", "itp_id", "url_number"])
  >> mutate(n_codes = _.n_codes.fillna(0).astype(int))
  >> arrange(_.n_codes)
)

ttl_agency_error_codes

Unnamed: 0,agency_name,itp_id,url_number,n_codes,codes
5,Arcadia Transit,17,0,0,
7,Arvin Transit,21,0,0,
12,Baldwin Park Transit,29,0,0,
18,Bell Gardens,36,0,0,
19,Bellflower Bus,37,0,0,
...,...,...,...,...,...
103,Ojai Trolley,231,0,4,"[decreasing_or_equal_shape_distance, invalid_p..."
122,San Diego Metropolitan Transit System,278,0,4,"[decreasing_or_equal_shape_distance, duplicate..."
127,Santa Clara Valley Transportation Authority,294,0,4,"[decreasing_or_equal_shape_distance, duplicate..."
37,Commuter Express,3,0,5,"[decreasing_or_equal_shape_distance, decreasin..."


In [7]:
# How many agency feeds have 0, 1, 2, ... error codes: `n` is the number of
# rows of ttl_agency_error_codes sharing each n_codes value.
(
  ttl_agency_error_codes
  >> count(n_error_codes = _.n_codes)
)

Unnamed: 0,n_error_codes,n
0,0,16
1,1,102
2,2,40
3,3,8
4,4,3
5,5,2


## Which agencies are missing required files?

See the [remix/partridge repo](https://github.com/remix/partridge) for useful diagrams and hints about how this data can go wrong.

In [8]:
# Notice details for the "missing_required_file" code, keeping only the agency
# name and the name of the file.
agency_missing_files = (
  tidy_notice_details
  >> filter(_.code == "missing_required_file")
  >> select(_.agency_name, _.filename)
)

In [9]:
# Each row pairs an agency with one file named in a "missing_required_file"
# notice detail.
# NOTE(review): the earlier wording here described a flag raised for every CSV
# row missing a column, which does not match the missing_required_file filter
# used to build this table -- confirm it was carried over from another check.
agency_missing_files

Unnamed: 0,agency_name,filename
205558,OmniTrans,stop_times.txt
205559,OmniTrans,routes.txt
205560,OmniTrans,trips.txt
205561,OmniTrans,stops.txt
205562,OmniTrans,agency.txt
