In [1]:
import pandas as pd
import requests
import yaml
import io
import zipfile

from concurrent.futures import ThreadPoolExecutor


# Browser-like request headers sent with every feed check.
# The User-Agent is assembled from adjacent string literals; each fragment
# except the last ends with a space so the product tokens stay separated
# once Python concatenates them.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.97 Safari/537.36"
    )
}


def check_url(url, timeout=30):
    """Test whether a url serves a readable zip file.

    The function never raises for request-level problems; every outcome is
    reported as a short status string so it can be mapped over many urls
    (e.g. from a thread pool) without one bad url aborting the whole run.

    Parameters
    ----------
    url : str
        Address to download. Anything that is not a non-empty string
        (``None``, ``NaN``, ``""``) is reported as ``"Misformatted url"``
        without issuing a request.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot block a
        worker indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        ``"success"`` if the response body opens as a zip archive,
        ``"HTTP error: <code>"`` for 4xx/5xx responses,
        ``"Misformatted url"`` if the url cannot be parsed or encoded,
        ``"Request failed: <ExceptionName>"`` for any other request error
        (connection refused, timeout, too many redirects, ...),
        ``"failed to unzip"`` if the body is not a valid zip archive.
    """

    # Guard first: missing values would otherwise reach requests and raise
    # an exception type the handlers below were never meant for.
    if not isinstance(url, str) or not url.strip():
        return "Misformatted url"

    try:
        r = requests.get(url, headers=HEADERS, timeout=timeout)
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        # Read the code from the exception so this branch does not depend
        # on `r` having been bound.
        status_code = getattr(err.response, "status_code", "unknown")
        return f"HTTP error: {status_code}"
    except (
        UnicodeError,
        requests.exceptions.InvalidURL,
        requests.exceptions.MissingSchema,
        requests.exceptions.InvalidSchema,
    ):
        return "Misformatted url"
    except requests.exceptions.RequestException as err:
        # Connection errors, timeouts, redirect loops, etc.
        return f"Request failed: {type(err).__name__}"

    try:
        # Opening the archive parses its central directory, which is enough
        # to reject non-zip payloads; the context manager closes it again.
        with zipfile.ZipFile(io.BytesIO(r.content)):
            pass
    except zipfile.BadZipFile:
        return "failed to unzip"

    return "success"



In [2]:
# Raw GitHub location of agencies.yml in the cal-itp/data-infra repository.
agency_data_url = (
    "https://raw.githubusercontent.com"
    "/cal-itp/data-infra/gtfs-list-tasks"
    "/airflow/data/agencies.yml"
)

# Bound the wait and fail fast on a non-2xx response; otherwise an error
# page would be handed to the YAML parser and surface as a confusing
# failure further down.
r = requests.get(agency_data_url, timeout=30)
r.raise_for_status()
agencies_yml = yaml.safe_load(r.text)

# orient="index": each top-level key of the YAML mapping becomes one row.
agencies = pd.DataFrame.from_dict(agencies_yml, orient="index")

## Agencies missing ITP IDs

In [3]:
# Show the agencies whose `itp_id` is null.
agencies.loc[agencies["itp_id"].isna()]

Unnamed: 0,agency_name,gtfs_schedule_url,itp_id
altamont-corridor-express,Altamont Corridor Express,[https://transitfeeds.com/p/altamont-corridor-...,
banning-pass-transit,Banning Pass Transit,[http://data.trilliumtransit.com/gtfs/banning-...,
calaveras-transit,Calaveras Transit,[http://data.trilliumtransit.com/gtfs/calavera...,
clean-air-express,Clean Air Express,[http://www.cleanairexpress.com/GTFS7.24%COVID...,
cloverdale-transit,Cloverdale Transit,[http://data.trilliumtransit.com/gtfs/sonomaco...,


## Agencies not returning zip files

In [4]:
from concurrent import futures

# explode() gives one row per entry of each agency's `gtfs_schedule_url` list.
agencies_long = agencies.explode("gtfs_schedule_url")

# Check the urls on 8 worker threads; executor.map yields results in input
# order, so the statuses line up with the rows.
with ThreadPoolExecutor(max_workers=8) as executor:
    agencies_long = agencies_long.assign(
        status=list(
            executor.map(check_url, agencies_long["gtfs_schedule_url"])
        )
    )

In [5]:
# Widen columns so long feed urls are shown with less truncation.
pd.set_option("display.max_colwidth", 99)

# Every feed whose check did not return "success", ordered by failure
# type and then by agency name.
agencies_long.loc[
    agencies_long["status"].ne("success"),
    ["agency_name", "status", "gtfs_schedule_url", "itp_id"],
].sort_values(by=["status", "agency_name"])

Unnamed: 0,agency_name,status,gtfs_schedule_url,itp_id
ac-transit,AC Transit,HTTP error: 401,https://api.actransit.org/transit/gtfs/current?token=2512B8179D2DC44895CDDC6D42,4.0
b-line,B-Line,HTTP error: 404,http://www.blinetransit.com/documents/google_transit.zip,48.0
clean-air-express,Clean Air Express,HTTP error: 404,http://www.cleanairexpress.com/GTFS7.24%COVID-19%CAE%GTFS.zip,
culver-citybus,Culver CityBus,HTTP error: 404,https://www.culvercity.org/home/showdocument?id=169,87.0
dinuba-area-regional-transit-,Dinuba Area Regional Transit,HTTP error: 404,https://tularecog.org/tcag/data-gis-modeling/gis-project-and-data/gtfs-data-as-of-june-19-/dart...,93.0
grapeline,Grapeline,HTTP error: 404,http://www.mjcaction.com/MJC_GTFS_Public/lodi_google_transit.zip,168.0
long-beach-transit,Long Beach Transit,HTTP error: 404,https://lbtransit.box.com/shared/static/aoyeskwmsa9g7pyg78q3xuiolgqe4f.zip,1.0
manteca-transit,Manteca Transit,HTTP error: 404,https://www.ci.manteca.ca.us/mantecatransit/googletransit/google_transit.zip,192.0
menlo-park-shuttles,Menlo Park Shuttles,HTTP error: 404,https://www.menlopark.org/DocumentCenter/View/73/google_transitzip,199.0
paso-express,Paso Express,HTTP error: 404,http://app.mecatran.com/urb/ws/feed/c2ZT1zbGcmFuc2O2NsaWVudD1zZWxmO2V4cGlyZ7dHlwZT1ndGZzO2tlezZ...,244.0
