This notebook scrapes information about the canonical GTFS validator's rules so it can be loaded into dbt seed files. For each validator version of interest, it downloads the source zipfile published with that version (see the [validator release page](https://github.com/MobilityData/gtfs-validator/releases)), extracts the RULES.md file, and applies a regular expression to the rules Markdown to get a list of rule codes with their human-readable descriptions. The results are saved to one CSV file per validator version.

In [None]:
import zipfile
from io import BytesIO
import requests
import re
import pandas as pd

In [None]:
# Validator releases whose RULES.md should be scraped.
VALIDATOR_VERSIONS = ['2.0.0', '3.1.1', '4.0.0']

# Matches a level-3 markdown heading holding the rule code, a blank line, and
# then the first line of text that follows (the rule's description).
# Group 1 is the code, group 2 is the description. `\r?` tolerates CRLF files.
RULE_PATTERN = re.compile(r'### (\w+)\r?\n\r?\n(.+?)\r?\n')


def download_rules_markdown(version):
    """Download the source archive for a validator release and return RULES.md as text.

    Args:
        version: Release version without the leading "v", e.g. '3.1.1'.

    Returns:
        The contents of RULES.md decoded as UTF-8.

    Raises:
        requests.HTTPError: If GitHub responds with an error status.
        KeyError: If the archive does not contain RULES.md at the expected path.
    """
    # borrowed some syntax here from https://github.com/chihacknight/chn-ghost-buses/blob/main/data_analysis/notebooks/static_gtfs_analysis.ipynb
    url = f'https://github.com/MobilityData/gtfs-validator/archive/refs/tags/v{version}.zip'
    response = requests.get(url, timeout=60)
    # Fail on the HTTP error itself rather than on a confusing BadZipFile
    # raised while trying to unzip an error page.
    response.raise_for_status()
    with zipfile.ZipFile(BytesIO(response.content)) as archive:
        # Decode the bytes instead of calling str() on them: str(bytes) yields
        # the "b'...'" repr, in which newlines, non-ASCII characters and some
        # quotes appear as escape sequences that would leak into the output.
        return archive.read(f'gtfs-validator-{version}/RULES.md').decode('utf-8')


def parse_rule_descriptions(rules_markdown):
    """Map each rule code found in the markdown to its description line.

    Args:
        rules_markdown: Text of a RULES.md file.

    Returns:
        A dict of {code: description} in document order. If a code appears
        more than once, the last description wins.
    """
    return dict(RULE_PATTERN.findall(rules_markdown))


def build_rules_frame(rule_text, version):
    """Convert a {code: description} mapping into the seed-file dataframe.

    Args:
        rule_text: Mapping of rule code to human-readable description.
        version: Release version without the leading "v".

    Returns:
        A DataFrame with columns code, human_readable_description, version.
    """
    rules_df = pd.DataFrame(
        list(rule_text.items()),
        columns=['code', 'human_readable_description'],
    )
    # record version within dataframe
    rules_df['version'] = 'v' + version
    return rules_df


# A notebook cell (or a direct script run) executes as "__main__", so this
# still runs as before; the guard only keeps the helpers importable without
# triggering downloads and file writes.
if __name__ == '__main__':
    for version in VALIDATOR_VERSIONS:
        print(f'Processing version {version}')
        print('Loading zipfile')
        rules = download_rules_markdown(version)

        print('Parsing rules')
        rules_df = build_rules_frame(parse_rule_descriptions(rules), version)

        filename = f'gtfs_schedule_validator_rule_details_v{version}.csv'
        print(f'Saving to {filename}')
        rules_df.to_csv(filename, index=False)