From the current Solr set as of 2015-09-22T21:00.

Initial run to determine:

1. How many of the parsable XML responses in the set include a schema location?
2. What are the unique schemas?
3. How many are federally-hosted?

TODOs:

- aggregate by protocol (post-identification)

In [1]:
import os
import json
import glob
from lxml import etree

In [2]:
# parsing code
# note: there's some older cruft to deal with
# related to cdata, encodings, etc.

_total_responses = 677510
_doc_dir = '/Users/sparky/Documents/solr_responses/solr_20150922_docs/'
_xpaths = [
    ['//*', '@schemaLocation'],
    ['//*', '@noNamespaceSchemaLocation']
]

def generate_localname_xpath(tags):
    """Build a namespace-agnostic XPath from a sequence of path steps.

    Steps that are already XPath syntax ('*', '..', '.', '//*') are passed
    through untouched. Every other step is turned into a ``local-name()``
    test; a step containing '@' becomes an attribute test. The resulting
    steps are joined with '/'.
    """
    passthrough = ('*', '..', '.', '//*')
    steps = []
    for tag in tags:
        if tag in passthrough:
            steps.append(tag)
            continue
        # '@' anywhere in the step marks it as an attribute lookup
        axis = '@' if '@' in tag else ''
        steps.append('%s*[local-name()="%s"]' % (axis, tag.replace('@', '')))
    return '/'.join(steps)


def extract_attribs(elem, tags):
    """Return the whitespace-normalized values matched by *tags* under *elem*.

    The XPath result may be a list of values or a single value; either way a
    list is returned, with every run of whitespace in each value collapsed
    to a single space and leading/trailing whitespace removed.
    """
    matches = extract_elems(elem, tags)
    if not isinstance(matches, list):
        # a lone result is treated as a one-item list
        matches = [matches]
    return [' '.join(value.split()) for value in matches]


def extract_elems(elem, tags):
    """Evaluate the local-name XPath built from *tags* against *elem*."""
    return elem.xpath(generate_localname_xpath(tags))


def _clean_content(response):
    """Strip line-break, tab and 'ufffd' sequences from *response*, then re-encode it.

    Returns the cleaned text after a utf-8 decode (errors replaced) and a
    ``unicode_escape`` encode.

    NOTE(review): ``.decode`` is called on the text taken from the JSON
    document; that method only exists on Python 2 strings, so this function
    presumably requires Python 2 -- confirm before porting.
    """
    # Remove backslash+newline, CRLF, the two-character sequences backslash-r
    # and backslash-n, and real newline characters.
    # NOTE(review): these are deleted rather than replaced with a space, so
    # tokens that were separated only by a line break end up joined -- confirm
    # that this is intended for multi-line attribute values.
    response = response.replace('\\\n', '').replace('\r\n', '').replace('\\r', '').replace('\\n', '').replace('\n', '')
    # Same treatment for tabs: backslash+tab, the two-character sequence
    # backslash-t, and real tab characters.
    response = response.replace('\\\t', '').replace('\\t', '').replace('\t', '')
    # this is likely useless (mostly issues in the json)
    # NOTE(review): how the '\ufffd' literals are read depends on whether
    # they are byte-string or unicode literals -- verify under the
    # interpreter actually used.
    response = response.replace('\\\\ufffd', '').replace('\\\ufffd', '').replace('\\ufffd', '').replace('\ufffd', '')
    # Decode as utf-8 (undecodable bytes replaced), then escape-encode the result.
    response = response.decode('utf-8', errors='replace').encode('unicode_escape') 
    return response


def _parse_content(response):
    """Parse *response* with a default lxml ``XMLParser`` and return the root element."""
    return etree.fromstring(response, parser=etree.XMLParser())


def prep_content(filename):
    """Load one JSON document and return its ``raw_content`` parsed as XML.

    The ``raw_content`` value (empty string when the key is absent) is run
    through ``_clean_content`` and then ``_parse_content``; any error raised
    while reading, cleaning or parsing propagates to the caller.
    """
    with open(filename, 'r') as handle:
        document = json.load(handle)
    raw = document.get('raw_content', '')
    return _parse_content(_clean_content(raw))


In [3]:
# Gather the schema locations from every response that parses as XML.
# The extraction step returns each attribute value whole and does *not*
# consider failovers, ie the ="schema_a.xsd schema_b.xsd" situation,
# so the values are split apart here, after extraction.

parsed_responses = 0
responses_with_a_schema = 0
failed_responses = []
unique_schemas = set()
packed_unique_schemas = set()

for i, f in enumerate(glob.glob(_doc_dir + '*.json')):
    try:
        xml = prep_content(f)
    except Exception as ex:
        # could not be read/cleaned/parsed: remember why and move on
        failed_responses.append((f, ex))
        continue
    parsed_responses += 1

    schemas = []
    for xp in _xpaths:
        schemas.extend(extract_attribs(xml, xp))

    if schemas:
        # "packed": the attribute values exactly as extracted
        packed_unique_schemas.update(schemas)

        # "unpacked": one entry per whitespace-separated token
        schemas = [token.strip() for packed in schemas for token in packed.split()]
        unique_schemas.update(schemas)
        responses_with_a_schema += 1

In [4]:
parsed_responses

509608

In [5]:
responses_with_a_schema

124769

In [6]:
len(failed_responses)

167902

In [8]:
len(unique_schemas)

2382

In [9]:
len(packed_unique_schemas)

1960

In [7]:
# Dump both sets, one entry per line (no newline after the last entry).
for out_path, entries in (
    ('outputs/unique_schemas.txt', unique_schemas),
    ('outputs/unique_packed_schemas.txt', packed_unique_schemas),
):
    with open(out_path, 'w') as f:
        f.write('\n'.join(entries))

And here we pause for a bit to do some manual cleanup. After that, each of our (now three) schema lists — unique, unique packed, and federal — needs to be de-duplicated again.

Note: unique is the schema location split on whitespace (one entry per schema listed), unique packed is the schema location value as is, and federal is the list of federally-hosted schemas.

So re-open, set, sort and save.

In [3]:
def _write_sorted_unique(src_path, dst_path):
    """Deduplicate and sort the lines of *src_path*, saving them to *dst_path*.

    Every line is stripped of surrounding whitespace (line ending included)
    before it is compared. Comparing the raw lines, as was done previously,
    treats a final line that has no trailing newline as different from an
    otherwise identical line, and after sorting glues it onto whichever
    entry follows it in the output. Blank lines are dropped.

    The output has one entry per line, each terminated by a newline.
    Returns the sorted list of unique entries.
    """
    with open(src_path, 'r') as f:
        entries = set(line.strip() for line in f)
    entries.discard('')

    ordered = sorted(entries)
    with open(dst_path, 'w') as f:
        f.writelines(entry + '\n' for entry in ordered)
    return ordered


# The single-argument print(...) form emits the same text as the old
# Python 2 print statements ("label:  N") and also parses on Python 3.
unique = _write_sorted_unique('outputs/unique_schemas.txt',
                              'outputs/unique_schemas_sorted.txt')
print('uniques:  %d' % len(unique))

unique = _write_sorted_unique('outputs/unique_packed_schemas.txt',
                              'outputs/unique_packed_schemas_sorted.txt')
print('packed uniques:  %d' % len(unique))

unique = _write_sorted_unique('outputs/federal_schemas.txt',
                              'outputs/federal_schemas_sorted.txt')
print('feds:  %d' % len(unique))


uniques:  2366
packed uniques:  1960
feds:  206


A few more stats from the unique, unpacked list:

Relative paths or simple file names: 363

In [2]:
# linkrot 11/16/2015
import requests

with open('outputs/federal_schemas_sorted.txt', 'r') as f:
    # Strip before filtering: a line read from a file is never the empty
    # string (a blank line is '\n'), so the check must see the stripped value
    # or blank lines are sent to requests as '' URLs.
    federal_schemas = [s for s in (line.strip() for line in f) if s]

# HEAD each federally-hosted schema URL and record (url, status).
# 900 is a local sentinel, not an HTTP status: it means the request raised
# (timeout, connection failure, malformed or unsupported URL, ...).
# NOTE(review): requests.head does not follow redirects unless told to, so
# 301/302 are recorded as-is -- confirm that is the intended measurement.
statuses = []
for fs in federal_schemas:
    try:
        rsp = requests.head(fs, timeout=30)
    except Exception:
        # Best-effort: any request failure becomes 900. Unlike a bare
        # `except:`, this still lets Ctrl-C / SystemExit stop the loop.
        status_code = 900
    else:
        status_code = rsp.status_code

    statuses.append((fs, status_code))
        

In [3]:
statuses

[('ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem.xsd', 900),
 ('ftp://ftp.ncddc.noaa.gov/pub/Metadata/Online_ISO_Training/Intro_to_ISO/schemas/ISObio/schema.xsd',
  900),
 ('http://api.echo.nasa.gov/echo/wsdl/EchoForms.xsd', 200),
 ('http://aviationweather.gov/adds/schema/aircraftreport1_0.xsd', 200),
 ('http://aviationweather.gov/adds/schema/airsigmet1_1.xsd', 200),
 ('http://aviationweather.gov/adds/schema/gairmet1_0.xsd', 200),
 ('http://aviationweather.gov/adds/schema/pirep1_2.xsd', 200),
 ('http://aviationweather.gov/adds/schema/taf1_2.xsd', 200),
 ('http://data.nodc.noaa.gov/coris/data/CoRIS/fgdc_schema_coris/fgdc-std-001-1998.xsd',
  200),
 ('http://data.usgs.gov/nggdpp/NGGDPPMetadataSample_v2.xsd', 404),
 ('http://earthquake.usgs.gov/eqcenter/shakemap/xml/schemas/shakemap.xsd',
  301),
 ('http://earthquake.usgs.gov/shakemap/xml/schemas/shakemap.xsd', 404),
 ('http://echo.nasa.gov/v9/echoforms', 301),
 ('http://fgdcxml.sourceforge.net/schema/fgdc-std-012-2002/fgdc-st

In [None]:
import requests

with open('outputs/unique_schemas_sorted.txt', 'r') as f:
    # Strip before filtering: a line read from a file is never the empty
    # string (a blank line is '\n'), so the check must see the stripped value.
    schemas = [s for s in (line.strip() for line in f) if s]

# HEAD every unique schema location and record (url, status).
# 900 is a local sentinel, not an HTTP status: it means the request raised
# (timeout, connection failure, relative path / bare file name, ...).
all_statuses = []
for fs in schemas:
    try:
        rsp = requests.head(fs, timeout=30)
    except Exception:
        # Best-effort: any request failure becomes 900. Unlike a bare
        # `except:`, this still lets Ctrl-C / SystemExit stop the loop.
        status_code = 900
    else:
        status_code = rsp.status_code

    all_statuses.append((fs, status_code))

with open('outputs/schema_linkrot.csv', 'w') as f:
    # The status is an int, so ','.join(row) would raise TypeError; format
    # each row explicitly as "url",status instead.
    f.write('\n'.join(['"{0}",{1}'.format(s[0], s[1]) for s in all_statuses]))

In [3]:
# One row per checked schema: quoted URL, then the recorded status.
with open('outputs/schema_linkrot.csv', 'w') as f:
    rows = ['"{0}",{1}'.format(entry[0], entry[1]) for entry in all_statuses]
    f.write('\n'.join(rows))

In [4]:
# (url, status) pairs for the federally-hosted schemas. This appears to be a
# pasted copy of the result displayed by the earlier federal linkrot cell;
# 900 is the sentinel that cell records when the HEAD request raised.
# NOTE(review): a few entries look malformed rather than dead -- one has a
# literal backslash-t between two URLs, one has a second URL appended to
# '.../premis/premis.xsd' with no separator, and two start with 'http:/'
# (single slash). Several hosts are not .gov (libproxy.lib.unc.edu,
# stategeothermaldata.org, www.imsglobal.org, fgdcxml.sourceforge.net) --
# confirm they belong in the federal list.
statuses = [('ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem.xsd', 900),
 ('ftp://ftp.ncddc.noaa.gov/pub/Metadata/Online_ISO_Training/Intro_to_ISO/schemas/ISObio/schema.xsd',
  900),
 ('http://api.echo.nasa.gov/echo/wsdl/EchoForms.xsd', 200),
 ('http://aviationweather.gov/adds/schema/aircraftreport1_0.xsd', 200),
 ('http://aviationweather.gov/adds/schema/airsigmet1_1.xsd', 200),
 ('http://aviationweather.gov/adds/schema/gairmet1_0.xsd', 200),
 ('http://aviationweather.gov/adds/schema/pirep1_2.xsd', 200),
 ('http://aviationweather.gov/adds/schema/taf1_2.xsd', 200),
 ('http://data.nodc.noaa.gov/coris/data/CoRIS/fgdc_schema_coris/fgdc-std-001-1998.xsd',
  200),
 ('http://data.usgs.gov/nggdpp/NGGDPPMetadataSample_v2.xsd', 404),
 ('http://earthquake.usgs.gov/eqcenter/shakemap/xml/schemas/shakemap.xsd',
  301),
 ('http://earthquake.usgs.gov/shakemap/xml/schemas/shakemap.xsd', 404),
 ('http://echo.nasa.gov/v9/echoforms', 301),
 ('http://fgdcxml.sourceforge.net/schema/fgdc-std-012-2002/fgdc-std-012-2002.xsd',
  200),
 ('http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/', 200),
 ('http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/dif.xsd', 200),
 ('http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/dif_v9.4.xsd', 200),
 ('http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/dif_v9.7.1.xsd', 200),
 ('http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/dif_v9.8.4.xsd', 200),
 ('http://gcmd.gsfc.nasa.gov/Aboutus/xml/kms/kms.xsd', 200),
 ('http://graphical.weather.gov/xml/DWMLgen/schema/DWML.xsd', 200),
 ('http://graphical.weather.gov/xml/DWMLgen/schema/meta_data.xsd', 200),
 ('http://hesapi.lbl.gov/mf_api/xsd/add_lu_schedule_obj', 200),
 ('http://hesapi.lbl.gov/mf_api/xsd/calculate', 200),
 ('http://hesapi.lbl.gov/mf_api/xsd/nearest_weather', 200),
 ('http://hesapi.lbl.gov/mf_api/xsd/retrieve_results', 200),
 ('http://hesapi.lbl.gov/mf_api/xsd/retrofit', 200),
 ('http://hesapi.lbl.gov/mf_api/xsd/submit_dhw_water_heater', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/archive_buildings_by_id', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/building_ca_id', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/calc_energy_multi', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/calculate', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/delete_buildings_by_id', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/doe2sim', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/export_label_results', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/generate_label', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/login_lbl', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/register_qa', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_buildings_by_address', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_buildings_by_id', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_extended_results', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_inputs', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_label_results', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_legacy_buildings', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/retrieve_recommendations', 200),
 ('http://hesapi.lbl.gov/st_api/xsd/user_exists', 200),
 ('http://isis.astrogeology.usgs.gov/Schemas/Application/application.xsd',
  200),
 ('http://isis.astrogeology.usgs.gov/Schemas/Documentation/documentation.xsd',
  200),
 ('http://lcweb2.loc.gov/mets/Schemas/AMD.xsd', 200),
 ('http://lcweb2.loc.gov/mets/Schemas/VMD.xsd', 200),
 ('http://leisp.usdoj.gov/niem/FoiaAnnualReport/exchange/1.02', 900),
 ('http://leisp.usdoj.gov/niem/FoiaAnnualReport/exchange/1.03', 900),
 ('http://ngdc.noaa.gov/metadata/published/xsd/ngdcSchema/schema.xsd', 200),
 ('http://ngdc.noaa.gov/metadata/published/xsd/schema.xsd', 200),
 ('http://ngdc.noaa.gov/mgg/ecs/metadata/seismic/xml/seismicMetadata.xsd',
  200),
 ('http://nssdc.gsfc.nasa.gov/schema/astro_mission.xsd', 200),
 ('http://nvd.nist.gov/schema/configuration_0.1.xsd', 301),
 ('http://nvd.nist.gov/schema/nvd-cce-feed_0.1.xsd', 301),
 ('http://nvd.nist.gov/schema/scap-core_0.3.xsd', 301),
 ('http://pds.jpl.nasa.gov/pds4/pds/v1/PDS4_PDS_1101.xsd', 302),
 ('http://pds.jpl.nasa.gov/pds4/pds/v1/PDS4_PDS_1201.xsd', 302),
 ('http://pds.jpl.nasa.gov/pds4/pds/v1/PDS4_PDS_1301.xsd', 302),
 ('http://pds.jpl.nasa.gov/pds4/pds/v1/PDS4_PDS_1400.xsd', 302),
 ('http://pds.jpl.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1000.xsd',
  302),
 ('http://pds.jpl.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1101.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1100/dph_example_archive_VG2PLS/xml_schema/PDS4_PDS_1100.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1100/dph_example_products/xml_schema/PDS4_PDS_1100.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1101/dph_example_archive_VG2PLS/xml_schema/PDS4_PDS_1101.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1101/dph_example_products/xml_schema/PDS4_PDS_1101.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1101/dph_example_products/xml_schema/dph_example_dict_1101.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1200/dph_example_archive_VG2PLS/xml_schema/PDS4_PDS_1200.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/dph_examples_1200/dph_example_products/xml_schema/PDS4_PDS_1200.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/test_0100/PDS4_PDS_1000.xsd',
  302),
 ('http://pds.jpl.nasa.gov/repository/pds4/examples/test_0100/dph_example_dict_0100.xsd',
  302),
 ('http://pds.nasa.gov/pds4/disp/v1', 302),
 ('http://pds.nasa.gov/pds4/disp/v1/PDS4_DISP_1301.xsd', 302),
 ('http://pds.nasa.gov/pds4/dph/v01', 302),
 ('http://pds.nasa.gov/pds4/mission/mvn/v1', 302),
 ('http://pds.nasa.gov/pds4/mvn/v1/PDS4_MVN_1000.xsd', 302),
 ('http://pds.nasa.gov/pds4/mvn/v1/PDS4_MVN_1011.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1000.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1100.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1101.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1200.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1300.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1301.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1400.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1410.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1500.xsd', 302),
 ('http://pds.nasa.gov/pds4/pds/v1\\thttp://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1100.xsd',
  302),
 ('http://pds.nasa.gov/pds4/phxmd/v02', 302),
 ('http://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1000.xsd', 302),
 ('http://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1100.xsd', 302),
 ('http://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1101.xsd', 302),
 ('http://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1201.xsd', 302),
 ('http://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1300.xsd', 302),
 ('http://pds.nasa.gov/pds4/schema/released/pds/v1/PDS4_PDS_1400.xsd', 302),
 ('http://pds.nasa.gov/pds4/sp/v1', 302),
 ('http://pubchem.ncbi.nlm.nih.gov.libproxy.lib.unc.edu/hierarchy_data', 302),
 ('http://pubchem.ncbi.nlm.nih.gov.libproxy.lib.unc.edu/pug_rest', 302),
 ('http://pubchem.ncbi.nlm.nih.gov.libproxy.lib.unc.edu/pug_rest/hierarchy_data.xsd',
  302),
 ('http://pubchem.ncbi.nlm.nih.gov.libproxy.lib.unc.edu/pug_rest/pug_rest.xsd',
  302),
 ('http://pubchem.ncbi.nlm.nih.gov/hierarchy_data', 404),
 ('http://pubchem.ncbi.nlm.nih.gov/pug_rest', 301),
 ('http://pubchem.ncbi.nlm.nih.gov/pug_rest/hierarchy_data.xsd', 200),
 ('http://pubchem.ncbi.nlm.nih.gov/pug_rest/pug_rest.xsd', 200),
 ('http://purl.lanl.gov/STB-RL/schemas/2003-09/DII.xsd', 900),
 ('http://purl.lanl.gov/STB-RL/schemas/2004-11/DIDL.xsd', 900),
 ('http://scap.nist.gov/schema/configuration/0.1', 404),
 ('http://scap.nist.gov/schema/feed/configuration/0.1', 404),
 ('http://search.ams.usda.gov/FarmersMarkets/v1/data.svc?xsd=xsd0', 900),
 ('http://search.ams.usda.gov/FarmersMarkets/v1/data.svc?xsd=xsd1', 900),
 ('http://search.ams.usda.gov/FarmersMarkets/v1/data.svc?xsd=xsd2', 900),
 ('http://spaseql.gsfc.nasa.gov/schema/spaseql_query.xsd', 900),
 ('http://starbase.jpl.nasa.gov/pds4/1201/dph_examples_20140625/dph_example_archive_VG2PLS/xml_schema/PDS4_PDS_1201.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1201/dph_examples_20140625/dph_example_archive_VG2PLS/xml_schema/dph_example_dict_1201.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1201/dph_examples_20140625/dph_example_products/xml_schema/PDS4_PDS_1201.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1201/dph_examples_20140625/dph_example_products/xml_schema/dph_example_dict_1201.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1300/dph_example_archive_VG2PLS/xml_schema/PDS4_PDS_1300.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1300/dph_example_archive_VG2PLS/xml_schema/dph_example_dict_1300.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1300/dph_example_products/xml_schema/PDS4_PDS_1300.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1300/dph_example_products/xml_schema/dph_example_dict_1300.xsd',
  302),
 ('http://starbase.jpl.nasa.gov/pds4/1400/dph_example_products/xml_schema/dph_example_dict_1400.xsd',
  302),
 ('http://stategeothermaldata.org/uri-gin/aasg/xmlschema/welllog/0.8', 301),
 ('http://water.usgs.gov/GIS/metadata/usgswrd/fgdc-std-001-1998.xsd', 200),
 ('http://waterservices.usgs.gov/WaterML-1.1.xsd', 200),
 ('http://waterservices.usgs.gov/WaterML-1.2.xsd', 200),
 ('http://weather.gov/ohd/hydroxc/schemas/hydrogen/HydroGenData.xsd', 302),
 ('http://www.csc.noaa.gov/ioos/schema', 301),
 ('http://www.digitalpreservation.gov:/formats/schemas/fdd/v1/htmlTypes.xsd',
  200),
 ('http://www.fgdc.gov/framework/geodeticControl', 404),
 ('http://www.fgdc.gov/metadata/fgdc-std-001-1998.xsd', 200),
 ('http://www.fgdc.gov/schemas/metadata/fgdc-std-001-1998.xsd', 200),
 ('http://www.imsglobal.org/xsd/ims_xlink.xsd', 200),
 ('http://www.loc.gov./standards/mets/mets.xsd', 200),
 ('http://www.loc.gov/AMD/', 404),
 ('http://www.loc.gov/MARC21/slim', 404),
 ('http://www.loc.gov/METS/', 303),
 ('http://www.loc.gov/METS_Profile/', 404),
 ('http://www.loc.gov/METS_Profile/v2', 404),
 ('http://www.loc.gov/VMD/', 404),
 ('http://www.loc.gov/audioMD/', 404),
 ('http://www.loc.gov/ead/ead.xsd', 200),
 ('http://www.loc.gov/mads/mads.xsd', 301),
 ('http://www.loc.gov/mets/mets.xsd', 301),
 ('http://www.loc.gov/mix/', 301),
 ('http://www.loc.gov/mix/mix.xsd', 301),
 ('http://www.loc.gov/mix/mix02.xsd', 301),
 ('http://www.loc.gov/mix/v20', 301),
 ('http://www.loc.gov/mods/', 301),
 ('http://www.loc.gov/mods/v3', 301),
 ('http://www.loc.gov/mods/v3/mods-3-1.xsd', 301),
 ('http://www.loc.gov/standards/amdvmd/audioMD.xsd', 200),
 ('http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd', 200),
 ('http://www.loc.gov/standards/mets/mets.xsd', 200),
 ('http://www.loc.gov/standards/mets/profile_docs/mets.profile.v1-2.xsd', 200),
 ('http://www.loc.gov/standards/mets/profile_docs/mets.profile.v2-0.xsd', 200),
 ('http://www.loc.gov/standards/mets/version16/mets.v1-6.xsd', 200),
 ('http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd', 200),
 ('http://www.loc.gov/standards/mets/xlink.xsd', 200),
 ('http://www.loc.gov/standards/mix/', 200),
 ('http://www.loc.gov/standards/mix/mix.xsd', 200),
 ('http://www.loc.gov/standards/mix/mix02.xsd', 200),
 ('http://www.loc.gov/standards/mix/mix20/mix20.xsd', 200),
 ('http://www.loc.gov/standards/mods/mods.xsd', 200),
 ('http://www.loc.gov/standards/mods/v3/mods-3-0.xsd', 200),
 ('http://www.loc.gov/standards/mods/v3/mods-3-1.xsd', 200),
 ('http://www.loc.gov/standards/mods/v3/mods-3-2.xsd', 200),
 ('http://www.loc.gov/standards/mods/v3/mods-3-3.xsd', 200),
 ('http://www.loc.gov/standards/mods/v3/mods-3-4.xsd', 200),
 ('http://www.loc.gov/standards/mods/xlink.xsd', 200),
 ('http://www.loc.gov/standards/premis', 301),
 ('http://www.loc.gov/standards/premis/', 200),
 ('http://www.loc.gov/standards/premis/PREMIS-v1-0.xsd', 200),
 ('http://www.loc.gov/standards/premis/premis.xsdhttp://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1201.xsd',
  404),
 ('http://www.loc.gov/standards/premis/v1', 301),
 ('http://www.loc.gov/standards/premis/v1/PREMIS-v1-1.xsd', 200),
 ('http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd', 200),
 ('http://www.loc.gov/standards/premis/v2/premis.xsd', 200),
 ('http://www.loc.gov/standards/rights/METSRights.xsd', 200),
 ('http://www.loc.gov/standards/sru/sru1-1archive/xml-files/srw-types.xsd',
  404),
 ('http://www.loc.gov/standards/textMD/textMD-v3.01a.xsd', 200),
 ('http://www.loc.gov/standards/xlink/xlink.xsd', 200),
 ('http://www.loc.gov/zing/srw/', 301),
 ('http://www.ncbi.nlm.nih.gov', 200),
 ('http://www.ncbi.nlm.nih.gov.libproxy.lib.unc.edu', 302),
 ('http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-5/SRA.analysis.xsd?view=co',
  200),
 ('http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-5/SRA.experiment.xsd?view=co',
  200),
 ('http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-5/SRA.run.xsd?view=co',
  200),
 ('http://www.ngdc.noaa.gov/metadata/published/xsd/ngdcSchema/schema.xsd',
  200),
 ('http://www.ngdc.noaa.gov/metadata/published/xsd/schema.xsd', 200),
 ('http://www.ngdc.noaa.gov/metadata/published/xsd/schema/gmd/gmd.xsd', 200),
 ('http://www.ngdc.noaa.gov/metadata/published/xsd/schema/gmi', 301),
 ('http://www.ngdc.noaa.gov/mgg/ecs/metadata/seismic/xml/seismicMetadata.xsd',
  200),
 ('http://www.nws.noaa.gov/forecasts/xml/DWMLgen/schema/DWML.xsd', 301),
 ('http:/hdfeos.gsfc.nasa.gov/Schema/bmgt/collection_valids.xsd', 900),
 ('http:/hdfeos.gsfc.nasa.gov/Schema/bmgt/granule_valids.xsd', 900),
 ('https://ndar.nih.gov:443/DataDictionary/dataDictionary?xsd=1', 415),
 ('https://ndar.nih.gov:443/DataManager/dataManager?xsd=1', 415),
 ('https://starbase.jpl.nasa.gov/pds4/1400/dph_example_archive_VG2PLS/xml_schema/dph_example_dict_1400.xsd',
  200),
 ('https://starbase.jpl.nasa.gov/pds4/1400/dph_example_products/xml_schema/PDS4_PDS_1400.xsd',
  200),
 ('https://starbase.jpl.nasa.gov/pds4/1400/dph_example_products/xml_schema/dph_example_dict_1400.xsd',
  200),
 ('https://www.fgdc.gov/schemas/metadata/fgdc-std-001-1998.xsd', 200)]

# One row per federal schema: quoted URL, recorded status, and a fixed date stamp.
with open('outputs/federal_schema_linkrot.csv', 'w') as f:
    rows = []
    for entry in statuses:
        if not entry:
            continue
        rows.append('"{0}", {1}, 2015-11-15'.format(entry[0], entry[1]))
    f.write('\n'.join(rows))