### Quick Exploration of linkrot in data.gov harvested metadata

Using the CSW endpoint and some sampling per 100 items.

This does not look at linkrot in the harvest endpoints themselves, but at the links included in the harvested metadata. That is, the harvest source is still live (possibly unchanged since the initial harvest) and served through data.gov, but the links referenced *within* the metadata record are no longer available for some reason. 

Steps: 

1. get the CSW GetCapabilities
2. make a GetRecords request with resultType=hits to know how many ISO records we'll find
3. 

Of course, this fails if the ISO record doesn't contain distribution links. 

In [2]:
from owslib.csw import CatalogueServiceWeb
from lxml import etree
import random 
import requests

# Base URL of the data.gov CSW endpoint; handed to the owslib client below.
_csw_service = 'http://catalog.data.gov/csw'
# CSW protocol version requested from the service.
_csw_version = '2.0.2'

# presumably the number of records to sample - TODO confirm where it is used
_sample_size = 4
# Page size sent as maxRecords in the GetRecords requests.
_request_size = 10  # data.gov limits this (at least for requests + full in getrecords)


In [5]:
# Connect the owslib CSW client to the data.gov catalogue.
init_csw = CatalogueServiceWeb(url=_csw_service, version=_csw_version)

In [6]:
# Ask the catalogue only for the number of matching ISO records
# (resulttype 'hits') so we know the total - which, through owslib,
# is doing bupkus.
hits_kwargs = dict(
    typenames='gmd:MD_Metadata',
    outputschema='http://www.isotc211.org/2005/gmd',
    resulttype='hits',
    format='application/xml'
)
hit_request = init_csw.getrecords2(**hits_kwargs)
hit_request

timeout: timed out

In [7]:
# so we'll do it by hand: the same hits query, sent as a plain HTTP GET.
hits_url = 'http://catalog.data.gov/csw?service=CSW&request=GetRecords&version=2.0.2&outputFormat=application/xml&outputSchema=http://www.isotc211.org/2005/gmd&resultType=hits&typeNames=gmd:MD_Metadata&ElementSetName=summary'
hit_req = requests.get(hits_url)

# anything other than a 200: dump the body to see what the service said
if not hit_req.status_code == 200:
    print(hit_req.content)

In [13]:
# Parse the hits response, then echo the raw XML for inspection.
xml = etree.fromstring(hit_req.content)

print(hit_req.content)

# numberOfRecordsMatched is an attribute of the SearchResults element; the
# xpath matches on local-name() so namespaces don't matter. If the attribute
# is missing, fall back to -9999.
matched = xml.xpath('//*[local-name()="SearchResults"]/@*[local-name()="numberOfRecordsMatched"]')
total = int(matched[0]) if matched else int('-9999')
total

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- pycsw 1.10.1 -->
<csw:GetRecordsResponse xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:inspire_common="http://inspire.ec.europa.eu/schemas/common/1.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dct="http://purl.org/dc/terms/" xmlns:ows="http://www.opengis.net/ows" xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0" xmlns:gml="http://www.opengis.net/gml" xmlns:dif="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:gco="http://www.isotc211.org/2005/gco" xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:srv="http://www.isotc211.org/2005/srv" xmlns:ogc="http://www.opengis.net/ogc" xmlns:fgdc="http://www.opengis.net/cat/csw/csdgm" xmlns:inspire_ds="http://inspire.ec.europa.eu/schemas/inspire_ds/1.0" xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" xmlns:xsi="http://www.w3.org/200

103948

In [None]:
# Okay, so we need to iterate over the service.
# I think the plan is: pull down the brief/summary records,
# get each record's id so it can be pulled by id later
# (i.e. pull the sample first, then ping one by one, because the service is not speedy).

In [21]:
# record positions start at 1, not at zero
start_position = 1

# Fill the start position and page size into the GetRecords URL, then fetch it.
records_url = 'http://catalog.data.gov/csw?service=CSW&request=GetRecords&version=2.0.2&startPosition=%s&maxRecords=%s&outputFormat=application/xml&outputSchema=http://www.isotc211.org/2005/gmd&resultType=results&typeNames=gmd:MD_Metadata&ElementSetName=full' % (start_position, _request_size)
response_req = requests.get(records_url)

# anything other than a 200: dump the body to see what the service said
if not response_req.status_code == 200:
    print(response_req.content)



In [23]:
# Show the response body as a single line by dropping every newline
# (no idea why we've got the newlines).
''.join(response_req.content.split('\n'))



In [18]:
# oops: the response came back with a SQL query in it (don't do that) -
# presumably from asking for start position 0; the record index starts at 1
response_req.content

'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<!-- pycsw 1.10.1 -->\n<ows:ExceptionReport xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:inspire_common="http://inspire.ec.europa.eu/schemas/common/1.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dct="http://purl.org/dc/terms/" xmlns:ows="http://www.opengis.net/ows" xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0" xmlns:gml="http://www.opengis.net/gml" xmlns:dif="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:gco="http://www.isotc211.org/2005/gco" xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:srv="http://www.isotc211.org/2005/srv" xmlns:ogc="http://www.opengis.net/ogc" xmlns:fgdc="http://www.opengis.net/cat/csw/csdgm" xmlns:inspire_ds="http://inspire.ec.europa.eu/schemas/inspire_ds/1.0" xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" xmlns:xsi="http://www.w3.org/200

In [None]:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- pycsw 1.10.1 -->
<csw:GetRecordsResponse 
    xmlns:dc="http://purl.org/dc/elements/1.1/" 
    xmlns:inspire_common="http://inspire.ec.europa.eu/schemas/common/1.0" 
    xmlns:atom="http://www.w3.org/2005/Atom" 
    xmlns:xs="http://www.w3.org/2001/XMLSchema" 
    xmlns:dct="http://purl.org/dc/terms/" 
    xmlns:ows="http://www.opengis.net/ows" 
    xmlns:apiso="http://www.opengis.net/cat/csw/apiso/1.0" 
    xmlns:gml="http://www.opengis.net/gml" 
    xmlns:dif="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" 
    xmlns:xlink="http://www.w3.org/1999/xlink" 
    xmlns:gco="http://www.isotc211.org/2005/gco" 
    xmlns:gmd="http://www.isotc211.org/2005/gmd" 
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" 
    xmlns:srv="http://www.isotc211.org/2005/srv" 
    xmlns:ogc="http://www.opengis.net/ogc" 
    xmlns:fgdc="http://www.opengis.net/cat/csw/csdgm" 
    xmlns:inspire_ds="http://inspire.ec.europa.eu/schemas/inspire_ds/1.0" 
    xmlns:csw="http://www.opengis.net/cat/csw/2.0.2" 
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
    xmlns:os="http://a9.com/-/spec/opensearch/1.1/" 
    xmlns:soapenv="http://www.w3.org/2003/05/soap-envelope" 
    xmlns:sitemap="http://www.sitemaps.org/schemas/sitemap/0.9" version="2.0.2" xsi:schemaLocation="http://www.opengis.net/cat/csw/2.0.2 http://schemas.opengis.net/csw/2.0.2/CSW-discovery.xsd">
    <csw:SearchStatus timestamp="2015-04-22T02:55:48Z"/>
    <csw:SearchResults nextRecord="11" numberOfRecordsMatched="102884" numberOfRecordsReturned="10" recordSchema="http://www.isotc211.org/2005/gmd" elementSet="summary"/>
</csw:GetRecordsResponse>