In [None]:
1. Can we generate a valid search URL from the URL Template provided?
    a. given the time frame of the harvests, this has to be considered alongside linkrot in the osdd (failed search query may not mean the template is incorrect, could just mean the service url is no longer valid at all).
2. Can we identify best practices (uses esip spatial, uses the time namespace, uses parameter elements)?
3. dataset/granule search (i don't think we have any data to support this at all).

Other info - can you identify the parent osdd of a resultset or of a nested osdd? can you grok it from the url only?

In [20]:
import requests
import json
from lxml import etree
import urlparse
import urllib
from bs4 import BeautifulSoup
from itertools import chain

In [30]:
def extract_namespaces(xml):
    '''
    Pull all of the namespaces in the source document
    and generate a list of tuples (prefix, URI) to dict
    '''
    if xml is None:
        return {}

    document_namespaces = dict(xml.xpath('/*/namespace::*'))
    if None in document_namespaces:
        document_namespaces['default'] = document_namespaces[None]
        del document_namespaces[None]

    # now run through any child namespace issues
    all_namespaces = xml.xpath('//namespace::*')
    for i, ns in enumerate(all_namespaces):
        if ns[1] in document_namespaces.values():
            continue
        new_key = ns[0] if ns[0] else 'default%s' % i
        document_namespaces[new_key] = ns[1]

    return document_namespaces

def extract_urls(xml, mimetype='atom+xml'):
    return xml.xpath('//*[local-name()="Url" and (@*[local-name()="type"]="application/%(mimetype)s" or @*[local-name()="type"]="text/%(mimetype)s")]' % {'mimetype': mimetype})

def extract_template(url, append_limit=True):
    # get the base url from the template
    template_parts = urlparse.urlparse(url)
    
    if not template_parts.scheme:
        return '', '', {}, False
    
    base_url = urlparse.urlunparse((
        template_parts.scheme,
        template_parts.netloc,
        template_parts.path,
        None,
        None,
        None
    ))

    qp = {k: v[0] for k, v in urlparse.parse_qs(template_parts.query).iteritems()}

    # get the hard-coded params
    defaults = {k:v for k, v in qp.iteritems() 
            if not v.startswith('{') 
            and not v.endswith('}')}
    
    # a flag for some hard-coded response format type to manage
    # accept headers or no
    format_defined = len([v for k, v in defaults.iteritems() if 'atom' in v.lower() or 'rss' in v.lower()]) > 0

    # get the rest (and ignore the optional/namespaces)
    parameters = {k: v[1:-1] for k, v in qp.iteritems() 
            if v.startswith('{') 
            and v.endswith('}')}
    
    if append_limit:
        terms = extract_parameter_key('count', parameters)
        if terms:
            defaults = dict(
                chain(defaults.items(), {k: 5 for k in terms.keys()}.items())
            )
            
    # note: not everyone manages url-encoded query parameter delimiters
    #       and not everyone manages non-url-encoded values so yeah. we are
    #       ignoring the non-url-encoded group tonight.
    # return the base, defaults, parameters as dict
    return base_url, defaults, parameters, format_defined

def extract_parameter_key(value, params):
    # sort out the query parameter name for a parameter
    # and don't send curly bracketed things, please
    return {k: v.split(':')[-1].replace('?', '') for k, v 
                in params.iteritems() 
                if value in v}

def extract_parameter_defs(url_elem, defined_terms):
    # could just go with a namespace check but
    # namespaces are included and not used more
    # than i'd like. safety first.
    params = url_elem.xpath('*[local-name()="Parameter"]')
    
    output = {}
    for i, param in enumerate(params):
        pname = param.attrib.get('name', i)
        pval = param.attrib.get('value', '')
        poptions = param.xpath('*[local-name()="Option"]')
        options = [(o.attrib.get('value'), o.attrib.get('label')) for o in poptions]
        
        output[pname] = {
            "value": pval,
            "options": options
        }
    
    return output
    
    # and go crazy pedant with a) does each query param value
    # have a defined parameter element? (don't know if it should) 
    # and b) required or not parameters?
#     for p in params:
#         p_value = p.attrib.get('value', '')
#         if not p_value:
#             continue
        
#         qps = {k:v for k, v in defined_terms.iteritems() if p_value[1:-1] in v}
        
#         if not qps:
#             continue
            
def extract_query_terms(xml, param_name):
    # find a query element that contains an example
    # for the provided param_name (no namespace, no optional flag)
    example_queries = {}
    xp = '//*[local-name()="Query" and @*[local-name()="role"]="example"]/@*[local-name()="{0}"]'.format(param_name)
    try:
        example_queries = xml.xpath(xp)
    except:
        print 'failed example query: ', xp
        return []
    
    return example_queries

def extract_search_rels(xml):
#     application/opensearchdescription+xml
    for elem in xml.xpath('//*/*[local-name()="link" and (@*[local-name()="type"]="application/opensearchdescription+xml") and (@*[local-name()="rel"]="search" or @*[local-name()="rel"]="http://esipfed.org/ns/fedsearch/1.0/search#")]'):
        parent = next(iter(elem.getparent().xpath('*[local-name()="title"]')), None)
        yield {
            "link_url": elem.attrib.get('href', ''),
            "link_title": elem.attrib.get('title', ''),
            "link_type": elem.attrib.get('type', ''),
            "parent_title": parent.text if parent is not None else ''
        }
            

def extract_response_stats(xml):
    total = next(iter(xml.xpath('//*[local-name()="totalResults"]/text()')), 'Unknown')
    subset = next(iter(xml.xpath('//*[local-name()="itemsPerPage"]/text()')), 'Unknown')
    
    return subset, total

def execute_request(url, headers={}):
    try:
        req = requests.get(url, headers=headers)
    except:
        logger.error('\tSkipping connection issue\'s')
        return '-999', '', ''
    
    return req.status_code, req.content, req.headers

def parse_response(content, headers={}):
    output = {}
    
    # see if it has content, see if the xml parses, see if it's even xml
    if not content:
        return {'error': 'No content'}
    
    if 'html' in headers.get('content-type'):
        return {'error': 'HTML response'}

    try:
        xml = etree.fromstring(content)
    except:
        return {'error': 'XML Parse error'}

    subset, total = extract_response_stats(xml)
    
    # this would get us to some nested search
    # there is no guarantee it is dataset/granule!
    # or can be identified as such!
    search_rels = [e for e in extract_search_rels(xml)]
    
    output.update({
        'subset': subset,
        'total': total,
    })
    if search_rels is not None:
        output.update({'search_rels': search_rels})
        
    return output

def parse_osdd(osdd):
    # get the url template to test basic search
    #    get the parameter list (prefix:term)
    # get the parameter elements
    #    match to parameter list
    # get namespaces 
    output = {}
    
    output['namespaces'] = extract_namespaces(osdd)
    output['templates'] = []
    
    for extracted_elem in extract_urls(osdd):
        template_base, template_defaults, template_params, format_defined = extract_template(extracted_elem.attrib.get('template'))
        accept_type = extracted_elem.attrib.get('type', '')
        
        search_url = ''
        search_terms = extract_parameter_key('searchTerms', template_params)
    
        if search_terms:
            qps = dict(
                chain(
                    template_defaults.items(),
                    {search_terms.keys()[0]: ''}.items()
                )
            )
            search_url = template_base + '?' + urllib.urlencode(qps.items())
        
        example_url = ''
        example_terms = list(
            chain.from_iterable(
                [extract_query_terms(extracted_elem.getparent(), s) for s in search_terms.values()]
            )
        )
        if example_terms:
            qps = dict(
                chain(
                    template_defaults.items(),
                    {search_terms.keys()[0]: example_terms[0]}.items()
                )
            )
            example_url = template_base + '?' + urllib.urlencode(qps.items())
        
        output['templates'].append({
            'base': template_base,
            'defaults': template_defaults,
            'parameters': template_params,
            'format_definition': format_defined,
            'accept_type': accept_type,
            'search_url': search_url,  # empty searchTerms
            'example_url': example_url  # searchTerms w/ provided keywords
        })
    
    output['parameter_definitions'] = extract_parameter_defs(extracted_elem, template_params)
    
    # get the basic definition bits (keywords, name, etc)
    
    
    return output

In [3]:
# from doug, see notes re: uptime
cwic_links = [
    'http://dap.onc.uvic.ca/erddap/opensearch1.1/description.xml',
    'http://gcmd.gsfc.nasa.gov/KeywordSearch/default/openSearch.jsp?Portal=cwic',
    'http://podaac.jpl.nasa.gov/ws/search/dataset/osd.xml',
    # 'http://nsidc.org/api/opensearch/1.1/dataset/description',  # we're just not going to run this
    'http://ghrc.nsstc.nasa.gov/hydro/ghost.xml',
    'http://mirador.gsfc.nasa.gov/mirador_dataset_opensearch.xml',
    'http://eo-virtual-archive4.esa.int/search/ER02_SAR_RAW_0P/description',
    'http://www1.usgs.gov/erddap/opensearch1.1/description.xml',
    # 'http://bison.usgs.ornl.gov/doc/api.jsp',  # this is now a dead link
    # 'http://ceocat.ccrs.nrcan.gc.ca/opensearch_description_document.xml',  # this is 403 access forbidden
    # 'http://rs211980.rs.hosteurope.de/mule/os-description/',  # 503 service down
    'http://geo.spacebel.be/opensearch/description.xml',  # from the fedeo documentation page listed
    'http://lance-modis.eosdis.nasa.gov/user_services/dataset_opensearch.xml'
]

In [11]:
# to download the osdds
cwic_osdds = []

for cwic_link in cwic_links:
    print 'Downloading {0}'.format(cwic_link)

    req = requests.get(cwic_link)
    osdd = {
        'url': cwic_link,
        'status': req.status_code
    }
    if req.status_code != 200:
        print '\tFailed request'
        cwic_osdds.append(osdd)
        continue
    
    # just checking
    xml = etree.fromstring(req.content)
    osdd.update({'xml':req.content})
    cwic_osdds.append(osdd)
    
with open('outputs/cwic_osdds.json', 'w') as f:
    f.write(json.dumps(cwic_osdds, indent=4))
    

Downloading http://dap.onc.uvic.ca/erddap/opensearch1.1/description.xml
Downloading http://gcmd.gsfc.nasa.gov/KeywordSearch/default/openSearch.jsp?Portal=cwic
Downloading http://podaac.jpl.nasa.gov/ws/search/dataset/osd.xml
Downloading http://ghrc.nsstc.nasa.gov/hydro/ghost.xml
	Failed request
Downloading http://mirador.gsfc.nasa.gov/mirador_dataset_opensearch.xml
Downloading http://eo-virtual-archive4.esa.int/search/ER02_SAR_RAW_0P/description
Downloading http://www1.usgs.gov/erddap/opensearch1.1/description.xml
Downloading http://geo.spacebel.be/opensearch/description.xml
Downloading http://lance-modis.eosdis.nasa.gov/user_services/dataset_opensearch.xml


In [27]:
# to reload from disk for parsing, etc
with open('outputs/cwic_osdds.json', 'r') as f:
    cwic_osdds = json.loads(f.read())

In [31]:
for i, osdd in enumerate(cwic_osdds):
    if osdd.get('status') != 200:
        continue
        
    xml = etree.fromstring(osdd.get('xml').encode('utf-8'))
    parsed_osdd = parse_osdd(xml)
    
    #print parsed_osdd
    
    # try the two example queries
    for j, template in enumerate(parsed_osdd.get('templates', [])):
        accept_type = template.get('accept_type', '')
        headers = {'Accept': accept_type} if accept_type else {}
        example_url = template.get('example_url', '')
        search_url = template.get('search_url', '')
        
        if search_url:
            try:
                req = requests.get(search_url, headers=headers, timeout=15)
                ex = {
                    'status': req.status_code,
                    'has_content': req.content is not None
                }
                ex.update(parse_response(req.content, req.headers))
                template.update({'search_url_response': ex})
            except requests.exceptions.ReadTimeout:
                template.update({'search_url_response': {'status': 'timeout'}})
            
            
        if example_url:
            try:
                req = requests.get(example_url, headers=headers, timeout=15)
                ex = {
                    'status': req.status_code,
                    'has_content': req.content is not None
                }
                ex.update(parse_response(req.content, req.headers))
                template.update({'example_url_response': ex})
            except requests.exceptions.ReadTimeout:
                template.update({'example_url_response': {'status': 'timeout'}})
        parsed_osdd['templates'][j] = template
    
    osdd.update(parsed_osdd)
    cwic_osdds[i] = osdd

In [32]:
with open('outputs/cwic_osdds_extended.json', 'w') as f:
    f.write(json.dumps(cwic_osdds, indent=4))

In [35]:
# let's see what we get from the nested osdds
osdd = cwic_osdds[1]
template = osdd.get('templates')[0]
secondary_urls = template.get('example_url_response', {}).get('search_rels', [])

for secondary_url in secondary_urls:
    try:
        req = requests.get(secondary_url.get('link_url'), timeout=15)
    except Exception as ex:
        print 'error', ex
        continue
    if req.status_code != 200:
        print 'site error'
        continue
      
    xml = etree.fromstring(req.content.encode('utf-8'))
    parsed = parse_osdd(xml)
    
    print
    print secondary_url.get('parent_title'), secondary_url.get('link_url')
    print parsed



MODIS AQUA Clear Sky Radiance 8 Day Composite Daily L3 Global 25 Km Equal Area http://cwic.wgiss.ceos.org/opensearch/datasets/MYDCSR_8/osdd.xml?clientId=fromgcmd
{'templates': [{'search_url': '', 'parameters': {'count': 'count?', 'geoBox': 'geo:box?', 'timeStart': 'time:start?', 'timeEnd': 'time:end?', 'startIndex': 'startIndex?'}, 'example_url': '', 'format_definition': False, 'base': 'http://cwic.wgiss.ceos.org/opensearch/granules.atom', 'accept_type': 'application/atom+xml', 'defaults': {'count': 5, 'clientId': 'fromgcmd', 'datasetId': 'MYDCSR_8'}}, {'search_url': '', 'parameters': {'count': 'count?', 'geoBox': 'geo:box?', 'timeStart': 'time:start?', 'startPage': 'startPage?', 'timeEnd': 'time:end?'}, 'example_url': '', 'format_definition': False, 'base': 'http://cwic.wgiss.ceos.org/opensearch/granules.atom', 'accept_type': 'application/atom+xml', 'defaults': {'count': 5, 'clientId': 'fromgcmd', 'datasetId': 'MYDCSR_8'}}], 'namespaces': {'xml': 'http://www.w3.org/XML/1998/namespace