1. Can we generate a valid search URL from the URL Template provided?
    a. Given the time frame of the harvests, this has to be considered alongside link rot in the OSDD (a failed search query may not mean the template is incorrect; it could just mean the service URL is no longer valid at all).
2. Can we identify best practices (uses the ESIP spatial extension, uses the time namespace, uses Parameter elements)?
3. Dataset/granule search (I don't think we have any data to support this at all).

Other info - can you identify the parent OSDD of a result set or of a nested OSDD? Can you grok it from the URL only?

In [1]:
import requests
import json
from lxml import etree
import urlparse
import urllib
from bs4 import BeautifulSoup
from itertools import chain

In [2]:
def extract_namespaces(xml):
    '''
    Collect every namespace declared in the document as a
    {prefix: URI} dict.

    The root element's namespaces are taken first (an unprefixed one
    is stored under 'default'). Namespaces that only show up further
    down the tree are added afterwards; an unprefixed one found there
    is stored under 'default<position>' where position is its index
    in the full namespace node list.

    Returns an empty dict when xml is None.
    '''
    if xml is None:
        return {}

    namespaces = dict(xml.xpath('/*/namespace::*'))
    if None in namespaces:
        # the default namespace has no prefix, give it a usable key
        namespaces['default'] = namespaces.pop(None)

    # pick up anything declared below the root that we haven't seen
    for position, (prefix, uri) in enumerate(xml.xpath('//namespace::*')):
        if uri in namespaces.values():
            continue
        key = prefix or 'default%s' % position
        namespaces[key] = uri

    return namespaces

def extract_urls(xml, mimetype='atom+xml'):
    '''
    Return the Url elements (matched by local name, so any namespace)
    whose type attribute is application/<mimetype> or text/<mimetype>.
    '''
    url_xpath = '//*[local-name()="Url" and (@*[local-name()="type"]="application/%(mimetype)s" or @*[local-name()="type"]="text/%(mimetype)s")]'
    return xml.xpath(url_xpath % {'mimetype': mimetype})

def extract_template(url, append_limit=True):
    '''
    Split an OpenSearch url template into its reusable pieces.

    url: the template string (the Url element's template attribute).
        None, an empty string or a template without a scheme all
        produce the "nothing usable" result ('', '', {}, False).
    append_limit: when True, every query parameter templated with a
        'count' term is added to the defaults with a value of 5 so
        test searches stay small.

    Returns a tuple of
        base_url: scheme + host + path, no query string
        defaults: {query param: value} for the hard-coded parameters
        parameters: {query param: term} for the {templated} values,
            curly brackets removed (prefix and '?' flag are kept)
        format_defined: True if a hard-coded value mentions atom or
            rss, i.e. the response format is pinned in the url and
            no Accept header is needed
    '''
    # a Url element without a template attribute hands us None,
    # which urlparse can't deal with
    if not url:
        return '', '', {}, False

    # get the base url from the template
    template_parts = urlparse.urlparse(url)

    if not template_parts.scheme:
        return '', '', {}, False

    base_url = urlparse.urlunparse((
        template_parts.scheme,
        template_parts.netloc,
        template_parts.path,
        '',
        '',
        ''
    ))

    # parse_qs returns lists, only the first value of each key is used
    qp = {k: v[0] for k, v in urlparse.parse_qs(template_parts.query).items()}

    # get the hard-coded params
    defaults = {k: v for k, v in qp.items()
                if not v.startswith('{')
                and not v.endswith('}')}

    # a flag for some hard-coded response format type to manage
    # accept headers or no (checked before the integer count
    # defaults are added below)
    format_defined = any(
        'atom' in v.lower() or 'rss' in v.lower()
        for v in defaults.values()
    )

    # get the rest (and ignore the optional/namespaces)
    parameters = {k: v[1:-1] for k, v in qp.items()
                  if v.startswith('{')
                  and v.endswith('}')}

    if append_limit:
        for key in extract_parameter_key('count', parameters):
            defaults[key] = 5

    # note: not everyone manages url-encoded query parameter delimiters
    #       and not everyone manages non-url-encoded values so yeah. we are
    #       ignoring the non-url-encoded group tonight.
    # return the base, defaults, parameters as dict
    return base_url, defaults, parameters, format_defined

def extract_parameter_key(value, params):
    '''
    Sort out which query parameter name(s) carry a given template term.

    value: the term to look for (ex: 'searchTerms'), matched as a
        substring of each templated value, and don't send curly
        bracketed things, please
    params: {query param: term} mapping as returned by extract_template

    Returns {query param: term} for the matches, with the namespace
    prefix and the optional '?' flag removed from the term.
    '''
    return {
        name: term.split(':')[-1].replace('?', '')
        for name, term in params.items()
        if value in term
    }

def extract_parameter_defs(url_elem, defined_terms):
    '''
    Walk the Parameter children of a Url element and line each one
    up with the templated query parameters in defined_terms.

    Matching is on local name rather than namespace: namespaces are
    included and not used more than i'd like. safety first.

    NOTE: unfinished. The matches are worked out and then dropped,
    so this always returns None. The open questions are a) does each
    query param value have a defined Parameter element (don't know if
    it should) and b) required or not parameters.
    '''
    for param_elem in url_elem.xpath('*[local-name()="Parameter"]'):
        templated_value = param_elem.attrib.get('value', '')
        if not templated_value:
            continue

        # drop the first and last character (the curly brackets)
        term = templated_value[1:-1]
        matches = {k: v for k, v in defined_terms.iteritems() if term in v}

        if not matches:
            continue
            
def extract_query_terms(xml, param_name):
    '''
    Find the example values for a search parameter.

    Looks for Query elements with role="example" (matched by local
    name) and returns the values of their param_name attribute.

    xml: the element to search from
    param_name: the bare parameter name (no namespace, no optional flag)

    Returns the xpath result, or an empty list if the lookup fails
    for any reason (the failed xpath is printed).
    '''
    xp = '//*[local-name()="Query" and @*[local-name()="role"]="example"]/@*[local-name()="{0}"]'.format(param_name)
    try:
        return xml.xpath(xp)
    except Exception:
        # best effort: report and carry on, but let KeyboardInterrupt
        # and SystemExit through. Single-argument parenthesised print
        # emits the same text as the python 2 statement form did.
        print('failed example query:  %s' % xp)
        return []

def extract_search_rel(xml):
    '''
    Return the first link element directly under the root whose rel
    is "search" or the esip fedsearch search rel, or None if there
    isn't one.
    '''
    search_links = xml.xpath('/*/*[local-name()="link" and (@*[local-name()="rel"]="search" or @*[local-name()="rel"]="http://esipfed.org/ns/fedsearch/1.0/search#")]')
    if not search_links:
        return None
    return search_links[0]

def extract_item_links(xml):
    '''
    Return the link children of every entry or item element
    (item or entry links from a secondary search).
    '''
    link_xpath = '//*[local-name()="entry" or local-name()="item"]/*[local-name()="link"]'
    return xml.xpath(link_xpath)

def extract_response_stats(xml):
    '''
    Pull the result counts out of a search response.

    Returns (subset, total):
        subset: the itemsPerPage text, or the number of entry/item
            elements in the response when that is missing
        total: the totalResults text, or 'Unknown'
    '''
    total_texts = xml.xpath('//*[local-name()="totalResults"]/text()')
    per_page_texts = xml.xpath('//*[local-name()="itemsPerPage"]/text()')

    total = total_texts[0] if total_texts else 'Unknown'
    subset = per_page_texts[0] if per_page_texts else 'Unknown'

    if subset == 'Unknown':
        # nothing declared, so count what actually came back
        subset = len(xml.xpath('//*[local-name()="entry" or local-name()="item"]'))

    return subset, total

def generate_requests(url_elem):
    '''
    Build the test search urls for a Url element.

    Up to two urls are generated, both only when the template has a
    searchTerms parameter: one with an empty search term and one using
    the first example query found in the parent of the Url element.

    Returns (search_urls, headers) where headers carries an Accept
    header (the Url element's type) unless the template already
    hard-codes the response format.
    '''
    # rebuild the url(s)
    template = url_elem.attrib.get('template')
    url_base, defaults, params, format_defined = extract_template(template)

    headers = {}
    if not format_defined:
        headers = {'Accept': url_elem.attrib.get('type', '')}

    search_urls = []
    search_terms = extract_parameter_key('searchTerms', params)
    if not search_terms:
        return search_urls, headers

    term_key = search_terms.keys()[0]

    def build_url(term_value):
        # hard-coded params plus the one search term
        qps = dict(chain(defaults.items(), [(term_key, term_value)]))
        return url_base + '?' + urllib.urlencode(qps.items())

    # the empty search
    search_urls.append(build_url(''))

    # and an example search, if the document provides one
    parent = url_elem.getparent()
    query_examples = [
        example
        for term in search_terms.values()
        for example in extract_query_terms(parent, term)
    ]
    if query_examples:
        search_urls.append(build_url(query_examples[0]))

    return search_urls, headers

def execute_request(url, headers=None):
    '''
    GET a url and hand back the pieces of the response we care about.

    url: the url to request
    headers: optional dict of request headers

    Returns (status_code, content, headers) from the response. If the
    request itself fails (requests raises a RequestException) the
    failure is logged and ('-999', '', '') is returned instead.
    '''
    # no logger is set up anywhere in the notebook, so get one here
    import logging

    try:
        req = requests.get(url, headers=headers or {})
    except requests.exceptions.RequestException:
        logging.getLogger(__name__).error('\tSkipping connection issue\'s')
        return '-999', '', ''

    return req.status_code, req.content, req.headers

def parse_response(content, headers=None):
    '''
    See if a response has content, see if it's even xml, see if
    the xml parses.

    content: the response body
    headers: optional mapping of response headers; only
        'content-type' is read and it may be missing

    Returns {'xml': <parsed root element>} on success, otherwise
    {'error': <reason>} where reason is 'No content',
    'HTML response' or 'XML Parse error'.
    '''
    if not content:
        return {'error': 'No content'}

    # the header may be absent (or no headers given at all); media
    # types are case-insensitive so compare in lowercase
    content_type = (headers or {}).get('content-type') or ''
    if 'html' in content_type.lower():
        return {'error': 'HTML response'}

    try:
        xml = etree.fromstring(content)
    except Exception:
        # anything the parser objects to counts as a parse error
        return {'error': 'XML Parse error'}

    return {'xml': xml}

def parse_osdd(osdd):
    '''
    Summarize an OpenSearch description document.

    For every matching Url element (see extract_urls):
        get the url template to test basic search
            get the parameter list (prefix:term)
        get the parameter elements
            match to parameter list
    and get the namespaces of the document.

    Returns a dict with
        'namespaces': {prefix: URI}
        'templates': a list with one dict per Url element holding
            'base', 'defaults', 'parameters', 'format_definition'
            and 'accept_type'
    '''
    output = {}

    output['namespaces'] = extract_namespaces(osdd)
    output['templates'] = []

    for extracted_elem in extract_urls(osdd):
        template_base, template_defaults, template_params, format_defined = extract_template(extracted_elem.attrib.get('template'))
        accept_type = extracted_elem.attrib.get('type', '')

        output['templates'].append({
            'base': template_base,
            'defaults': template_defaults,
            'parameters': template_params,
            'format_definition': format_defined,
            'accept_type': accept_type
        })

        # run the parameter element check per Url element; doing it
        # after the loop only covered the last element and blew up
        # with a NameError when there were no Url elements at all.
        # nothing is returned from it yet, so nothing is stored.
        extract_parameter_defs(extracted_elem, template_params)

    return output

In [3]:
# OpenSearch description document urls to run through the harvest loop.
# list is from doug, see notes re: uptime. entries that are commented
# out are skipped for the reason noted next to each one.
cwic_links = [
    'http://dap.onc.uvic.ca/erddap/opensearch1.1/description.xml',
    'http://gcmd.gsfc.nasa.gov/KeywordSearch/default/openSearch.jsp?Portal=cwic',
    'http://podaac.jpl.nasa.gov/ws/search/dataset/osd.xml',
    # 'http://nsidc.org/api/opensearch/1.1/dataset/description',  # we're just not going to run this
    'http://ghrc.nsstc.nasa.gov/hydro/ghost.xml',
    'http://mirador.gsfc.nasa.gov/mirador_dataset_opensearch.xml',
    'http://eo-virtual-archive4.esa.int/search/ER02_SAR_RAW_0P/description',
    'http://www1.usgs.gov/erddap/opensearch1.1/description.xml',
    # 'http://bison.usgs.ornl.gov/doc/api.jsp',  # this is now a dead link
    # 'http://ceocat.ccrs.nrcan.gc.ca/opensearch_description_document.xml',  # this is 403 access forbidden
    # 'http://rs211980.rs.hosteurope.de/mule/os-description/',  # 503 service down
    'http://geo.spacebel.be/opensearch/description.xml',  # from the fedeo documentation page listed
    'http://lance-modis.eosdis.nasa.gov/user_services/dataset_opensearch.xml'
]

In [4]:
# For each description document: fetch it, find the Url templates,
# work out which query parameter carries searchTerms and list the
# example queries the document offers for it.
for cwic_link in cwic_links:
    print 'Processing {0}'.format(cwic_link)

    # NOTE(review): no exception handling or timeout here, a connection
    # failure stops the whole loop
    req = requests.get(cwic_link)
    if req.status_code != 200:
        # anything but a 200 is reported and skipped
        print '\tFailed request'
        continue

    # NOTE(review): raises if the body is not well-formed xml
    xml = etree.fromstring(req.content)

    # Url elements typed as application/atom+xml or text/atom+xml
    # (the extract_urls default)
    extracted_url_elems = extract_urls(xml)

    for extracted_elem in extracted_url_elems:
        # only template_params is used below; the other values are unpacked
        # but not read in this cell
        template_base, template_defaults, template_params, format_defined = extract_template(extracted_elem.attrib.get('template'))
        accept_type = extracted_elem.attrib.get('type', '')

        # let's see if the url includes some enumerations as Parameter children
        # parameter_definitions = extract_parameter_defs(extracted_elem, template_params)

        # {query param name: 'searchTerms'} for the templated search parameter(s)
        catalog_params = extract_parameter_key('searchTerms', template_params)
        print '\tCatalog searchTerms: ', catalog_params

        # no searchTerms in this template, nothing to look up examples for
        if not catalog_params:
            continue

        # example values are searched for across the whole document,
        # flattened into a single list
        query_examples = list(chain.from_iterable([extract_query_terms(xml, p) for p in catalog_params.values()]))

        print '\tExample queries: ', query_examples

Processing http://dap.onc.uvic.ca/erddap/opensearch1.1/description.xml
	Catalog searchTerms:  {'searchTerms': 'searchTerms'}
	Example queries:  ['temperature']
Processing http://gcmd.gsfc.nasa.gov/KeywordSearch/default/openSearch.jsp?Portal=cwic
	Catalog searchTerms:  {'searchTerms': 'searchTerms'}
	Example queries:  ['modis']
Processing http://podaac.jpl.nasa.gov/ws/search/dataset/osd.xml
	Catalog searchTerms:  {'keyword': 'searchTerms'}
	Example queries:  []
Processing http://ghrc.nsstc.nasa.gov/hydro/ghost.xml
	Failed request
Processing http://mirador.gsfc.nasa.gov/mirador_dataset_opensearch.xml
	Catalog searchTerms:  {'keyword': 'searchTerms'}
	Example queries:  ['ozone', 'Surface Air Temperature']
Processing http://eo-virtual-archive4.esa.int/search/ER02_SAR_RAW_0P/description
	Catalog searchTerms:  {'q': 'searchTerms'}
	Example queries:  []
Processing http://www1.usgs.gov/erddap/opensearch1.1/description.xml
	Catalog searchTerms:  {'searchTerms': 'searchTerms'}
	Example queries: 