In [None]:
1. Can we generate a valid search URL from the URL Template provided?
    a. Given the time frame of the harvests, this has to be considered alongside linkrot in the OSDD: a failed search query may not mean the template is incorrect; it could just mean the service URL is no longer valid at all.
2. Can we identify best practices (uses esip spatial, uses the time namespace, uses parameter elements)?
3. Can we identify dataset/granule search? (I don't think we have any data to support this at all.)

Other info: can you identify the parent OSDD of a result set or of a nested OSDD? Can you work it out from the URL alone?

In [1]:
import requests
import json as js
import os
from lxml import etree
import urlparse
import urllib
from bs4 import BeautifulSoup
from itertools import chain
from datetime import datetime

# for pinging the database
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import *
from sqlalchemy import and_
from semproc.xml_utils import *
from mpp.models import Response

In [2]:
def extract_namespaces(xml):
    '''
    Collect the namespaces declared anywhere in the parsed document
    and return them as a {prefix: URI} dict.

    The unprefixed namespace in scope on the root element is stored
    under 'default'. An unprefixed namespace found deeper in the tree
    is stored under 'default<N>', N being its position in the
    document-wide namespace listing. A URI already present in the
    dict is not added again. Returns {} when xml is None.
    '''
    if xml is None:
        return {}

    # namespaces in scope on the root element
    namespaces = {}
    for prefix, uri in xml.xpath('/*/namespace::*'):
        namespaces[prefix] = uri
    if None in namespaces:
        namespaces['default'] = namespaces.pop(None)

    # then whatever is introduced by the child elements
    for position, declared in enumerate(xml.xpath('//namespace::*')):
        prefix, uri = declared[0], declared[1]
        if uri not in namespaces.values():
            namespaces[prefix or 'default%s' % position] = uri

    return namespaces

def extract_urls(xml, mimetype='atom+xml'):
    '''
    Return the Url elements (any depth, any namespace) whose type
    attribute is application/<mimetype> or text/<mimetype>.
    '''
    url_xpath = (
        '//*[local-name()="Url" and ('
        '@*[local-name()="type"]="application/%(mimetype)s" or '
        '@*[local-name()="type"]="text/%(mimetype)s")]'
    )
    return xml.xpath(url_xpath % {'mimetype': mimetype})

def extract_template(url, append_limit=True):
    '''
    Split an OpenSearch url template into its working parts.

    url: the template attribute of a Url element (may be None or empty)
    append_limit: when True, every templated `count` parameter is
        added to the defaults with a value of 5 to keep responses small

    Returns a tuple (base_url, defaults, parameters, format_defined):
        base_url: scheme + host + path, without the query string
        defaults: {query key: value} for the hard-coded query values
        parameters: {query key: term} for the templated values, with
            the curly brackets removed ('os:count?' for '{os:count?}')
        format_defined: True if a hard-coded value mentions atom or rss

    A missing template, or one without a scheme, returns
    ('', '', {}, False); note that defaults is an empty string, not a
    dict, in that case.
    '''
    # a Url element without a template attribute hands us None, which
    # urlparse cannot split
    if not url:
        return '', '', {}, False

    # get the base url from the template
    template_parts = urlparse.urlparse(url)

    if not template_parts.scheme:
        return '', '', {}, False

    base_url = urlparse.urlunparse((
        template_parts.scheme,
        template_parts.netloc,
        template_parts.path,
        '',
        '',
        ''
    ))

    # only the first value of a repeated query key is kept
    qp = {k: v[0] for k, v in urlparse.parse_qs(template_parts.query).items()}

    # get the hard-coded params
    # NOTE(review): a value with only one of the two brackets
    # ('prefix{searchTerms}') lands in neither defaults nor parameters;
    # confirm that dropping it is intended
    defaults = {k: v for k, v in qp.items()
                if not v.startswith('{')
                and not v.endswith('}')}

    # a flag for some hard-coded response format type to manage
    # accept headers or no
    format_defined = any(
        'atom' in v.lower() or 'rss' in v.lower() for v in defaults.values()
    )

    # get the rest (and ignore the optional/namespaces)
    parameters = {k: v[1:-1] for k, v in qp.items()
                  if v.startswith('{')
                  and v.endswith('}')}

    if append_limit:
        terms = extract_parameter_key('count', parameters)
        if terms:
            # ask for five items per page
            defaults = dict(defaults)
            defaults.update({k: 5 for k in terms})

    # note: not everyone manages url-encoded query parameter delimiters
    #       and not everyone manages non-url-encoded values so yeah. we are
    #       ignoring the non-url-encoded group tonight.
    # return the base, defaults, parameters as dict
    return base_url, defaults, parameters, format_defined

def extract_parameter_key(value, params):
    '''
    Find the query parameter name(s) bound to an OpenSearch term.

    value: the term to look for ('count', 'searchTerms'); a namespace
        prefix or optional marker on it is ignored
    params: {query key: template term} without the curly brackets,
        e.g. {'n': 'os:count?'}

    Returns {query key: bare term name} for every parameter whose term,
    once the namespace prefix and the optional '?' are removed, equals
    value. The comparison is exact so that 'count' does not also pick
    up 'geo:country' or similar.
    '''
    def bare_name(term):
        # drop the namespace prefix and the optional flag
        return term.split(':')[-1].replace('?', '')

    wanted = bare_name(value)

    matches = {}
    for key, term in params.items():
        name = bare_name(term)
        if name == wanted:
            matches[key] = name
    return matches

def extract_parameter_defs(url_elem, defined_terms):
    '''
    Describe the Parameter children of a Url element.

    Returns {name: {"value": ..., "options": [(value, label), ...]}}.
    A Parameter without a name attribute is keyed by its position
    among the Parameter children. defined_terms is accepted but not
    used.
    '''
    def describe(param_elem):
        choices = [
            (opt.attrib.get('value'), opt.attrib.get('label'))
            for opt in param_elem.xpath('*[local-name()="Option"]')
        ]
        return {
            "value": param_elem.attrib.get('value', ''),
            "options": choices
        }

    # matched on the local name rather than on the namespace, since the
    # extension namespaces get declared without being used consistently
    param_elems = url_elem.xpath('*[local-name()="Parameter"]')

    return {
        param_elem.attrib.get('name', position): describe(param_elem)
        for position, param_elem in enumerate(param_elems)
    }
            
def extract_query_terms(xml, param_name):
    '''
    Return the example values supplied for a search parameter.

    Looks for Query elements with role="example" and returns the
    values of their param_name attribute (no namespace, no optional
    flag in the name). If the xpath cannot be evaluated, the failing
    expression is printed and an empty list is returned.
    '''
    xp = '//*[local-name()="Query" and @*[local-name()="role"]="example"]/@*[local-name()="{0}"]'.format(param_name)
    try:
        return xml.xpath(xp)
    except Exception:
        # best effort: a bad expression should not stop the run, but a
        # keyboard interrupt should no longer be swallowed here
        print('failed example query:  %s' % xp)
        return []

def extract_search_rels(xml):
    '''
    Yield one dict per link element that points at an OpenSearch
    description document (type application/opensearchdescription+xml
    with rel "search" or the ESIP fedsearch search rel).

    Each dict carries the link's href, title and type attributes
    ('' when absent) plus the text of the first title element found
    among the link's siblings ('' when there is none).
    '''
    link_xpath = (
        '//*/*[local-name()="link" and '
        '(@*[local-name()="type"]="application/opensearchdescription+xml") and '
        '(@*[local-name()="rel"]="search" or '
        '@*[local-name()="rel"]="http://esipfed.org/ns/fedsearch/1.0/search#")]'
    )

    for link in xml.xpath(link_xpath):
        sibling_titles = link.getparent().xpath('*[local-name()="title"]')
        title_elem = next(iter(sibling_titles), None)
        parent_title = '' if title_elem is None else title_elem.text

        attrs = link.attrib
        yield {
            "link_url": attrs.get('href', ''),
            "link_title": attrs.get('title', ''),
            "link_type": attrs.get('type', ''),
            "parent_title": parent_title
        }
            

def extract_response_stats(xml):
    '''
    Read the paging figures out of a search response.

    Returns (itemsPerPage, totalResults) as the text of the first
    matching element of each kind, 'Unknown' standing in for an
    element that is not present.
    '''
    found = {}
    for element_name in ('totalResults', 'itemsPerPage'):
        matches = xml.xpath('//*[local-name()="%s"]/text()' % element_name)
        found[element_name] = next(iter(matches), 'Unknown')

    return found['itemsPerPage'], found['totalResults']

def execute_request(url, headers=None, timeout=15):
    '''
    GET a url and hand back the pieces of the response.

    url: address to request
    headers: optional request headers
    timeout: seconds to wait before giving up on the request

    Returns (status_code, content, headers) of the response. When the
    request cannot be completed the failure is logged and the sentinel
    ('-999', '', '') is returned instead of raising.
    '''
    import logging

    try:
        req = requests.get(url, headers=headers or {}, timeout=timeout)
    except Exception:
        # no `logger` is defined in the visible cells, so the module
        # logger is looked up here rather than relying on a global
        logging.getLogger(__name__).error('\tSkipping connection issue\'s')
        return '-999', '', ''

    return req.status_code, req.content, req.headers

def parse_response(content, headers=None, timeout=15):
    '''
    Summarise the body of a search response.

    content: raw response body
    headers: response headers (anything with a dict-style .get);
        may be omitted, and may lack a content-type entry
    timeout: seconds allowed for each nested description request

    Returns {'error': ...} when the body is empty, is served as html
    or does not parse as xml. Otherwise returns a dict with 'subset'
    (items per page) and 'total' (total results) and, when the
    response links to other description documents, 'search_rels': one
    dict per link with its request status and either the parsed nested
    description or an 'error' entry.
    '''
    # see if it has content, see if the xml parses, see if it's even xml
    if not content:
        return {'error': 'No content'}

    # the content-type header is not guaranteed to be there
    content_type = (headers or {}).get('content-type') or ''
    if 'html' in content_type.lower():
        return {'error': 'HTML response'}

    try:
        xml = etree.fromstring(content)
    except Exception:
        return {'error': 'XML Parse error'}

    subset, total = extract_response_stats(xml)

    # this would get us to some nested search
    # there is no guarantee it is dataset/granule!
    # or can be identified as such!
    search_rels = []
    for search_rel in list(extract_search_rels(xml)):
        try:
            # without a timeout the ReadTimeout branch below can never
            # fire and a stalled server would hang the whole run
            rsp = requests.get(search_rel.get('link_url'), timeout=timeout)
            search_rel.update({"status": rsp.status_code})
            try:
                nested_xml = etree.fromstring(rsp.content)
            except Exception:
                search_rel.update({"error": "invalid xml"})
            else:
                search_rel.update({
                    "content": rsp.content,
                    "response": parse_osdd(nested_xml)
                })
        except requests.exceptions.ReadTimeout:
            search_rel.update({"error": "timeout"})
        except requests.exceptions.ConnectionError:
            search_rel.update({"error": "connection error"})
        except Exception:
            search_rel.update({"error": "unspecified"})

        search_rels.append(search_rel)

    output = {
        'subset': subset,
        'total': total,
    }
    if search_rels:
        output.update({'search_rels': search_rels})

    return output

def _build_query_url(base, params):
    '''Append the url-encoded params to base; text is utf-8 encoded first.'''
    def as_utf8(item):
        # python 2's urlencode calls str() on every key and value,
        # which fails on non-ascii unicode text
        return item.encode('utf-8') if isinstance(item, type(u'')) else item

    pairs = [(as_utf8(k), as_utf8(v)) for k, v in params.items()]
    return base + '?' + urllib.urlencode(pairs)


def parse_osdd(osdd):
    '''
    Pull the testable parts out of a parsed OpenSearch description.

    Returns a dict with:
        namespaces: {prefix: URI} found in the document
        templates: one dict per atom or rss Url element holding the
            template base, hard-coded defaults, templated parameters,
            the declared type, the Parameter element definitions and
            three candidate urls:
                search_url: defaults plus an empty searchTerms value
                example_url: defaults plus the first example value
                    given for searchTerms in a Query element
                default_url: defaults only
            search_url and example_url are '' when they cannot be built
        has_title / has_desc / has_keywords / has_contact: whether the
            ShortName, Description, Tags and Contact children exist
    '''
    output = {}

    output['namespaces'] = extract_namespaces(osdd)
    output['templates'] = []

    # looking for resultset requests (atom or rss)
    for extracted_elem in extract_urls(osdd) + extract_urls(osdd, 'rss+xml'):
        # a Url element is not guaranteed to carry a template attribute
        template = extracted_elem.attrib.get('template') or ''
        template_base, template_defaults, template_params, format_defined = extract_template(template)
        accept_type = extracted_elem.attrib.get('type', '')

        search_terms = extract_parameter_key('searchTerms', template_params)
        # only the first searchTerms parameter is exercised
        search_key = next(iter(search_terms), None)

        search_url = ''
        if search_terms:
            qps = dict(template_defaults)
            qps[search_key] = ''
            search_url = _build_query_url(template_base, qps)

        example_url = ''
        example_terms = list(
            chain.from_iterable(
                [extract_query_terms(extracted_elem.getparent(), s) for s in search_terms.values()]
            )
        )
        if example_terms:
            qps = dict(template_defaults)
            qps[search_key] = example_terms[0]
            example_url = _build_query_url(template_base, qps)

        # extract_template returns '' instead of a dict for a template
        # it could not split; pass that through untouched
        if isinstance(template_defaults, dict):
            default_url = _build_query_url(template_base, template_defaults)
        else:
            default_url = template_defaults

        output['templates'].append({
            'base': template_base,
            'defaults': template_defaults,
            'parameters': template_params,
            'format_definition': format_defined,
            'accept_type': accept_type,
            'search_url': search_url,  # empty searchTerms
            'example_url': example_url,  # searchTerms w/ provided keywords
            'default_url': default_url,  # only default params, see cwic dataset osdds
            'param_defs': extract_parameter_defs(extracted_elem, template_params)
        })

    # get the basic definition bits (keywords, name, etc)
    output['has_title'] = len(osdd.xpath('*[local-name()="ShortName"]')) > 0
    output['has_desc'] = len(osdd.xpath('*[local-name()="Description"]')) > 0
    output['has_keywords'] = len(osdd.xpath('*[local-name()="Tags"]')) > 0
    output['has_contact'] = len(osdd.xpath('*[local-name()="Contact"]')) > 0

    return output

In [3]:
# getting the resultset (links and harvested osdd)
# The CTE unnests each non-null identities.identity json array into one
# row per element; the outer select joins those rows to responses and
# keeps the ones whose element has protocol 'OpenSearch' and service
# name 'OpenSearchDescription', returning the response id, its source
# url and its cleaned content.
sketchy_sql = '''with i
as (
    select d.response_id, jsonb_array_elements(d.identity::jsonb) ident
    from identities d
    where d.identity is not null
)

select r.id, r.source_url, r.cleaned_content
from responses r join i on i.response_id = r.id
where i.ident->>'protocol' = 'OpenSearch' 
    and i.ident#>>'{service,name}' = 'OpenSearchDescription';
'''

In [7]:
# read the postgres connection settings from the local conf file
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# engine for the configured connection string and a session bound to it
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# just for resets: rolls back whatever transaction the session
# currently has open (run by hand when needed)
session.rollback()

In [8]:
# run through the osdd resultset: execute the identity query held in
# sketchy_sql; the rows are consumed by the validation loop that follows
result = session.execute(sketchy_sql)

In [9]:
# Validate every harvested osdd: parse it, check the source url for
# linkrot, try the urls built from each template and write the findings
# to one json file per response id. Responses that already have an
# output file are skipped, so the cell can be re-run to resume.
REQUEST_TIMEOUT = 15  # seconds, applied to every request below
OUTPUT_DIR = 'outputs/osdds'

if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

for r in result:
    output_path = os.path.join(OUTPUT_DIR, '{0}.json'.format(r['id']))
    if os.path.exists(output_path):
        continue
    try:
        xml = etree.fromstring(r['cleaned_content'].encode('utf-8'))
    except Exception:
        # missing or unparseable harvested content: nothing to validate
        continue

    parsed_osdd = parse_osdd(xml)
    parsed_osdd.update({
        "response_id": r['id'],
        "source_url": r['source_url'],
        "validated_on": datetime.now().isoformat()
    })

    # run a quick linkrot check
    # note that this doesn't compare the
    # harvested response with the currently available one
    url = r['source_url']
    try:
        rsp = requests.head(url, timeout=REQUEST_TIMEOUT)
        parsed_osdd.update({"status": rsp.status_code})
    except Exception:
        parsed_osdd.update({"error": "linkrot error"})

    for template in parsed_osdd.get('templates', []):
        accept_type = template.get('accept_type', '')
        headers = {'Accept': accept_type} if accept_type else {}

        template_responses = []
        for url_type in ['search', 'example', 'default']:
            url = template.get('%s_url' % url_type, '')
            if not url:
                continue

            try:
                req = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                ex = {
                    'status': req.status_code,
                    # req.content is never None, so test for an empty body
                    'has_content': bool(req.content)
                }
                ex.update(parse_response(req.content, req.headers))
            except requests.exceptions.ReadTimeout:
                ex = {'error': 'timeout'}
            except requests.exceptions.ConnectionError:
                ex = {'error': 'connection error'}
            except requests.exceptions.RequestException as err:
                # invalid url, too many redirects, ...: record it and
                # move on instead of aborting the whole run
                ex = {'error': 'request error: %s' % type(err).__name__}

            template_responses.append({'%s_response' % url_type: ex})

        # template is the dict held in parsed_osdd['templates'], so
        # updating it in place is enough
        template.update({"responses": template_responses})

    # serialise before opening the file so a failed dump does not leave
    # an empty file behind that later runs would skip
    serialized = js.dumps(parsed_osdd, indent=4)
    with open(output_path, 'w') as g:
        g.write(serialized)
        

Now that we have most of what we're interested in, let's put it in the RDS.

In [10]:
import glob
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy import (
    MetaData,
    Column,
    String,
    Integer,
    Boolean,
    DateTime,
)
from sqlalchemy.dialects.postgresql import *
from sqlalchemy.ext.declarative import declarative_base
# declarative base that the table model below inherits from
Base = declarative_base()


class OSDD(Base):
    '''
    Row of the 'osdds' table: the validation summary of one harvested
    OpenSearch description document, as loaded from the json files
    under outputs/osdds.
    '''
    __tablename__ = 'osdds'
    id = Column(Integer, primary_key=True)
    # status recorded by the linkrot check ('status' in the json)
    status_code = Column(Integer)
    # 'source_url' in the json
    url = Column(String)
    # presence flags for the descriptive elements
    # (has_description is loaded from 'has_desc')
    has_title = Column(Boolean)
    has_description = Column(Boolean)
    has_contact = Column(Boolean)
    has_keywords = Column(Boolean)
    # 'templates' list from the json, stored as-is
    url_templates = Column(JSON)
    # 'namespaces' dict from the json
    namespaces = Column(JSON)
    # 'validated_on' in the json
    date_verified = Column(DateTime)
    # 'error' in the json, if any
    error=Column(String)
    # id of the harvested response the document came from
    response_id = Column(Integer)
    


In [11]:
# read the postgres connection settings from the local conf file
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# the validation documents to load
files = glob.glob('outputs/osdds/*.json')

# our connection: engine plus a session bound to it
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# Load the validation documents into the osdds table, one commit per
# file so a rejected row does not take the others down with it.
# NOTE(review): the first five files are skipped by the slice below;
# presumably they were inserted in an earlier run - confirm before
# re-running against an empty table.
for f in files[5:]:
    with open(f, 'r') as g:
        data = js.load(g)

    osdd = OSDD(
        status_code=data.get('status'),
        url=data.get("source_url"),
        has_title=data.get('has_title'),
        has_description=data.get('has_desc'),
        has_contact=data.get('has_contact'),
        has_keywords=data.get('has_keywords'),
        url_templates=data.get('templates', []),
        namespaces=data.get('namespaces', {}),
        response_id=data.get('response_id'),
        # NOTE(review): this is the isoformat string written by the
        # validation step, not a datetime object
        date_verified=data.get('validated_on'),
        error=data.get('error')
    )

    session.add(osdd)
    try:
        session.commit()
    except Exception as ex:
        # keep going, but say why the row was rejected
        print('commit failed %s: %s' % (f, ex))
        session.rollback()

Some notes about the stats.


Basic namespace use:

| Namespace                                                                                    | Frequency | 
|----------------------------------------------------------------------------------------------|-----------| 
| (alf,http://www.alfresco.org)                                                                | 1         | 
| (alien,http://alien.jrc.ec.europa.eu/species)                                                | 1         | 
| (atom,http://www.w3.org/2005/Atom)                                                           | 1         | 
| (bvh,http://www.hospitalsdatabase.lshtm.ac.uk/opensearch/)                                   | 1         | 
| (chronam,http://chroniclingamerica.loc.gov)                                                  | 2         | 
| (custom,http://example.com/opensearchextensions/1.0/)                                        | 1         | 
| (dc,http://purl.org/dc/elements/1.1/)                                                        | 6         | 
| (dclite4g,http://xmlns.com/2008/dclite4g#)                                                   | 1         | 
| (dct,http://purl.org/dc/terms/)                                                              | 1         | 
| (default,http://a9.com/-/spec/opensearch/1.1/)                                               | 4608      | 
| (eop,http://www.genesi-dr.eu/spec/opensearch/extensions/eop/1.0/)                            | 1         | 
| (eum,http://a9.com/-/opensearch/extensions/eumetsat/1.0/)                                    | 1         | 
| (geo,http://a9.com/-/opensearch/extensions/geo/1.0/)                                         | 68        | 
| (ical,http://www.w3.org/2002/12/cal/ical#)                                                   | 1         | 
| (ie,http://schemas.microsoft.com/Search/2008/)                                               | 4         | 
| (MODAPSParameters,http://modwebsrv.modaps.eosdis.nasa.gov/opensearchextensions/1.0/)         | 1         | 
| (moz,http://www.mozilla.org/2006/browser/search/)                                            | 1049      | 
| (moz,http:/www.mozilla.org/2006/browser/search/)                                             | 10        | 
| (mozilla,http://www.mozilla.org/2006/browser/search/)                                        | 4         | 
| (mp,http://modwebsrv.modaps.eosdis.nasa.gov/opensearchextensions/1.0/)                       | 8         | 
| (nsidc,http://nsidc.org/ns/opensearch/1.1/)                                                  | 7         | 
| (opensearch,http://a9.com/-/spec/opensearch/1.1/)                                            | 5         | 
| (opensearchgeo,http://a9.com/-/opensearch/extensions/geo/1.0/)                               | 1         | 
| (os,http://a9.com/-/spec/opensearch/1.1/)                                                    | 1         | 
| (parameters,http://a9.com/-/spec/opensearch/extensions/parameters/1.0/)                      | 6         | 
| (podaac,http://podaac.jpl.nasa.gov/opensearch/)                                              | 1         | 
| (py,http://genshi.edgewall.org/)                                                             | 2         | 
| (rdf,http://www.w3.org/1999/02/22-rdf-syntax-ns#)                                            | 1         | 
| (referrer,http://a9.com/-/opensearch/extensions/referrer/)                                   | 8         | 
| (sar,http://earth.esa.int/sar)                                                               | 1         | 
| (sru,http://a9.com/-/opensearch/extensions/sru/2.0/)                                         | 1         | 
| (suggestions,http://www.opensearch.org/specifications/opensearch/extensions/suggestions/1.1) | 1         | 
| (time,http://a9.com/-/opensearch/extensions/time/1.0/)                                       | 32        | 
| (wcapi,http://www.worldcat.org/devnet/wiki/SearchAPIDetails)                                 | 2         | 
| (ws,http://dclite4g.xmlns.com/ws.rdf#)                                                       | 1         | 
| (xml,http://www.w3.org/XML/1998/namespace)                                                   | 4608      | 
| (xsd,http://www.w3.org/2001/XMLSchema)                                                       | 19        | 
| (xsi,http://www.w3.org/2001/XMLSchema-instance)                                              | 19        | 