In [1]:
import os
import sys
from lxml import etree
from HTMLParser import HTMLParser
import urlparse
import urllib
import re


In [2]:
### UTILS

def generate_localname_xpath(tags):
    unchangeds = ['*', '..', '.', '//*']
    return '/'.join(
        ['%s*[local-name()="%s"]' % ('@' if '@' in t else '', t.replace('@', ''))
         if t not in unchangeds else t for t in tags])


def extract_attrib(elem, tags):
    e = extract_elem(elem, tags)
    return e.strip() if e else ''


def extract_attribs(elem, tags):
    e = extract_elem(elem, tags)
    return [m.strip() for m in e]


def extract_item(elem, tags):
    e = extract_elem(elem, tags)
    return e.text.strip() if e is not None and e.text else ''


def extract_items(elem, tags):
    es = extract_elems(elem, tags)
    return [e.text.strip() for e in es if e is not None and e.text]


def extract_elems(elem, tags):
    xp = generate_localname_xpath(tags)
    return elem.xpath(xp)


def extract_elem(elem, tags):
    xp = generate_localname_xpath(tags)
    return next(iter(elem.xpath(xp)), None)


def unquote(url):
    return urllib.unquote(url)

def parse_url(url):
    '''
    strip out the query parameters
    '''
    if not url:
        return ''
    parsed_url = urlparse.urlparse(url)
    return urlparse.parse_qs(parsed_url.query)

def tidy_dict(items):
    # cleanup a dict (remove empty elements)
    # but only at the single depth
    to_remove = []
    for k, v in items.iteritems():
        if not v:
            to_remove.append(k)
    for k in to_remove:
        del items[k]

    return items

def remap_http_method(original_method):
        '''
        return the "full" http method from some input
        '''
        definition = {
            "HTTP GET": ['get'],
            "HTTP POST": ['post']
        }
        for k, v in definition.iteritems():
            if original_method.lower() in v:
                return k
        return original_method

In [3]:
### CLASSES

class BasicParser():
    '''
    not concerned about namespaces or querying

    note: these could merge at some point
    '''
    def __init__(self, text):
        try:
            self.text = text.encode('unicode_escape')
        except UnicodeDecodeError:
            # TODO: this should be somewhere else and also maybe not this
            self.text = text.decode('utf-8', 'replace').encode('unicode_escape')
        self.parser = etree.XMLParser(
            remove_blank_text=True,
            remove_comments=True,
            recover=True,
            remove_pis=True,
            ns_clean=True
        )
        self._parse()
        self._extract_namespaces()

    def _parse(self):
        try:
            self.xml = etree.fromstring(self.text, parser=self.parser)
        except Exception as ex:
            print ex
            raise ex

    def _extract_namespaces(self):
        '''
        Pull all of the namespaces in the source document
        and generate a list of tuples (prefix, URI) to dict
        '''
        if self.xml is None:
            self.namespaces = {}
            return

        document_namespaces = dict(self.xml.xpath('/*/namespace::*'))
        if None in document_namespaces:
            document_namespaces['default'] = document_namespaces[None]
            del document_namespaces[None]

        # now run through any child namespace issues
        all_namespaces = self.xml.xpath('//namespace::*')
        for i, ns in enumerate(all_namespaces):
            if ns[1] in document_namespaces.values():
                continue
            new_key = ns[0] if ns[0] else 'default%s' % i
            document_namespaces[new_key] = ns[1]

        self.namespaces = document_namespaces
        

    


In [55]:
### the classes we care about from a routing perspective

class Processor(object):
    '''
    where routes is the tag sets to run as namespace-free
    xpath. the service, metadata and dataset keys are the dicts of
    tag lists (in case we have different locations for y) and the
    resultset list is the tag list to the result children
    '''

    def __init__(self, identify, response, url, parent_url=''):
        self.response = response
        self.url = url
        self.identify = identify
        self.parent_url = parent_url

        self._load_xml()

    def parse(self):
        pass
    
    def parse_children(self, elem=None, tags=[]):
        '''
        where elem = the parent node for the set and
        tags is the un-namespaced list of explicit items
        to parse or, if not specified, run the children
        one level down
        '''
        elem = self.parser.xml if elem is None else elem
        children = []
        if tags:
            children = extract_elems(elem, tags)
        else:
            children = [child for child in elem.iterchildren()]

        for child in children:
            parsed = self._parse_child(child)
            if parsed:
                yield parsed
        

    def _load_xml(self):
        self.parser = BasicParser(self.response)
    
    def _parse_child(self, child):
        pass

class OpenSearchReader(Processor):
    def __init__(self, identify, response, url, parent_url=''):
        self.response = response
        self.url = url
        self.identify = identify
        self.parent_url = parent_url
        
        self._load_xml()
        
    def parse(self):
        self.description = {}
        
        if self.parent_url:
            # TODO: consider making this a sha
            self.description['childOf'] = self.parent_url
        
        if 'service' in self.identify:
            self.description['service'] = self._parse_service()
            
        if 'resultset' in self.identify:
            # TODO: get the root stats
            self.description['children'] = self._parse_children(self.identify['resultset'].get('dialect', ''))
        
        self.description = tidy_dict(self.description)

    def _parse_service(self):
        output = {}
        output['title'] = extract_items(self.parser.xml, ["ShortName"])        
        output['abstract'] = extract_items(self.parser.xml, ["LongName"]) + \
            extract_items(self.parser.xml, ["Description"])
        output['source'] = extract_items(self.parser.xml, ["Attribution"])
        output['contact'] = extract_items(self.parser.xml, ["Developer"])
        output['rights'] = extract_items(self.parser.xml, ["SyndicationRight"])
        output['subject'] = extract_items(self.parser.xml, ["Tags"])
        
        output['endpoints'] = [self._parse_endpoint(e) for e in extract_elems(self.parser.xml, ['Url'])]
        
        return tidy_dict(output)
        
    def _parse_endpoint(self, elem):
        endpoint = {}
        endpoint['protocol'] = remap_http_method(elem.get('type', ''))
        endpoint['template'] = elem.get('template', '')
        endpoint['parameters'] = self._extract_params(elem)
        endpoint['actionable'] = 'NOPE'
        endpoint['url'] = ''
        
        return tidy_dict(endpoint)
        
    def _parse_children(self, dialect):
        ''' i fundamentally do not like this '''
        output = {}
        
        if dialect == 'ATOM':
            reader = OpenSearchAtomReader(None, self.response, self.url)
        elif dialect == 'RSS':
            reader = OpenSearchRssReader(None, self.response, self.url)
        return reader.parse()
    
    def _extract_params(self, endpoint):
        def _extract_prefix(param):
            pattern = '\{{0,1}(\S*):([\S][^}]*)'

            # TODO: this is probably a bad assumption (that there's just the
            #   one item in the list, not that urlparse returns the terms as a list)
            if isinstance(param, list):
                param = param[0]

            if ':' not in param:
                return ('', param)

            m = re.search(pattern, param)
            return m.groups()
        
        _parameter_formats = {
            "geo:box": "west, south, east, north",
            "time:start": "YYYY-MM-DDTHH:mm:ssZ",
            "time:stop": "YYYY-MM-DDTHH:mm:ssZ"
        }
        url = endpoint.get('template', '')
        query_params = parse_url(url)
        
        # deal with the namespaced parameters as [query param key, prefix, type]
        query_params = [[k] + list(_extract_prefix(v)) for k, v
                        in query_params.iteritems()]

        return [
            tidy_dict({
                "name": qp[0],
                "prefix": qp[1],
                "type": qp[2],
                "format": _parameter_formats.get(':'.join(qp[1:]))
                })
                for qp in query_params
            ]
    
    
class OpenSearchAtomReader(Processor):
    def parse(self):
        output = {}
        output['items'] = [child for child in self.parse_children(tags=['//*', 'entry'])]
        
        print output
        return tidy_dict(output)

    def _parse_child(self, child):
        entry = {}

        entry['title'] = extract_item(child, ['title'])
        entry['id'] = extract_item(child, ['id'])
        entry['creator'] = extract_item(child, ['creator'])
        entry['author'] = extract_item(child, ['author', 'name'])
        entry['date'] = extract_item(child, ['date'])
        entry['updated'] = extract_item(child, ['updated'])
        entry['published'] = extract_item(child, ['published'])

        entry['subjects'] = [e.attrib.get('term', '') for e in extract_elems(child, ['category'])]

        entry['contents'] = []
        contents = extract_elems(child, ['content'])
        for content in contents:
            text = content.text.strip() if content.text else ''
            content_type = content.attrib.get('type', '')
            entry['contents'].append({'content': text, 'type': content_type})

        entry['links'] = []
        links = extract_elems(child, ['link'])
        for link in links:
            href = link.attrib.get('href', '')
            rel = link.attrib.get('rel', '')
            entry['links'].append({'href': href, 'rel': rel})

        return tidy_dict(entry)

class OpenSearchRssReader(Processor):
    def parse(self):
        output = {}
        output['items'] = [child for child in self.parse_children(tags=['//*', 'item'])]
        return tidy_dict(output)

    def _parse_child(self, child):
        item = {}
        item['title'] = extract_item(child, ['title'])
        item['language'] = extract_item(child, ['language'])
        item['author'] = extract_item(child, ['author'])
        # TODO: go sort out what this is: http://purl.org/rss/1.0/modules/content/
        item['encoded'] = extract_item(child, ['encoded'])
        item['id'] = extract_item(child, ['guid'])
        item['creator'] = extract_item(child, ['creator'])

        item['subjects'] = extract_items(child, ['category'])
        item['published'] = extract_item(child, ['pubDate'])
        item['timestamp'] = extract_item(child, ['date'])

        item['links'] = extract_items(child, ['link'])
        item['links'] += extract_items(child, ['docs'])
        
        return tidy_dict(item)

In [53]:
### let's do stuff

response = '''<?xml version="1.0" encoding="utf-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
    <ShortName>CEOS</ShortName>
    <Description/>
    <InputEncoding>UTF-8</InputEncoding>
    <Image type="image/vnd.microsoft.icon" width="16" height="16"
        >http://www.ceos.org/templates/oceanwaves/favicon.ico</Image>
    <Url type="application/opensearchdescription+xml" rel="self"
        template="http://www.ceos.org/index.php?option=com_search&amp;view=remind&amp;format=opensearch"/>
    <Url type="text/html"
        template="http://www.ceos.org/index.php?option=com_search&amp;searchword={searchTerms}"/>
</OpenSearchDescription>'''

url = 'http://www.ceos.org/index.php?option=com_search&amp;view=remind&amp;format=opensearch'
identity = {"protocol": "OpenSearch", "service": {"name": "DescriptionDocument"}}

reader = OpenSearchReader(identity, response.replace('\n', ''), url)
reader.parse()
reader.description



{'service': {'endpoints': [{'actionable': 'NOPE',
    'parameters': [{'name': 'format', 'type': 'opensearch'},
     {'name': 'option', 'type': 'com_search'},
     {'name': 'view', 'type': 'remind'}],
    'protocol': 'application/opensearchdescription+xml',
    'template': 'http://www.ceos.org/index.php?option=com_search&view=remind&format=opensearch'},
   {'actionable': 'NOPE',
    'parameters': [{'name': 'option', 'type': 'com_search'},
     {'name': 'searchword', 'type': '{searchTerms}'}],
    'protocol': 'text/html',
    'template': 'http://www.ceos.org/index.php?option=com_search&searchword={searchTerms}'}],
  'title': ['CEOS']}}

In [56]:
with open('../response_examples/opensearch_usgs_search_atom.xml', 'r') as f:
    response = f.read().replace('\\\n', '').replace('\\n', '').replace('\n', '')
    
url = 'http://www.ceos.org/index.php?option=com_search&amp;view=remind&amp;format=opensearch'
identity = {"protocol": "OpenSearch",
    "resultset": {
        "dialect": "ATOM"
    }}

reader = OpenSearchReader(identity, response.replace('\n', ''), url)
reader.parse()
reader.description

{'items': [{'updated': '2014-06-21T19:49:32-06:00', 'links': [{'href': 'https://www.sciencebase.gov/catalog/item/5287d495e4b03b89f6f1a088.atom', 'rel': 'self'}, {'href': 'https://www.sciencebase.gov/catalog/item/5287d495e4b03b89f6f1a088', 'rel': ''}, {'href': 'https://www.sciencebase.gov/catalog/item/feedMap/5287d495e4b03b89f6f1a088', 'rel': 'related'}], 'subjects': ['geology', 'geologic maps', 'surficial geologic units', 'unconsolidated deposits', 'geospatial datasets', 'geoscientificInformation', 'Canyonlands National Park', 'Utah', 'Druid Arch 7.5-minute quadrangle', 'The Loop 7.5-minute quadrangle', '49037 = San Juan'], 'published': '2013-11-16T13:24:53-07:00', 'title': 'Surficial Geologic Map of The Loop and Druid Arch Quadrangles,Canyonlands National Park, Utah', 'id': 'https://www.sciencebase.gov/catalog/item/5287d495e4b03b89f6f1a088', 'contents': [{'content': '<div> This geologic map is a product of a cooperative project between theU.S. Geological Survey and the U.S. National P

{'children': {'items': [{'contents': [{'content': '<div> This geologic map is a product of a cooperative project between theU.S. Geological Survey and the U.S. National Park Service to providegeologic information about this part of Canyonlands National Park, Utah.This digital map database contains bedrock data from previously publisheddata that has been modified by the author. New mapping of the surficialdeposits represents the general distribution of surficial deposits of theDruid Arch and The Loop 7.5-minute quadrangles.</div>',
      'type': 'html'}],
    'id': 'https://www.sciencebase.gov/catalog/item/5287d495e4b03b89f6f1a088',
    'links': [{'href': 'https://www.sciencebase.gov/catalog/item/5287d495e4b03b89f6f1a088.atom',
      'rel': 'self'},
     {'href': 'https://www.sciencebase.gov/catalog/item/5287d495e4b03b89f6f1a088',
      'rel': ''},
     {'href': 'https://www.sciencebase.gov/catalog/item/feedMap/5287d495e4b03b89f6f1a088',
      'rel': 'related'}],
    'published': '2013-