In [None]:
import os
from lxml import etree
from HTMLParser import HTMLParser
import urlparse
import urllib


In [None]:
### UTILS

def generate_localname_xpath(tags):
    '''
    Build a namespace-agnostic XPath expression from a sequence of steps.

    Every step is rewritten to match on local-name() only, so the
    expression works regardless of namespace prefixes. A step containing
    "@" is emitted as an attribute test (the "@" is moved in front of
    the wildcard). The steps "*", "..", "." and "//*" are passed through
    unchanged. The resulting steps are joined with "/".
    '''
    passthrough = ('*', '..', '.', '//*')

    def to_step(tag):
        if tag in passthrough:
            return tag
        axis = '@' if '@' in tag else ''
        return '%s*[local-name()="%s"]' % (axis, tag.replace('@', ''))

    return '/'.join(to_step(tag) for tag in tags)


def extract_attrib(elem, tags):
    '''
    Return the first XPath match for ``tags`` under ``elem``, stripped.

    The match must support ``.strip()`` (presumably an attribute string,
    going by the function name). An empty string is returned when there
    is no match or the match is falsy.
    '''
    value = extract_elem(elem, tags)
    if not value:
        return ''
    return value.strip()


def extract_attribs(elem, tags):
    '''
    Return every XPath match for ``tags`` under ``elem``, each stripped.

    Each match must support ``.strip()`` (presumably attribute strings,
    going by the function name). An empty list is returned when nothing
    matches.
    '''
    # Use extract_elems (all matches) rather than extract_elem (first match
    # only): iterating a single match walks the characters of one string,
    # and raises TypeError when there is no match at all (None).
    return [match.strip() for match in extract_elems(elem, tags)]


def extract_item(elem, tags):
    '''
    Return the stripped ``.text`` of the first match for ``tags``.

    An empty string is returned when there is no match or the matched
    node has no (or empty) text.
    '''
    node = extract_elem(elem, tags)
    if node is None or not node.text:
        return ''
    return node.text.strip()


def extract_items(elem, tags):
    '''
    Return the stripped ``.text`` of every match for ``tags``.

    Matches that are None or that have no (or empty) text are skipped.
    '''
    texts = []
    for node in extract_elems(elem, tags):
        if node is None or not node.text:
            continue
        texts.append(node.text.strip())
    return texts


def extract_elems(elem, tags):
    '''
    Return whatever ``elem.xpath`` yields for the local-name XPath
    generated from ``tags`` (all matches).
    '''
    return elem.xpath(generate_localname_xpath(tags))


def extract_elem(elem, tags):
    '''
    Return the first result of the local-name XPath generated from
    ``tags``, or None when the query yields nothing.
    '''
    matches = elem.xpath(generate_localname_xpath(tags))
    for first in matches:
        # only the first result is wanted
        return first
    return None


def unquote(url):
    '''
    Return ``urllib.unquote(url)``.

    Thin wrapper around the Python 2 ``urllib.unquote`` call; no other
    processing is applied to ``url``.
    '''
    return urllib.unquote(url)


def tidy_dict(items):
    '''
    Remove every entry with a falsy value from ``items`` and return it.

    The dict is modified in place and the same object is returned.
    Only the top level is cleaned: nested containers are kept as long as
    they are themselves non-empty. Note that *any* falsy value is
    dropped, i.e. None, '', [], {} and also 0 and False.
    '''
    # .items() instead of the Python-2-only .iteritems() so this also works
    # on Python 3 dicts and on mappings that do not provide iteritems().
    # The keys are snapshotted first because deleting while iterating over
    # the live mapping is an error.
    empty_keys = [key for key, value in items.items() if not value]
    for key in empty_keys:
        del items[key]

    return items

In [None]:
### CLASSES

class BasicParser():
    '''
    Parse a blob of XML text with a lenient lxml parser and collect the
    namespaces declared in it.

    not concerned about namespaces or querying

    note: these could merge at some point

    Attributes set by the constructor:
        text        the input after the unicode_escape encoding step
        parser      the etree.XMLParser used for parsing
        xml         the result of etree.fromstring for ``text``
        namespaces  dict of prefix -> namespace URI (see
                    _extract_namespaces)
    '''
    def __init__(self, text):
        '''
        Encode ``text``, parse it and extract its namespaces.

        Any exception raised while parsing propagates to the caller.
        '''
        try:
            self.text = text.encode('unicode_escape')
        except UnicodeDecodeError:
            # Fallback for input that cannot be encoded directly
            # (presumably Python 2 byte strings holding non-ASCII bytes):
            # decode as UTF-8, replacing undecodable bytes, then encode.
            # TODO: this should be somewhere else and also maybe not this
            self.text = text.decode('utf-8', 'replace').encode('unicode_escape')
        # lenient parser: recover from malformed input and drop blank text,
        # comments and processing instructions
        self.parser = etree.XMLParser(
            remove_blank_text=True,
            remove_comments=True,
            recover=True,
            remove_pis=True,
            ns_clean=True
        )
        self._parse()
        self._extract_namespaces()

    def _parse(self):
        '''
        Parse self.text into self.xml using self.parser.

        A parsing failure is printed and then re-raised unchanged.
        '''
        try:
            self.xml = etree.fromstring(self.text, parser=self.parser)
        except Exception as ex:
            # print(ex) gives the same output as the Python 2 print statement
            print(ex)
            # bare raise keeps the original traceback; "raise ex" would
            # restart it at this line on Python 2
            raise

    def _extract_namespaces(self):
        '''
        Pull all of the namespaces in the source document
        and generate a list of tuples (prefix, URI) to dict

        The result is stored in self.namespaces. The unprefixed namespace
        of the root element is stored under the key 'default'. URIs that
        only appear further down the tree are added under their own
        prefix, or under 'default<N>' when they have no prefix, where N
        is the position of that entry in the '//namespace::*' result.
        '''
        if self.xml is None:
            # nothing was parsed, so there is nothing to collect
            self.namespaces = {}
            return

        # namespaces visible on the root element
        document_namespaces = dict(self.xml.xpath('/*/namespace::*'))
        if None in document_namespaces:
            # give the unprefixed namespace a usable string key
            document_namespaces['default'] = document_namespaces.pop(None)

        # now run through any child namespace issues
        all_namespaces = self.xml.xpath('//namespace::*')
        for i, (prefix, uri) in enumerate(all_namespaces):
            if uri in document_namespaces.values():
                # URI already registered under some key
                continue
            # NOTE(review): a prefix that is re-bound to a different URI
            # deeper in the tree replaces the earlier entry here.
            new_key = prefix if prefix else 'default%s' % i
            document_namespaces[new_key] = uri

        self.namespaces = document_namespaces