In [92]:
from lxml import etree
import re
import dateutil.parser as dateparser
import urlparse
import urllib
from itertools import chain, izip
import json

from bs4 import BeautifulSoup



In [140]:
content = '''<metadata>
    <title id=""></title>
    <!-- check for the json as attribute -->
    <sometext human_address="{&quot;address&quot;:&quot;1801 Hawthorn Ln.&quot;,&quot;city&quot;:&quot;West Chicago&quot;,&quot;state&quot;:&quot;IL&quot;,&quot;zip&quot;:&quot;60185&quot;}"></sometext>
    <!-- scale things that are easy -->
    <sometext>Public Land Survey System data were collected from 1:500,000 and 1:1,000,000 State base maps for the western United States.</sometext>
    <!-- scale things that are less easy -->
    <sometext>#1:100,000-scale base maps</sometext>
    <!-- mostly the same. -->
    <sometext>In all, 85 corrections were made using 1:100,000-scale base maps.</sometext>
    <!-- mostly the same but also tilde. -->
    <sometext>"~1:63,000-scale"</sometext>
    <sometext>Here's some stuff with a url http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp in it.</sometext>
    <sometext>Here's some stuff with a url &lt;http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp&gt; in it.</sometext>
    <sometext>Here's some stuff with a url http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp} in it.</sometext>
    <sometext>Here's some stuff with a url (http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp) in it.</sometext>
    <sometext></sometext>
    <sometext>ISO 19115-2:2009(E)</sometext>
    
    <!-- fun with bad dates -->
    <somedate>9999-01-01T00:00:00</somedate>
    <somedate>0001-01-01T00:00:00</somedate>
    <somedate>NaN-01-01T00:00:00</somedate>
    <somedate>"1999-01-01T00:00:00"</somedate>
    <somedate>00:35:00</somedate>
    <somedate>"00:35:00"</somedate>
    <somedate></somedate>
    
    <!-- exclude by tags or tag-related things -->
    <someelement>
        <CI_DateTypeCode codeList="http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_DateTypeCode" codeListValue="publication"></CI_DateTypeCode>
    </someelement>
    <someelement template="http://www.blob.com"></someelement>
    <someelement><Value>blob:ThisIsNot:A:URN</Value></someelement>
    <someelement schemaLocation="http://www.blob.com/schema.xsd"></someelement>
    
    <!-- things with urls in them -->
    <someurl>&lt;URL: plss_la_usgs_2003.gif&gt;</someurl>
    <someurl>"http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&amp;dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface"</someurl>
    <someurl>http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp .</someurl>
    <someurl>(http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp)</someurl>
    <someurl>(http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp):</someurl>
    <someurl></someurl>
    <someurl></someurl>
    
    <someurn>WWW:SPARQL:1.1</someurn>
</metadata>'''


# content = '''<metadata>
#     <someurl>"http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http://ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&amp;dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface"</someurl>
# </metadata>'''

In [141]:
class BasicParser():
    '''
    not concerned about namespaces or querying

    note: these could merge at some point
    '''
    def __init__(self, text, handle_html=False, include_html_hrefs=False):
        self.text = text.encode('unicode_escape')
        self.parser = etree.XMLParser(
            remove_blank_text=True,
            remove_comments=True,
            recover=True,
            remove_pis=True
        )
        self.handle_html = handle_html
        self.include_html_hrefs = include_html_hrefs

        self._parse()
        
        #print self.xml

    def _parse(self):
        try:
            self.xml = etree.fromstring(self.text, parser=self.parser)
        except Exception as ex:
            print ex
            raise

    def _un_htmlify(self, text):
        def _handle_bad_html(s):
            pttn = re.compile('<|>')
            return pttn.sub(' ', s)

        soup = BeautifulSoup(text.strip())

        # get all of the text and any a/@href values
        texts = [_handle_bad_html(t.strip('"')) for t in soup.find_all(text=True)]
        if self.include_html_hrefs:
            texts += [unquote(a['href']) for a in soup.find_all('a') if 'href' in a.attrs]

        try:
            text = ' '.join(texts)
        except Exception as ex:
            print ex
            raise
        return text

    def strip_text(self, exclude_tags=[]):
        def _extract_tag(t):
            if not t:
                return
            return t.split('}')[-1]

        def _taggify(e):
            tags = [e.tag] + [m.tag for m in e.iterancestors()]
            tags.reverse()

            try:
                return [_extract_tag(t) for t in tags]
            except:
                return []

        for elem in self.xml.iter():
            t = elem.text.strip() if elem.text else ''
            tags = _taggify(elem)

            if [e for e in exclude_tags if e in tags]:
                continue

            if t:
                if self.handle_html and (
                        (t.startswith('<') and t.endswith('>'))
                        or ('<' in t or '>' in t)):
                    t = self._un_htmlify(t)
                if t:
                    yield ('/'.join(tags), t)

            for k, v in elem.attrib.iteritems():
                if v.strip():
                    v = next(iter(BeautifulSoup(v.strip())), '')
                    if v:
                        yield ('/'.join(tags + ['@' + _extract_tag(k)]), v.find("p").string)


In [142]:
_pattern_set = [
     ('url', re.compile(ur"((?:(?:https?|ftp|http)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:.(?:[a-z\\u00a1-\\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?)", re.IGNORECASE)),
    # a urn that isn't a url
    ('urn', re.compile(ur"(?![http://])(?![https://])(?![ftp://])(([a-z0-9.\S][a-z0-9-.\S]{0,}\S:{1,2}\S)+[a-z0-9()+,\-.=@;$_!*'%/?#]+)", re.IGNORECASE)),
    # ('urn', re.compile(ur"\burn:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+", re.IGNORECASE)),
    ('uuid', re.compile(ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)', re.IGNORECASE)),
    ('doi', re.compile(ur"(10[.][0-9]{4,}(?:[/][0-9]+)*/(?:(?![\"&\\'])\S)+)", re.IGNORECASE)),
    ('md5', re.compile(ur"([a-f0-9]{32})", re.IGNORECASE))
]

_rule_set = [
    ('uri', 'fileIdentifier/CharacterString'),  # ISO
    ('uri', 'identifier/*/code/CharacterString'),
    ('uri', 'dataSetURI/CharacterString'),
    ('uri', 'parentIdentifier/CharacterString'),
    ('uri', 'Entry_ID'),  # DIF
    ('uri', 'dc/identifier'),  # DC
    ('basic', 'Layer/Name'),  # WMS
    ('basic', 'dataset/@ID'),  # THREDDS
    ('uri', '@URI'),  # ddi
    ('uri', '@IDNo')  # ddi
]

def match(s, p):
    '''
    extract from regex
    '''
    m = re.search(p, s)
    return m.group(0) if m else ''

def unquote(url):
    return urllib.unquote(url)


def break_url(url):
    parts = urlparse.urlparse(url)

    url = urlparse.urlunparse((
        parts.scheme,
        parts.netloc,
        parts.path,
        None, None, None
    ))

    params = urlparse.parse_qs(parts.query)
    values = list(chain.from_iterable((params.values())))

    return url, ' '.join(values)



In [145]:
# the new workflow
'''
extract urls from the text
split into base and values
extract urls from values


store as sha, text, type, method, id
'''

class Identifier(object):
    pass

class IdentifierExtractor(object):
    def __init__(self, source_url, source_xml_as_str):
        self.source_url = source_url
        self.source_xml_as_str = source_xml_as_str
        
        self.identifieds = []
        self.texts = []
        self.seen_texts = []
        
        self._parse()
    
    def _parse(self):
        try:
            parser = BasicParser(self.source_xml_as_str, True, True)
        except Exception as ex:
            print ex
        if not parser or parser.xml is None:
            raise Exception('failed to parse')
        
        for tag, txt in parser.strip_text():
            self.texts.append((tag, txt))
            
    def _strip_punctuation(self, text):
        terminal_punctuation = '(){}[].,~|":'
        text = text.strip(terminal_punctuation)
        return text.strip()

    def _strip_dates(self, text):
        # this should still make it an invalid date
        # text = text[3:] if text.startswith('NaN') else text
        try:
            d = dateparser.parse(text)
            return ''
        except ValueError:
            return text
                
    def _strip_scales(self, text):
        scale_pttn = ur"(1:[\d]{0,}(,[\d]{3}){1,})"
        m = match(text, scale_pttn)
        if m:
            return ''
        return text

    def _strip_excludes(self, text):
        if any(e.lower() in text.lower() for e in excludes):
            return ''
        return text
        
    def _tidy_text(self, text):
        text = self._strip_punctuation(text)
        if not text:
            return ''
        
        text = self._strip_scales(text)
        if not text:
            return ''
    
        text = self._strip_dates(text)
        if not text:
            return ''
        
        return text
        
    def _extract_url(self, text):
        pttn = re.compile(ur"((?:(?:https?|ftp|http)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:.(?:[a-z\\u00a1-\\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?)",
                          re.IGNORECASE)
        m = match(text, pttn)
        if not m:
            return '', []
        
        space_pattern = re.compile(' ')
        if space_pattern.subn(' ', m)[1] > 0:
            # i actually don't know if this is the right index. huh.\
            m = m.split(' ')[0]

        url = unquote(m)
        base_url, values = break_url(url)
        
        # return the original extracted url, and the values plus 
        # the base_url for more extracting
        return url, values.split(' ') + [base_url]

    def _extract_identifiers(self, text):
        for pattern_type, pattern in _pattern_set:
            m = match(text, pattern)
            if not m:
                continue
            yield pattern_type, m
      
    def _check_identifieds(self, extracted):
        return len([a for a in self.identifieds if a[4] == extracted]) > 0
    
    def _check_seens(self, extracted):
        return len([a for a in self.seen_texts if a == extracted]) > 0
    
    def process_text(self):
        while self.texts:
            tag, text = self.texts.pop()
            if self._check_seens(text) or not text.strip():
                continue
                
            print 'LOOP: ', len(self.texts)
            
            print 'STARTING: ', text
            try:
                j = json.loads(text)
                j.keys()  # it will decode a quoted string without error
                # print 'wtf', j
                continue
            except:
                pass
            
            url, values = self._extract_url(text)
            if url:
                print 'VALUES A:', len(values)
                print 'VALUES: ', values
            
            values = [v for v in values if not self._check_seens(v)]
            if url:
                print 'VALUES B:', len(values)
            self.texts += list(iter(izip([tag]* len(values), values)))
            print 'ADDING: ', len(self.texts)
            
            if url and not self._check_identifieds(url) and not self._check_seens(url):
                self.identifieds.append((tag, text, 'url', 'regex', url))
                self.seen_texts.append(url)
            
            # now run the OTHER regex
            for match_type, match_text in self._extract_identifiers(text):
                if match_text and not self._check_identifieds(match_text):
                    self.identifieds.append((tag, text, match_type, 'regex', self._tidy_text(match_text)))
                if not self._check_seens(match_text):
                    self.texts.append((tag, match_text))
                    self.seen_texts.append(match_text)
                    print 'MATCH ADDING: ', len(self.texts)
                
        
            
            
        

In [146]:
extractor = IdentifierExtractor('http://www.example.com/some/stuff?uid=urn:234:MyThing&x=nothing', content)
extractor.process_text()

extractor.identifieds

LOOP:  27
STARTING:  WWW:SPARQL:1.1
ADDING:  27
MATCH ADDING:  28
LOOP:  26
STARTING:  (http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp):
VALUES A: 2
VALUES:  ['', 'http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp']
VALUES B: 2
ADDING:  28
MATCH ADDING:  29
LOOP:  24
STARTING:  http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp .
VALUES A: 2
VALUES:  ['', 'http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp']
VALUES B: 1
ADDING:  25
LOOP:  23
STARTING:  "http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface"
VALUES A: 3
VALUES:  ['http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml', 'las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface', 'http://ferret.pmel.noaa.gov/thredds/view/ToolsU

[('metadata/someurn', 'WWW:SPARQL:1.1', 'urn', 'regex', 'WWW:SPARQL:1.1'),
 ('metadata/someurl',
  '(http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp):',
  'url',
  'regex',
  'http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp'),
 ('metadata/someurl',
  '(http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp):',
  'urn',
  'regex',
  'http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp'),
 ('metadata/someurl',
  '"http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface"',
  'url',
  'regex',
  'http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface'),
 ('metadata/someurl',
  

In [119]:
s = 'http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&amp;dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface'
url, values = break_url(s)

print url
print
print values

http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp

http:/ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface


In [137]:
s = "http://ferret.pmel.noaa.gov/thredds/view/ToolsUI.jnlp?catalog=http://ferret.pmel.noaa.gov/thredds/catalog/las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface_Level/catalog.xml&dataset=las/NOAA-CIRES-CDC-CDC_Derived_NCEP_Reanalysis_Products_Surface"
pttn = re.compile(ur"((?:(?:https?|ftp|http)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:.(?:[a-z\\u00a1-\\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?)",
                          re.IGNORECASE)
match(s, pttn)


c = json.loads('"%s"' % s)
c.keys()

AttributeError: 'unicode' object has no attribute 'keys'

## IGNORE THIS STUFF RIGHT NOW

In [91]:
soup = BeautifulSoup('http://www.example.com/some/stuff?uid=urn:234:MyThing&x=nothing')
soup.find("p").string

json.loads(u'{"address":"1801 Hawthorn Ln.","city":"West Chicago","state":"IL","zip":"60185"}')

NameError: name 'json' is not defined

In [8]:
def tidy(text):
    terminal_punctuation = '(){}[].,~|":'
    text = text.strip(terminal_punctuation)
    
    print 'remove punctuation: ', text

    text = text[3:] if text.startswith('NaN') else text
    
    print 'remove nan: ', text

    # check for scale?
    scale_pttn = ur"(1:[\d]{0,}(,[\d]{3}){1,})"
    m = match(text, scale_pttn)
    if m:
        print 'found scale: ', text, m
        return ''

    try:
        d = dateparser.parse(text)
        print 'parsed date: ', d
        return ''
    except ValueError:
        pass

    return text

In [9]:
# just the date checker thing

some_dates = [
    '9999-01-01T00:00:00',
    '0001-01-01T00:00:00',
    'NaN-01-01T00:00:00',
    '"1999-01-01T00:00:00"',
    '00:35:00',
    '"00:35:00"',
    '"00:35:00"):'
]



for sd in some_dates:
    nsd = tidy(sd)
    print sd, ' <> ', nsd

remove punctuation:  9999-01-01T00:00:00
remove nan:  9999-01-01T00:00:00
parsed date:  9999-01-01 00:00:00
9999-01-01T00:00:00  <>  
remove punctuation:  0001-01-01T00:00:00
remove nan:  0001-01-01T00:00:00
parsed date:  2001-01-01 00:00:00
0001-01-01T00:00:00  <>  
remove punctuation:  NaN-01-01T00:00:00
remove nan:  -01-01T00:00:00
parsed date:  2015-01-01 00:00:00
NaN-01-01T00:00:00  <>  
remove punctuation:  1999-01-01T00:00:00
remove nan:  1999-01-01T00:00:00
parsed date:  1999-01-01 00:00:00
"1999-01-01T00:00:00"  <>  
remove punctuation:  00:35:00
remove nan:  00:35:00
parsed date:  2015-05-22 00:35:00
00:35:00  <>  
remove punctuation:  00:35:00
remove nan:  00:35:00
parsed date:  2015-05-22 00:35:00
"00:35:00"  <>  
remove punctuation:  00:35:00
remove nan:  00:35:00
parsed date:  2015-05-22 00:35:00
"00:35:00"):  <>  


In [12]:
from bs4 import BeautifulSoup

text = '<value>&quot;AIRS.2003.01.01.L3.RetStd_H030.v6.0.12.0.G14121184349.hdf&quot;</value>'
soup = BeautifulSoup(text)

x = soup.find_all(text=True)[0]

x.strip('"')


u'AIRS.2003.01.01.L3.RetStd_H030.v6.0.12.0.G14121184349.hdf'

In [19]:
text = '<city>{&quot;address&quot;:&quot;Venice&quot;,&quot;city&quot;:&quot;&quot;,&quot;state&quot;:&quot;CA&quot;,&quot;zip&quot;:&quot;90291&quot;}</city>'

soup = BeautifulSoup(text)
x = soup.find_all(text=True)
print x


[u'{"address":"Venice","city":"","state":"CA","zip":"90291"}']
