from urls and/or xml text/attributes, can we extract urns? uuids? guids? dois? other ids.

starting from a url:

1. parse url
2. parse route
3. parse query params (un-urlencode)
4. check route parts


starting with xml:

1. parse xml
2. run the xpath ruleset
3. test xpath results if results
4. extract all other text
5. check text

In [1]:
import json
import glob
from lxml import etree
import re
import os

# tuple = type, sample string
url_sampleset = [
('uuid', 'https://data.noaa.gov/harvest/object/b1b6e62a-cc9b-4cf5-89e5-d280e854eb1c'),
('uuid', 'https://data.noaa.gov/harvest/object/2ee5666a-2f9e-4e69-997f-474737fcce71/original'),
('urn', 'https://openknowledge.worldbank.org/oai/request?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:openknowledge.worldbank.org:10986/18612'),
('other', 'http://www.ngdc.noaa.gov/metadata/published/NOAA/NESDIS/NGDC/MGG/NOS/H10001-H12000/iso/xml/H10193.xml'),
('other', 'https://www.sciencebase.gov/catalog/item/51fc076be4b04b00e3d891e5?format=atom'),
('other', 'http://acdisc.gsfc.nasa.gov/opendap/HDF-EOS5/Aura_OMI_Level3/OMSO2e.003/2011/OMI-Aura_L3-OMSO2e_2011m0104_v003-2012m0409t151714.he5.ddx'),
('uuid', 'https://www.ngdc.noaa.gov/geoportal/rest/document?id=%7B6FCC9928-1352-44F3-9D7B-BBDC9AF15E9A%7D'),
('uuid', 'http://gstore.unm.edu/apps/rgis/datasets/35946660-26d0-4d05-955f-8af7ced2b0c1/metadata/FGDC-STD-001-1998.xml'),
('other', 'http://e4ftl01.cr.usgs.gov/MOLT/MOD14.005/2000.04.28/MOD14.A2000119.2335.005.2006261045911.hdf.xml'),
('uuid', 'http://portal.oceannet.org/search/full/catalogue/dassh.ac.uk__MEDIN_2.3__CEFAS1abbc9be-1014-45ac-a281-808310790c31.xml/DIF_9.4'),
('other', 'http://earth.eo.esa.int/ml3/n412/2005/L3_ENV_MER_N412_m__20050201_GLOB_SI_ESA_9277x9277_-90+90+-180+180_0000.xml'),
('other', 'http://www.aauw-ca.org/blog/rss.cfm?mode=full&mode2=cat&catid=9D122695-1617-7A4B-92209B7B1C4B6D75'),
('doi', 'http://data.datacite.org/10.14457/KU.THE.2006.802'),
('doi', 'http://dx.doi.org/10.4224/20386605'),
('doi', 'http://link.springer.com/referenceworkentry/10.1007%2F1-4020-3880-1_85'),
('urn', 'http://epsg-registry.org/export.htm?wkt=urn:ogc:def:crs:EPSG::4979'),
]


In [2]:
import urllib
import urlparse
from itertools import chain

def unquote(url):
    return urllib.unquote(url)

def return_path(url, do_split=False):
    url = unquote(url)
    if do_split:
        return urlparse.urlparse(url).path.split('/')
    return urlparse.urlparse(url).path

def return_parameter_values(url, do_split=False):
    url = unquote(url)
    parse = urlparse.urlparse(url)
    if do_split:
        qp = urlparse.parse_qs(parse.query)
        return list(chain.from_iterable(qp.values()))
    return parse.query
    
def match(s, p):
    m = re.search(p, s)
    return m.group(0) if m else ''

In [40]:
# extract urn patterns

# standalone
urn_pattern = re.compile(ur"^urn:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+$", re.IGNORECASE)
# in text with punctuation
urn_pattern2 = re.compile(ur"\burn:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]*[a-z0-9+=@$/]", re.IGNORECASE)
# in text
urn_pattern3 = re.compile(ur"\burn:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+", re.IGNORECASE)


In [41]:
# extract uuid pattern

uuid_pattern = re.compile(ur'(\w{8}(-\w{4}){3}-\w{12}?)', re.IGNORECASE)

uuid_pattern2 = re.compile(ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)', re.IGNORECASE)


In [42]:
# extract doi

doi_pattern = re.compile(ur"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\\'])\\S)+)\b", re.IGNORECASE)

In [4]:
# use these patterns (so far)
pattern_set = [
('uuid', re.compile(ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)', re.IGNORECASE)),
('urn', re.compile(ur"\burn:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+", re.IGNORECASE)),
('doi', re.compile(ur"(10[.][0-9]{4,}(?:[/][0-9]+)*/(?:(?![\"&\\'])\S)+)", re.IGNORECASE)),
('urn', re.compile(ur"(oai:[a-z0-9.][a-z0-9-.]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+)", re.IGNORECASE))
]

In [None]:
# let's make a ruleset based on identified patterns:
# http://philarcher.org/diary/2013/uripersistence/

In [59]:
# run the checks

for kind, url in url_sampleset:
    path = return_path(url)
    query = return_parameter_values(url)
    for pttn_type, pattern in pattern_set:
        m = match(path, pattern)
        if m:
            print 'found URL match: {0}, {1}'.format(m, path)
            print '\tExpected {0}? {1}'.format(kind, kind==pttn_type)
            break
        
        m = match(query, pattern)
        if m:
            print 'found QUERY match: {0}, {1}'.format(m, query)
            print '\tExpected {0}? {1}'.format(kind, kind==pttn_type)
            break
       

found URL match: b1b6e62a-cc9b-4cf5-89e5-d280e854eb1c, /harvest/object/b1b6e62a-cc9b-4cf5-89e5-d280e854eb1c
	Expected uuid? True
found URL match: 2ee5666a-2f9e-4e69-997f-474737fcce71, /harvest/object/2ee5666a-2f9e-4e69-997f-474737fcce71/original
	Expected uuid? True
found QUERY match: oai:openknowledge.worldbank.org:10986/18612, verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:openknowledge.worldbank.org:10986/18612
	Expected urn? True
found QUERY match: 6FCC9928-1352-44F3-9D7B-BBDC9AF15E9A, id={6FCC9928-1352-44F3-9D7B-BBDC9AF15E9A}
	Expected uuid? True
found URL match: 35946660-26d0-4d05-955f-8af7ced2b0c1, /apps/rgis/datasets/35946660-26d0-4d05-955f-8af7ced2b0c1/metadata/FGDC-STD-001-1998.xml
	Expected uuid? True
found URL match: 1abbc9be-1014-45ac-a281-808310790c31, /search/full/catalogue/dassh.ac.uk__MEDIN_2.3__CEFAS1abbc9be-1014-45ac-a281-808310790c31.xml/DIF_9.4
	Expected uuid? True
found URL match: 10.14457/KU.THE.2006.802, /10.14457/KU.THE.2006.802
	Expected doi? True
found U

notes:

urn regex(es) unlikely to be very generalized right now.

ugh. text. and rss. and encoded html in the rss with the identifiers.



building the test set. get the responses for the test urls (find identifer from url in response?)

go get our known similarity (ie duplicates as different representations) set and try on those responses.

In [3]:
import glob
import json
import re
from lxml import etree
from bs4 import BeautifulSoup

import logging
reload(logging)

logger = logging.getLogger(__name__)
handler = logging.FileHandler(filename="identifier_extraction.log", mode="a", encoding="UTF-8")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

# the array of any identifiers found in a url
# as (source url, identifier)
_found_identifiers = []

def un_htmlify(text):
    # for cdata wrapped things? this is unpleasant
    # but we want identifiers in the rss/atom element
    # wrapped in cdata as encoded html
    
    def _handle_bad_html(s):
        pttn = re.compile('<|>')
        return pttn.sub(' ', s)
    
    soup = BeautifulSoup(text.strip())
    
    # get all of the text and any a/@href values
    texts = [_handle_bad_html(t) for t in soup.find_all(text=True)] + \
            [unquote(a['href']) for a in soup.find_all('a') if 'href' in a.attrs]
        
    try:
        all_text = ' '.join(texts)
    except:
        raise
    return all_text

class Parser():
    def __init__(self, text):
        self.text = text
        self.parser = etree.XMLParser(
            remove_blank_text=True, 
            remove_comments=True, 
            recover=True,
            remove_pis=True
        )
        self._parse()
        
    def _parse(self):
        try:
            self.xml = etree.fromstring(text, parser=self.parser)
        except:
            raise
    
    def strip_text(self):
        # pull any text() and attribute. again.
        # bag of words BUT we care about where in
        # the tree it was found (just for thinking)
        # except do not care about namespace prefixed
        # why am i not stripping out the prefixes? no idea.
        # just don't want to install pparse/saxonb really
        
        def _extract_tag(t):
            if not t:
                return
            return t.split('}')[-1]
            
        def _taggify(e):
            tags = [e.tag] + [m.tag for m in e.iterancestors()]
            tags.reverse()
            
            try:
                return [_extract_tag(t) for t in tags]
            except Exception as ex:
                print tags
                print ex
        
        # to exclude based on the tags *only*
        exclude_pttns = ['schemaLocation']
        
        blobs = []
        for elem in self.xml.iter():
            t = elem.text.strip() if elem.text else ''
            tags = _taggify(elem)
            
            if [e for e in exclude_pttns if e in tags]:
                continue
            
            if t:
                blobs.append(('/'.join(tags), t))
            
            for k, v in elem.attrib.iteritems():
                if v.strip():
                    blobs.append(('/'.join(tags + ['@' + _extract_tag(k)]), v.strip()))
        
        return blobs
            
def process_text(text):
    # check for html, get that as a bag of space-delimited words
    # run some regex for identifiers  
    if (text.startswith('<') and text.endswith('>')) or ('<' in text or '>' in text):
        try:
            text = un_htmlify(text)
        except:
            raise
    
    # look for *any* identifiers
    for match_tuple in match_identifier_patterns(text):
        yield match_tuple

def match_identifier_patterns(s):
    space_pttn = re.compile(' ')
    exclude_pttns = [':ogc:', ':epsg:', '.xsd', 'codelist', 'rolecode', '.xsl']
    
    for pttn_type, pattern in pattern_set:
        m = match(s, pattern)
        if m and not any(e in m.lower() for e in exclude_pttns):
            if pttn_type == 'url' and space_pttn.subn('', m)[1] > 0:
                m = m.split(' ')[0]
            
            # this is stupid. i am tired. so pointless.
            try:
                log_m = m.decode('utf-8', errors='ignore')
            except:
                print 'UNICODE fail: ', m
                log_m = 'CHECK Source - unicode issues'
            logger.debug(u'MATCH_PATTERNS: {0} is a {1} from ({2})'.format(log_m, pttn_type, s))
            
            yield (pttn_type, m)


def match_identifier_rules(xml):
    def _get_text(e):
        if isinstance(e, str):
            return e
        try:
            return e.text
        except Exception as ex:
            # print ex
            return ''
    def _build_xpath(rule):
        return '//' + '/'.join(['%s*[local-name()="%s"]' % ('@' if '@' in r else '', r) 
                               if r not in ['*', '..', '.'] else r for r in rule.split('/')])

    for pttn_type, rule in rule_set:
        xp = _build_xpath(rule)
        results = xml.xpath(xp)
        results = results if isinstance(results, list) else [results]
        
        for r in results:
            t = _get_text(r)
            if t is None:
                continue
            
            # this is stupid. i am tired. so pointless.
            try:
                log_t = t.decode('utf-8', errors='ignore')
            except:
                print 'UNICODE fail: ', t
                log_t = 'CHECK Source - unicode issues'
            logger.debug(u'MATCH_RULES: {0} is a {1} from (the XML)'.format(log_t, pttn_type))
            
            # run the regex pass (so URL -> UUID, etc)
            for match_tuple in match_identifier_patterns(t):
                yield match_tuple

In [8]:
from simhash import Simhash, SimhashIndex
import hashlib

def compare(simhash_to_match):
    pass

def generate_sha(s):
    return hashlib.sha224(s).hexdigest()

In [4]:
# i think we'll keep these as some general xpaths without! the
# local-name/namespacing and add that in method for readability
# and assume we're looking from root (so always '//*.....')
rule_set = [
    ('uri', 'fileIdentifier/CharacterString'),  # ISO
    ('uri', 'identifier/*/code/CharacterString'),
    ('uri', 'dataSetURI/CharacterString'),
    ('uri', 'parentIdentifier/CharacterString'),
    ('uri', 'Entry_ID'),  # DIF
    ('uri', 'dc/identifier'),  # DC
    ('basic', 'Layer/Name'),  # WMS
    ('basic', 'dataset/@ID'),  # THREDDS
]

In [5]:
pattern_set = [
    ('url', re.compile(ur"((?:(?:https?|ftp|http)://)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:.\d{1,3}){3})(?!(?:169.254|192.168)(?:.\d{1,3}){2})(?!172.(?:1[6-9]|2\d|3[0-1])(?:.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:.(?:[a-z\\u00a1-\\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?)", re.IGNORECASE)),
    ('urn', re.compile(ur"(oai:[a-z0-9.][a-z0-9-.]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+)", re.IGNORECASE)),
    ('urn', re.compile(ur"\burn:[a-z0-9][a-z0-9-]{0,31}:[a-z0-9()+,\-.:=@;$_!*'%/?#]+", re.IGNORECASE)),
    ('uuid', re.compile(ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)', re.IGNORECASE)),
    ('doi', re.compile(ur"(10[.][0-9]{4,}(?:[/][0-9]+)*/(?:(?![\"&\\'])\S)+)", re.IGNORECASE))
]


identifier identification

inconsistent urn patterns:

- can't assume it starts with urn
- can't assume it starts with three letters
- can't really assume any of the chunks are of some known length
- can't really assume any of the chunks are of some known structure (uuid, etc)
   
   
xml:

- xml resolvers are injecting repetitive namespace listings (or schemas)
    that can be captured as urls in the pattern set 
    (see http://catalog.data.gov/harvest/object/5e8cda58-9ea1-4038-9a11-98088f8749fa)
- iso codelists, the same
- xml resolvers often come with an internal @guid/@uuid (which is good for foaf, linking, etc,
    but bad for capturing noise)
- some uuid/guids in the resolved xml do not retain the hyphenated structure (even within the same record)
    
    
oh and you can't indiscriminately use @id in xpath searches in xml because that could be an explicit identifier (id="my-1234-value") instead of some internally required value (gml:id="d34555").

In [6]:
# a shallow fork of simhashindex (https://github.com/liangsun/simhash/blob/master/simhash/__init__.py)
# what we need to do, in a db-free test env, is retain more than the string and the hash of
# the string (we have the sha of the source, the string, and the simhash of the string)
# and a way to exclude a sha from the result set - find equivalent objects in another response

class HashIndex(object):
    def get_near_dups(self, simhash):
        """
        `simhash` is an instance of Simhash
        return a list of obj_id (pipe-delimited string of sha|text|distance)
        """
        assert simhash.f == self.f

        ans = set()

        for key in self.get_keys(simhash):
            dups = self.bucket.get(key, set())

            for dup in dups:
                sim2, obj_blob = dup.split(',', 1)
                sim2 = Simhash(long(sim2, 16), self.f)

                d = simhash.distance(sim2)
                if d <= self.k:
                    ans.add('{0}|{1}'.format(obj_blob, d))
        return list(ans)

    def add(self, obj_id, obj_str, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s|%s' % (simhash.value, obj_id, obj_str)

            self.bucket.setdefault(key, set())
            self.bucket[key].add(v)

    def delete(self, obj_id, obj_str, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s|%s' % (simhash.value, obj_id, obj_str)

            if v in self.bucket.get(key, set()):
                self.bucket[key].remove(v)

    def __init__(self, objs, f=64, k=2):
        """
        `objs` is a list of (sha, source obj (str), simhash)
        obj_id is a string, simhash is an instance of Simhash
        `f` is the same with the one for Simhash
        `k` is the tolerance
        """
        self.k = k
        self.f = f
        count = len(objs)

        self.bucket = {}

        for i, q in enumerate(objs):
            self.add(*q)

    @property
    def offsets(self):
        """
        You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf>
        """
        return [self.f // (self.k + 1) * i for i in range(self.k + 1)]

    def get_keys(self, simhash):
        for i, offset in enumerate(self.offsets):
            m = (i == len(self.offsets) - 1 and 2 ** (self.f - offset) - 1 or 2 ** (self.offsets[i + 1] - offset) - 1)
            c = simhash.value >> offset & m
            yield '%x:%x' % (c, i)

    def bucket_size(self):
        return len(self.bucket)


In [9]:
# notes: url regex drops the final } from the opensearch templates (should we care about the templates anyway?)

files = glob.glob('../testdata/solr_20150320/identify_20150325_p/*.json')

FOUND_LIMIT = 1000

all_identifiers = []

for f in files:
    with open(f, 'r') as g:
        data = json.loads(g.read())

    text = data.get('content').encode('unicode_escape')
    url = data.get('source_url')
    
    sha_id = generate_sha(url)
    
    # extract the text from the xml
    try:
        parser = Parser(text)
    except Exception as ex:
        print ex
        continue
    
    identifiers = []
    
    # process the tuples (tag, text)
    text_blobs = parser.strip_text()
    
    logger.debug('Processing for {0}: {1}'.format(url, len(text_blobs)))
    
    try:
        for tag_blob, text_blob in text_blobs:
            if not text_blob:
                continue

            for match_pttn, match_blob in process_text(text_blob):
                # add some simhash and match
                identifiers.append((sha_id, match_blob, Simhash(match_blob)))
    except Exception as ex:
        print url 
        print '\t', ex
        continue
            
    logger.debug('Processing for {0}: as XML'.format(url))
    
    for match_pttn, match_blob in match_identifier_rules(parser.xml):
        if not match_blob:
            continue
        
        identifiers.append((sha_id, match_blob, Simhash(match_blob)))
        
    logger.debug('Processing for {0}: as url'.format(url))
    
    for match_pttn, match_blob in match_identifier_patterns(url):
        if not match_blob:
            continue
        
#         identifiers.append((sha_id, match_blob, str(Simhash(match_blob).value)))
        identifiers.append((sha_id, match_blob, Simhash(match_blob)))
    
    # and run distinct on the simhash to get a unique set for the response
    identifiers = list(set(identifiers))
    
    # for any identifier, add a tuple to {some store}
    # where today {some store} is some files on disk
    # note: simhashes can't be reloaded from a string/hex so 
    # i'm going to take the simhash index and make the keys not be strings 
    if not identifiers:
        continue
        
    all_identifiers += identifiers

#     with open(os.path.join('simhashing', os.path.basename(f)), 'w') as g:
#         g.write(json.dumps(identifiers, indent=4))
        
#     if len(glob.glob('simhashing/*.json')) > FOUND_LIMIT:
#         break


UNICODE fail:  10.5281/zenodo.15667. 
http://semantic-mediawiki.org/wiki/Special:Ask/-5B-5BNews-20date::%2B-5D-5D-20-5B-5Blanguage-20code::en-5D-5D/format%3Dfeed/sort%3Dnews-20date/order%3Ddesc/searchlabel%3DAtom/type%3Datom/title%3DSemantic-20MediaWiki-20%E2%80%93-20news/description%3DLatest-20news-20from-20semantic-2Dmediawiki.org/page%3Dfull/offset%3D0

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



	'ascii' codec can't decode byte 0xc3 in position 35: ordinal not in range(128)
http://rss.nrdcfeeds.org/pulsoverde
	'ascii' codec can't decode byte 0xe2 in position 82: ordinal not in range(128)
http://rss.nrdcfeeds.org/switchboard_all
	'ascii' codec can't decode byte 0xe2 in position 43: ordinal not in range(128)
http://www.bibsonomy.org/burst/user/yish/analytics

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



	'ascii' codec can't decode byte 0xc3 in position 12: ordinal not in range(128)
https://www2.le.ac.uk/search_rss?sort_on=sortable_title&Subject:list=Astronomy
	'ascii' codec can't decode byte 0xe2 in position 48: ordinal not in range(128)
UNICODE fail:  http://idéemarque.ca
http://planet.ffii.org/atom.xml

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



	'ascii' codec can't decode byte 0xc3 in position 47: ordinal not in range(128)
UNICODE fail:  10.5281/zenodo.15667. 
http://www.w3.org/International/planet/atom.xml
	'ascii' codec can't decode byte 0xc4 in position 40: ordinal not in range(128)
UNICODE fail:  10.5281/zenodo.15667. 
http://www.eurovelo.org/feed/rdf/
	'ascii' codec can't decode byte 0xc3 in position 55: ordinal not in range(128)
UNICODE fail:  http://wellcomelibrary.org/content/documents/policy-documents/collection-development-policy.pdf [Accessed:
UNICODE fail:  10.5281/zenodo.15667. 


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [12]:
# from some 25K files, second harvest, extracted 642,610 things to hash and then 835,775 things

len(all_identifiers)

all_identifiers[500:510]

[('f7c002c73d636b5a5c47462ad316b5552d0f86539eb4e9143a5e361f',
  u'0e11e207-c223-4953-85be-5b5f9a5132d2',
  <simhash.Simhash at 0x104f6b250>),
 ('9543603cd1875d2a1923bcb7cbf95b09a4e471c9883401a989583345',
  u'http://ladsweb.nascom.nasa.gov/opendap/allData/51/MOBGARS_E10K/catalog.xml',
  <simhash.Simhash at 0x105e59610>),
 ('8b9f0fc259d4cbfafecc8202f7f84ac45e498953d2aab1b3363c55a3',
  'e42c4c1a-d30c-4595-ac70-81bd8ddc3189',
  <simhash.Simhash at 0x1052bbe90>),
 ('8b9f0fc259d4cbfafecc8202f7f84ac45e498953d2aab1b3363c55a3',
  '31c5b31a-d758-48ed-843f-9d15109990a9',
  <simhash.Simhash at 0x1052bb710>),
 ('8b9f0fc259d4cbfafecc8202f7f84ac45e498953d2aab1b3363c55a3',
  '6aa3b6ed-0dc2-4f18-98de-4e2b3cb580f0',
  <simhash.Simhash at 0x1052db8d0>),
 ('8b9f0fc259d4cbfafecc8202f7f84ac45e498953d2aab1b3363c55a3',
  'http://lod.data-archive.ac.uk/skoshasset/49f16038-0e2c-4144-8974-0375d70b2f90',
  <simhash.Simhash at 0x1052bb750>),
 ('8b9f0fc259d4cbfafecc8202f7f84ac45e498953d2aab1b3363c55a3',
  '29e00e5c

In [26]:
import time

# additional blacklisting of things
excludes = [
    'http://purl.org/',
    'http://www.w3.org',
    'esri',  # do not use this because it could just be part of the route, but no worries today
    'soap'
]

# let's play with our new indexer
RANGE = 1000
for i, (sha_id, text_blob, simhash) in enumerate(all_identifiers[200:250]):
    if sum([1 for e in excludes if text_blob.lower().find(e) >= 0]) > 0:
        continue
    
    print sha_id, text_blob
    
    start_time = time.time()
    
    near_dupes = []
    
    for x in xrange(0, len(all_identifiers), RANGE):
        test_set = [d for d in all_identifiers[x:x+RANGE] if d[0] != sha_id]
        index = HashIndex(test_set, k=0)
        
        dupes = [d.split('|') for d in index.get_near_dups(simhash)]
    
        if len(dupes) > 0:
            # sha, text, distance
            near_dupes.extend([(d[0], d[1]) for d in dupes if int(d[2]) < 1])
    
    print '\t', near_dupes
    print '\t', time.time() - start_time
    print

58bf21a150362ad14dbb217f345ad1481fa34b5a0604c9b3d7c74c2d http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381
	[('173fea0b464c0bd9c0603e1affc7e0e6db61f09fae456912a8a0def4', 'http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381'), ('495736977b62856a96bb74e4803daf3e7f3ca5c16e4c8d477cbfab21', 'http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381'), ('8921c82c8e704615145c6b379cf015ec7dab78c533862d82267664d0', 'http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381'), ('198ab1deab4f4c7768524244c8d6e1fe30ef356b5b8a1d667663c867', 'http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381'), ('b8367ab289c358c4203d6e5f20fd835d2d9021b90a7068a22f1684eb', 'http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381'), ('39569f705136e60194b9a9f51118b18def697b13cee8c3b0e75be655', 'http://lod.data-archive.ac.uk/skoshasset/27ff1e6a-bb05-45e8-bcc9-4b42f5d4d381'), ('b2e

some urn uuid examples:

http://uaf.nodc.noaa.gov/geoportal/rest/find/document?max=5&f=atom&searchText=

cdata fun:

http://ngdc.noaa.gov/geoportal/rest/find/document?searchText=

In [9]:
import os
import json

with open('../testdata/solr_20150320/simhash_results_b.txt', 'r') as f:
    lines = f.readlines()

x = []
for line in lines[5:15]:
    data = json.loads(line.strip())
    x.append(data)

x


[{u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRX3ST8.006/2013/AIRS.2013.03.07.L3.RetStd008.v6.0.9.0.G13095171139.hdf.ncml': [u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRX3ST8.006/2013/AIRS.2013.03.07.L3.RetStd008.v6.0.9.0.G13095171139.hdf.ncml',
   u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRX3STD.005/2012/AIRS.2012.01.02.L3.RetStd001.v5.2.2.0.G12012154026.hdf.ncml',
   u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRS3STM.006/2013/AIRS.2013.02.01.L3.RetStd_IR028.v6.0.9.0.G13096014834.hdf.ncml.ddx',
   u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRX3ST8.006/2010/AIRS.2010.03.15.L3.RetStd008.v6.0.9.0.G13085172737.hdf.ncml',
   u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRX3ST8.006/2013/AIRS.2013.01.10.L3.RetStd008.v6.0.9.0.G13092175516.hdf.ncml.ddx',
   u'http://acdisc.sci.gsfc.nasa.gov/opendap/ncml/Aqua_AIRS_Level3/AIRX3ST8.006/2009/AIRS.2009.01.23.L3.RetStd

In [49]:
from simhash import Simhash

h1 = Simhash('https://pypi.python.org/pypi?:action=doap&name=lzmaffi&version=0.2.0')
h2 = Simhash('http://disc2.nascom.nasa.gov/opendap/TRMM_L3/TRMM_3A46/2002/152/3A46.020601.2.HDF.Z.rdf')
h3 = Simhash('http://mrdata.usgs.gov/wfs/agdb2?request=GetCapabilities&service=WFS&version=1.0.0')
h4 = Simhash('https://wiki.ucar.edu/opensearch/osd.action')

h1.distance(h4)

19