### URL Context

Extract the codeListValue, or other possibly identifying information, for CI_OnlineResource elements: where each one appears and what it contains. The aim is simply to note what kind of text is used, from which servers, and in which kinds of sections.

In [1]:
%reload_ext autoreload
%autoreload 2

import json as js  # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker

from mpp.models import Response
from semproc.xml_utils import *
from semproc.utils import tidy_dict
from lxml import etree
import os


In [2]:
# the connection settings for the rds are kept in a local JSON config file
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# bind a session factory to the engine and open the session
# that the queries in this notebook run through
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

In [3]:
def _extract_tag(t):
    if not t:
        return
    return t.split('}')[-1]

def _taggify(e):
    """Return the namespace-stripped tag names on the path to `e`.

    `e` must expose `.tag` and `.iterancestors()`. The element's own tag
    and its ancestors' tags are collected element-first and then reversed,
    so the element's own tag ends up last in the returned list.

    Returns an empty list when a tag on the path cannot be split as a
    string.
    """
    tags = [e.tag] + [m.tag for m in e.iterancestors()]
    tags.reverse()

    try:
        return [_extract_tag(t) for t in tags]
    except (AttributeError, TypeError):
        # a tag that is not a plain string has no usable .split(); only
        # that failure is treated as "no path" -- a bare except here
        # would also have swallowed KeyboardInterrupt and SystemExit
        return []

In [4]:
# get any iso
# The two %s placeholders are filled with (limit, offset) to page through
# the matching responses. Paging with limit/offset only visits every row
# exactly once if the rows come back in a stable order, so the query is
# ordered by the response id; without an ORDER BY the database is free to
# return overlapping or missing rows from one page to the next.
sql_pttn = """
with i
as (
    select d.response_id, trim(both '"' from (e.value->'protocol')::text) as protocol
    from identities d, jsonb_array_elements(d.identity::jsonb) e
    where d.identity is not null
        and e.value->>'protocol' = 'ISO'
)

select r.id, r.source_url, r.cleaned_content
from responses r join i on i.response_id = r.id
order by r.id
limit %s
offset %s;
"""

In [5]:
# every CI_OnlineResource element, whatever its namespace prefix
xp = "//*[local-name()='CI_OnlineResource']"

# paging window over the ISO responses: offsets START..STOP in steps of LIMIT
START = 0
STOP = 20000
LIMIT = 100

# for a quick trial run, shrink the window instead:
# START = 0
# STOP = 20
# LIMIT = 10

# one JSON file per response is written here; create the directory up
# front so the first write cannot fail on a missing folder
OUTPUT_DIR = os.path.join('outputs', 'online_resources')
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


def _describe_online(online):
    """Collect the identifying fields of one CI_OnlineResource element.

    Gathers the element's tag path, its url, protocol, application
    profile, name and description, and the function code with its
    codelist, then passes the dict through tidy_dict (presumably this
    drops the empty entries -- confirm in semproc.utils).
    """
    return tidy_dict({
        "url": extract_item(online, ['linkage', 'URL']),
        "tag": '/'.join(_taggify(online)),
        "protocol": extract_item(online, ['protocol', 'CharacterString']),
        "application_profile": extract_item(online, ['applicationProfile', 'CharacterString']),
        "name": extract_item(online, ['name', 'CharacterString']),
        "description": extract_item(online, ['description', 'CharacterString']),
        "function_code": extract_attrib(
            online, ['function', 'CI_OnLineFunctionCode', '@codeListValue']),
        "function_codelist": extract_attrib(
            online, ['function', 'CI_OnLineFunctionCode', '@codeList'])
    })


for i in xrange(START, STOP, LIMIT):
    sql = sql_pttn % (LIMIT, i)

    for response_id, source_url, cleaned_content in session.execute(sql):
        # build the path once so the "already done" check and the write
        # below can never disagree about the file name
        output_path = os.path.join(OUTPUT_DIR, '%s.json' % response_id)
        if os.path.exists(output_path):
            continue

        try:
            xml = etree.fromstring(cleaned_content.encode('utf-8'))
        except Exception:
            # stay best-effort for unparseable or missing content, but let
            # KeyboardInterrupt/SystemExit through so the run can be
            # stopped (a bare except reported an interrupt as a parse
            # failure and carried on)
            print('failed to parse %s %s' % (response_id, source_url))
            continue

        outputs = [_describe_online(online) for online in xml.xpath(xp)]

        # responses without any online resource get no file at all
        if outputs:
            with open(output_path, 'w') as f:
                f.write(js.dumps(outputs, indent=4))

failed to parse 145560 http://opentopo.sdsc.edu/LidarPortlet/jsp/datasetMetadata.jsp?otCollectionID=OT.052013.26912.2&format=xml
failed to parse 183293 http://portal.oceannet.org/search/full/catalogue/ccw.gov.uk__MEDIN_2.3__4f4c4942-4343-5764-6473-313135363336.xml/MEDIN_2.3
failed to parse 196865 http://portal.oceannet.org/search/full/catalogue/ccw.gov.uk__MEDIN_2.3__4f4c4942-4343-5764-6473-313131373933.xml/MEDIN_2.3
failed to parse 219701 http://opentopo.sdsc.edu/LidarPortlet/jsp/datasetMetadata.jsp?otCollectionID=OT.092014.26913.1&format=xml
failed to parse 223566 http://opentopo.sdsc.edu/LidarPortlet/jsp/datasetMetadata.jsp?otCollectionID=OT.112012.26910.1&format=xml
failed to parse 252783 http://opentopo.sdsc.edu/LidarPortlet/jsp/datasetMetadata.jsp?otCollectionID=OT.042013.26911.2&format=xml
failed to parse 307810 http://portal.oceannet.org/search/full/catalogue/ccw.gov.uk__MEDIN_2.3__4f4c4942-4343-5764-6473-313039383135.xml/MEDIN_2.3
failed to parse 351247 http://portal.oceannet.

In [7]:
import glob


def _csv_quote(value):
    """Wrap `value` in double quotes for the hand-built CSV rows.

    Surrounding whitespace is stripped, the text is passed through the
    unicode_escape codec so the result is plain ASCII on one line, and
    embedded double quotes are swapped for single quotes so they cannot
    end the field early.
    """
    if value is None:
        # a resource without this key used to be written as the literal
        # "None"; that rendering is kept so the file format is unchanged
        value = u'None'
    return '"{0}"'.format(value.strip().encode('unicode_escape').replace('"', '\''))


files = glob.glob('outputs/online_resources/*.json')

# # ONLY HALF HIT THE DB - check on encoding, etc.
# with open("inputs/loaded_iso_rscs.csv", 'r') as f:
#     ids = [a.strip() for a in f.readlines() if a]

# one CSV row per (resource, key) pair; tag and url are repeated on each
# row rather than emitted as rows of their own
items = []
for path in files:
    # the files are named <response_id>.json
    response_id = int(os.path.splitext(os.path.basename(path))[0])
    with open(path, 'r') as g:
        data = js.load(g)

    for resource in data:
        # tags and url get the same escaping as the values: unescaped, a
        # double quote in a url broke the row and a non-ASCII url raised
        # UnicodeEncodeError inside str.format
        tags = _csv_quote(resource.get('tag'))
        link = _csv_quote(resource.get('url'))
        for k, v in resource.iteritems():
            if k in ['tag', 'url']:
                continue

            items.append(
                ','.join([str(response_id), tags, k.strip(), _csv_quote(v), link])
            )

# header and rows are written in a single pass
with open('outputs/iso_resources.csv', 'w') as g:
    g.write('response_id,tags,element_key,element_value,url\n')
    g.write('\n'.join(items))

In [14]:
from collections import defaultdict

# count the CSV rows per response; the response id is the text before
# the first comma of each row
d = defaultdict(int)
for row in items:
    d[row.partition(',')[0]] += 1

In [15]:
# number of distinct response ids that contributed at least one row
len(d)

11587

So only a little more than half had results. 