In [None]:
Does it get better or worse?

Presence of elements in ISO/FGDC:

- data quality
- data quality with lineage
- attribute definitions
- distribution information
- metadata reference section

Some of the above also include a word count extracted from specific elements. For data quality, the count comes from the quality descriptions, excluding lineage. The lineage word count is reported separately and is based on the process step descriptions. Attribute word counts are taken for FGDC only, from the main description plus any attribute descriptions.

In [1]:
%reload_ext autoreload
%autoreload 2

import json as js  # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import *
from sqlalchemy import and_
from semproc.xml_utils import *
from mpp.models import Response
from datetime import datetime
from lxml import etree
import os

In [2]:
# fgdc xpath sets for text extraction
# each entry is the list of element names, from the root's child
# down to the element holding the descriptive text

# for data quality sans lineage
fgdc_dq_xpaths = [
    ['dataqual', 'attracc', 'attraccr'],
    ['dataqual', 'attracc', 'qattracc', 'attracce'],
    ['dataqual', 'logic'],
    ['dataqual', 'complete'],
    ['dataqual', 'posacc', 'horizpa', 'horizpar'],
    ['dataqual', 'posacc', 'horizpa', 'qhorizpa', 'horizpae'],
    ['dataqual', 'posacc', 'vertacc', 'vertaccr'],
    # the quantitative vertical assessment container is qvertpa
    # (qhorizpa only exists under horizpa)
    ['dataqual', 'posacc', 'vertacc', 'qvertpa', 'vertacce']
]

# for data quality lineage
fgdc_lineage_xpaths = [
    ['dataqual', 'lineage', 'procstep', 'procdesc']
]

# for attributes
fgdc_attr_xpaths = [
    ['eainfo', 'overview'],
    ['eainfo', 'eadetcit'],
    ['eainfo', 'detailed', 'attr', 'attrdef'],
    ['eainfo', 'detailed', 'attr', 'attrlabl']
]

# for identifying number of distribution links vs offline resources
# xpath returns the number of elements
fgdc_distributions = [
    ('online_refs', 'count(distinfo/stdorder/digform/digtopt/onlinopt/computer/networka/networkr)'),
    ('offline_refs', 'count(distinfo/stdorder/digform/digtopt/offoptn/offmedia)'),
    ('nondigital_refs', 'count(distinfo/stdorder/nondig)')
]

# checks just for a decent existence
# mandatory doesn't mean they exist :/
fgdc_existences = [
    ('data_quality', 'dataqual/logic or dataqual/complete'),
    ('lineage', 'count(dataqual/lineage/procstep) > 0'),
    # attrdef/attrlabl sit under detailed/attr, matching fgdc_attr_xpaths
    ('attribute_ref', 'eainfo/detailed/attr/attrdef or eainfo/detailed/attr/attrlabl'),
    ('metadata_ref', 'count(metainfo/metstdn) > 0')  # just check for the standard name
]

In [8]:
# iso xpath sets
# each list entry is an element local name; '*' stands in for the
# root element

# for data quality sans lineage
# NOTE: starting with these, not sure the pattern report/{name}/*Description/CharacterString is viable
# NOTE(review): the third entry uses 'methodDescription' while the
# other two use 'evaluationMethodDescription' -- confirm which
# element was intended
iso_dq_xpaths = [
    ['*', 'dataQualityInfo', 'DQ_DataQuality', 'report', 'DQ_CompletenessOmission', 'evaluationMethodDescription', 'CharacterString'],
    ['*', 'dataQualityInfo', 'DQ_DataQuality', 'report', 'DQ_CompletenessCommission', 'evaluationMethodDescription', 'CharacterString'],
    ['*', 'dataQualityInfo', 'DQ_DataQuality', 'report', 'DQ_ConceptualConsistency', 'methodDescription', 'CharacterString']
]

# for data quality lineage
iso_lineage_xpaths = [
    ['*', 'dataQualityInfo', 'DQ_DataQuality', 'lineage', 'LI_Lineage', 'processStep', 'LI_ProcessStep', 'description', 'CharacterString']
]

# for attributes
# NOTE: intentionally empty, no attribute word counts are taken for ISO
iso_attr_xpaths = [
    
]

# xpath returns the number of online resource URLs
iso_distributions = [
    ('online_refs', 'count(//*/*[local-name()="MD_DigitalTransferOptions"]/*[local-name()="onLine"]/*[local-name()="CI_OnlineResource"]/*[local-name()="linkage"]/*[local-name()="URL"])')
]

# NOTE: not counting bands as attribute definitions here.
iso_existences = [
    ('data_quality', 'count(*[local-name()="dataQualityInfo"]/*[local-name()="DQ_DataQuality"]) > 0'),
    ('lineage', 'count(*[local-name()="dataQualityInfo"]/*[local-name()="DQ_DataQuality"]/*[local-name()="lineage"]/*/*/*[local-name()="LI_ProcessStep"]) > 0'),
    # the iso counterpart of fgdc's metainfo/metstdn: just check
    # for the standard name (the fgdc path never matches an iso record)
    ('metadata_ref', 'count(*[local-name()="metadataStandardName"]) > 0')
]

In [4]:
# return a dict of fq xpaths: text from one of our sets
# in this case, we aren't interested in element attributes
# or in iterating over each child, just elements where
# there's an expectation (based on cultural practices)
# of finding descriptive text.
def extract(xml, xpath):
    """Yield (path, text) pairs for the elements matched by one xpath set.

    `xml` and `xpath` are handed straight to extract_elems, which is
    not defined in this notebook (presumably it comes from
    semproc.xml_utils -- confirm). For every returned element that
    carries text, yields a tuple of the '/'-joined tag path from the
    root down to the element and the stripped text. Elements with no
    text, or only whitespace, are skipped.
    """
    for elem in extract_elems(xml, xpath):
        # strip first so whitespace-only elements count as empty
        text = (elem.text or '').strip()
        if not text:
            continue

        # xpath definition doesn't necessarily include
        # every elem name from parent, so return exact path
        yield ('/'.join(_taggify(elem)), text)

def _extract_tag(t):
    if not t:
        return
    return t.split('}')[-1]

def _taggify(e):
    """Return the namespace-free tag names from the root down to `e`.

    `e` needs a `tag` attribute and an `iterancestors()` method. If
    any tag can't be handled as a string the result is an empty list.
    """
    # iterancestors is reversed here to get root-first order
    # (assumes it walks from the parent outwards -- confirm)
    tags = [m.tag for m in e.iterancestors()]
    tags.reverse()
    tags.append(e.tag)

    try:
        return [_extract_tag(t) for t in tags]
    except (AttributeError, TypeError):
        # a tag that isn't a plain string (no usable split)
        return []
    
def convert_to_bag(arr):
    """Flatten (path, text) pairs into a list of whitespace-split tokens.

    Only the second item of each pair is used. No special handling
    of numbers, urns or anything else -- a token is whatever
    str.split() produces.
    """
    texts = (pair[1] for pair in arr)
    combined = ' '.join(texts)
    return combined.split()

def check_existence(xml, check):
    """Evaluate the xpath expression `check` against `xml`.

    Returns whatever xml.xpath() returns for the expression.
    """
    outcome = xml.xpath(check)
    return outcome

In [5]:
# read the postgres connection settings (a json file)
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# engine from the 'connection' entry, plus a session bound to it
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

In [6]:
# one page of ISO responses; the two %s slots take the page size
# (limit) and the starting row (offset), in that order.
# the order by pins the row order so consecutive limit/offset
# pages don't overlap or skip responses.
sketchy_sql = '''with i
as (
    select d.response_id, jsonb_array_elements(d.identity::jsonb) ident
    from identities d
    where d.identity is not null
)

select r.id, r.source_url, r.source_url_sha, r.cleaned_content, i.ident->'protocol' as protocol
from responses r join i on i.response_id = r.id
where i.ident->>'protocol' = 'ISO'
order by r.id
limit %s
offset %s;
'''

# where i.ident->>'protocol' = 'FGDC' or i.ident->>'protocol' = 'ISO'



In [11]:
# LIMIT=500
# for i in xrange(0, 46000, LIMIT):

# 26300 for fgdc
# 19700 for ISO

LIMIT = 500
ISO_TOTAL = 19700  # offsets to page through (the ISO count for this run)


def _bow_count(xml, xpath_sets):
    """Return the number of tokens in the text extracted from `xml`
    for every xpath definition in `xpath_sets`."""
    extracted = []
    for xp in xpath_sets:
        extracted.extend(extract(xml, xp))
    return len(convert_to_bag(extracted))


# page through the responses and write one metrics file per response
for i in range(0, ISO_TOTAL, LIMIT):
    sql = sketchy_sql % (LIMIT, i)
    result = session.execute(sql)
    for r in result:
        # an existing file means an earlier run handled this response
        out_path = 'outputs/metrics/%s.json' % r['id']
        if os.path.exists(out_path):
            continue

        try:
            xml = etree.fromstring(r['cleaned_content'].encode('utf-8'))
        except Exception:
            # best effort: note the response and move on. Exception
            # (not a bare except) so interrupting the kernel still works
            print('xml fail %s' % r['id'])
            continue

        # defaults; also what gets written for any other protocol
        metrics = {
            "data_quality": False,
            "data_quality_bow": 0,
            "lineage": False,
            "lineage_bow": 0,
            "attribute_ref": False,
            "attribute_bow": 0,
            "metadata_ref": False,
            "distribution": {}
        }

        if r['protocol'] == 'ISO':
            for ename, expath in iso_existences:
                metrics[ename] = check_existence(xml, expath)

            # data quality
            metrics['data_quality_bow'] = _bow_count(xml, iso_dq_xpaths)

            # dataqual lineage
            metrics['lineage_bow'] = _bow_count(xml, iso_lineage_xpaths)

            # count the kinds of distribution access points
            for dname, dxpath in iso_distributions:
                metrics['distribution'][dname] = check_existence(xml, dxpath)

            # no attribute metrics for ISO (iso_attr_xpaths is unused),
            # so drop the keys rather than report defaults
            del metrics['attribute_bow']
            del metrics['attribute_ref']

        elif r['protocol'] == 'FGDC':
            for ename, expath in fgdc_existences:
                metrics[ename] = check_existence(xml, expath)

            # data quality
            metrics['data_quality_bow'] = _bow_count(xml, fgdc_dq_xpaths)

            # dataqual lineage
            metrics['lineage_bow'] = _bow_count(xml, fgdc_lineage_xpaths)

            # eainfo
            metrics['attribute_bow'] = _bow_count(xml, fgdc_attr_xpaths)

            # count the kinds of distribution access points
            for dname, dxpath in fgdc_distributions:
                metrics['distribution'][dname] = check_existence(xml, dxpath)

        with open(out_path, 'w') as g:
            g.write(js.dumps(metrics, indent=4))

xml fail 145560
xml fail 183293
xml fail 196865
xml fail 219701
xml fail 223566
xml fail 252783
xml fail 307810
xml fail 351247
xml fail 402936
xml fail 453490
xml fail 503992
xml fail 539074
xml fail 563466
xml fail 576196
xml fail 653347
xml fail 667256
xml fail 721563
xml fail 722226


In [1]:
# load the metrics into the rds
import glob
import json as js
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy import (
    MetaData,
    Column,
    String,
    Integer,
    Boolean,
    DateTime,
)
from sqlalchemy.dialects.postgresql import *
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()


class Metric(Base):
    """ORM mapping for rows of the 'metadata_age_metrics' table."""
    __tablename__ = 'metadata_age_metrics'
    # integer primary key
    id = Column(Integer, primary_key=True)
    # the metrics dict read from an outputs/metrics/<response id>.json file
    completeness = Column(JSON)
    # id of the response the metrics belong to (taken from the file name)
    response_id = Column(Integer)
    
# read the postgres connection settings (a json file)
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# engine from the 'connection' entry, plus a session bound to it
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

# insert every metrics file that isn't in the table yet
files = glob.glob('outputs/metrics/*.json')
for f in files:
    # file names are '<response id>.json'
    response_id = f.split('/')[-1].replace('.json', '')

    # already loaded by an earlier run
    if session.query(Metric).filter(Metric.response_id==response_id).count() > 0:
        continue

    with open(f, 'r') as g:
        data = js.load(g)

    metric = Metric(
        response_id=response_id,
        completeness=data
    )
    try:
        session.add(metric)
        session.commit()
    except Exception as ex:
        # keep going with the remaining files, but say which
        # response was not stored instead of dropping it silently
        session.rollback()
        print('metric load fail %s: %s' % (response_id, ex))

In [None]:
# unfortunate lapse
# let's also grab the URLs/media definitions to
# say of the online (only) references, which are
# actually externally referencable

import json as js
import requests
from rfc3987 import parse as uparse
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy import (
    MetaData,
    Column,
    String,
    Integer,
    Boolean,
    DateTime,
)
from sqlalchemy.dialects.postgresql import *
from datetime import datetime
import os
from lxml import etree

# read the postgres connection settings (a json file)
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# engine from the 'connection' entry, plus a session bound to it
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

# one page of ISO or FGDC xml responses; the two %s slots take the
# page size (limit) and the starting row (offset), in that order.
# the order by pins the row order so consecutive limit/offset
# pages don't overlap or skip responses.
sketchy_sql = '''with i
as (
    select d.response_id, jsonb_array_elements(d.identity::jsonb) ident
    from identities d
    where d.identity is not null
)

select r.id, r.source_url, r.source_url_sha, r.cleaned_content, i.ident->'protocol' as protocol
from responses r join i on i.response_id = r.id
where (i.ident->>'protocol' = 'ISO' or i.ident->>'protocol' = 'FGDC') and r.format = 'xml'
order by r.id
limit %s
offset %s;
'''

# 26300 for fgdc
# 19700 for ISO

LIMIT = 500
END = 19700+26300
# (for a quick trial run, set END = 5 and LIMIT = 5)

# where the online reference URLs live, per protocol
ONLINE_REF_XPATHS = {
    'ISO': '//*/*[local-name()="MD_DigitalTransferOptions"]/*[local-name()="onLine"]/*[local-name()="CI_OnlineResource"]/*[local-name()="linkage"]/*[local-name()="URL"]',
    'FGDC': 'distinfo/stdorder/digform/digtopt/onlinopt/computer/networka/networkr'
}


def _check_online_ref(text):
    """Describe one online reference.

    Returns a dict with the url and the time of the check, plus
    either an 'error' string (not parseable as a URI, a file:// URI,
    or a failed HEAD request) or the 'status' code of the HEAD
    response.
    """
    ref = {
        "url": text,
        "checked": datetime.now().isoformat()
    }

    try:
        if uparse(text, rule='URI')['scheme'] == 'file':
            ref['error'] = 'file path'
            return ref
    except Exception:
        # it's not a valid scheme://location/path (http or otherwise)
        ref["error"] = 'probable local path'
        return ref

    # is it a valid URL and, you know, we're here so let's
    # just make a little HEAD request to ask
    try:
        rsp = requests.head(text, timeout=30)
    except Exception:
        # Exception rather than a bare except, so interrupting the
        # kernel during a slow request stops the run instead of
        # being recorded as a failed request
        ref["error"] = "HEAD request failed"
        return ref

    # just get the status code
    ref['status'] = rsp.status_code
    return ref


for i in range(0, END, LIMIT):
    sql = sketchy_sql % (LIMIT, i)
    result = session.execute(sql)
    for r in result:
        # an existing file means an earlier run handled this response
        out_path = 'outputs/online_refs/%s.json' % r['id']
        if os.path.exists(out_path):
            continue

        try:
            xml = etree.fromstring(r['cleaned_content'].encode('utf-8'))
        except Exception:
            print('xml fail %s' % r['id'])
            continue

        xp = ONLINE_REF_XPATHS.get(r['protocol'])
        if xp is None:
            # the query only asks for ISO and FGDC; skip anything
            # else rather than reuse the previous row's xpath
            continue

        refs = []
        for elem in xml.xpath(xp):
            text = elem.text
            if not text:
                continue

            refs.append(_check_online_ref(text.strip()))

        with open(out_path, 'w') as g:
            g.write(js.dumps(refs, indent=4))

In [13]:
from rfc3987 import parse as uparse

# rule names listed for rfc3987 (presumably the values accepted
# by the `rule` argument of parse -- confirm against its docs):
# ('IRI',
#  'absolute_IRI',
#  'irelative_ref',
#  'irelative_part',
#  'URI_reference',
#  'URI',
#  'absolute_URI',
#  'relative_ref',
#  'relative_part')

# quick look at what a well-formed http URL parses to under the 'URI' rule
uparse('http://www.someinth.com/f', rule='URI')



{'authority': 'www.someinth.com',
 'fragment': None,
 'path': '/f',
 'query': None,
 'scheme': 'http'}