#### OGC Element vs Token counts

Unlike the other metadata token/element evaluations, we're going to be fairly explicit about the expected text locations here.

Does the service description section contain text?

Totals for the layers are not very meaningful: these are automatically generated responses, so we can expect the layers to conform to the same structure throughout a GetCapabilities response. This should hold within each response and, most likely, across the set of GetCapabilities responses per service and server.

In [22]:
%reload_ext autoreload
%autoreload 2

import json as js  # name conflict with sqla
import sqlalchemy as sqla
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import *
from sqlalchemy import and_
from semproc.xml_utils import *
from semproc.parser import Parser
from mpp.models import Response
from datetime import datetime
# from lxml import etree

In [23]:
# read the postgres connection settings (a JSON document) from disk
with open('../local/big_rds.conf', 'r') as conf_file:
    conf = js.load(conf_file)

# engine built from the configured connection string, plus one
# session bound to it for the rest of the notebook
engine = sqla.create_engine(conf.get('connection'))
Session = sessionmaker(bind=engine)
session = Session()

In [24]:
# The token sets: for each service-level element (the dict key), the
# paths of the child elements/attributes expected to carry text.
# Every path is resolved relative to the element named by the key.
#
# Multi-step paths are tuples (order matters), single-step paths are
# one-item lists. A last step starting with '@' names an attribute.
paths = {
    # WMS-style service description
    "Service": [
        ['Name'],
        ['Title'],
        ['Abstract'],
        ('KeywordList', 'Keyword'),
        ('OnlineResource', '@href'),
        ('ContactInformation', 'ContactPersonPrimary', 'ContactPerson'),
        ('ContactInformation', 'ContactPersonPrimary', 'ContactOrganization'),
        ('ContactInformation', 'ContactPosition'),
        ('ContactInformation', 'ContactAddress', 'AddressType'),
        ('ContactInformation', 'ContactAddress', 'Address'),
        ('ContactInformation', 'ContactAddress', 'City'),
        ('ContactInformation', 'ContactAddress', 'StateOrProvince'),
        ('ContactInformation', 'ContactAddress', 'PostCode'),
        ('ContactInformation', 'ContactAddress', 'Country'),
        ('ContactInformation', 'ContactVoiceTelephone'),
        ('ContactInformation', 'ContactFacsimileTelephone'),
        ('ContactInformation', 'ContactElectronicMailAddress'),
        ['Fees'],
        ['AccessConstraints']
    ],
    # OWS Common service identification
    "ServiceIdentification": [
        ['Title'],
        ['Abstract'],
        # OWS Common wraps keywords in Keywords (KeywordList is the WMS
        # Service form), so the old KeywordList path could never match
        ('Keywords', 'Keyword'),
        ['ServiceType'],
        ['ServiceTypeVersion'],
        ['Fees'],
        ['AccessConstraints']
    ],
    # need to run service provider for the ows versions
    'ServiceProvider': [
        ['ProviderName'],
        ('ServiceContact', 'IndividualName'),
        ('ServiceContact', 'PositionName'),
        ('ServiceContact', 'ContactInfo', 'Phone', 'Voice'),
        ('ServiceContact', 'ContactInfo', 'Phone', 'Facsimile'),
        ('ServiceContact', 'ContactInfo', 'Address', 'DeliveryPoint'),
        ('ServiceContact', 'ContactInfo', 'Address', 'City'),
        ('ServiceContact', 'ContactInfo', 'Address', 'AdministrativeArea'),
        ('ServiceContact', 'ContactInfo', 'Address', 'PostalCode'),
        ('ServiceContact', 'ContactInfo', 'Address', 'Country'),
        ('ServiceContact', 'ContactInfo', 'Address', 'ElectronicMailAddress'),
        ('ServiceContact', 'ContactInfo', 'OnlineResource', '@href'),
        ('ServiceContact', 'ContactInfo', 'HoursOfService'),
        ('ServiceContact', 'ContactInfo', 'ContactInstructions'),
        # Role is a child of ServiceContact in OWS Common, not of
        # ServiceProvider itself
        ('ServiceContact', 'Role')
    ]
}

# Per-layer token paths, keyed by the layer-level element name; each
# path is resolved relative to that element.
layer_paths = dict(
    Layer=[
        ['Title'],
        ['Name'],
        ['Abstract'],
        ['KeywordList', 'Keyword'],
        ['Identifier'],
    ],
    FeatureType=[
        ['Name'],
        ['Title'],
        ['Abstract'],
        ['Keywords', 'Keyword'],
        ['Identifier'],
    ],
    CoverageSummary=[
        ['Title'],
        ['Abstract'],
        ['Keywords', 'Keyword'],
        ['Identifier'],
    ],
)

In [25]:
# Query for the responses to evaluate.
# The CTE `i` expands each identities.identity JSON array and keeps the
# response ids having an element whose 'protocol' is 'OGC'; the outer
# select returns id, source_url_sha, source_url and cleaned_content of
# the matching responses rows.
# NOTE(review): the CTE's `ident` column is not used by the outer select,
# and a response with more than one OGC identity element would be
# returned once per element -- confirm whether that can happen.
sql = """
with i as (
	select d.response_id, (e.value->'protocol')::text as ident
	from identities d, jsonb_array_elements(d.identity::jsonb) e
	where d.identity is not null 
		and e.value->>'protocol' = 'OGC'
)
select r.id, r.source_url_sha, r.source_url, r.cleaned_content
from responses r join i on i.response_id = r.id
;
"""

In [26]:
def _extract_tag(t):
    if not t:
        return
    return t.split('}')[-1]

def _taggify(e, x):
    """Build the slash-delimited, namespace-free tag path for a token path.

    e: the element the token path starts from; must expose `.tag` and
        `.iterancestors()`.
    x: the token path, a sequence of tag names relative to e.

    Returns the ancestors of e (outermost first), then e itself, then
    the steps of x, each passed through _extract_tag and joined with
    '/'. Returns '' when any step cannot be reduced to a tag name.
    """
    # the original reversed the ancestor list after collecting it, i.e.
    # iterancestors() is taken to run from the parent outwards; flip it
    # so the path reads outermost-first
    tags = [m.tag for m in e.iterancestors()]
    tags.reverse()
    tags.append(e.tag)
    tags.extend(x)

    try:
        return '/'.join([_extract_tag(t) for t in tags])
    except (TypeError, AttributeError):
        # TypeError: _extract_tag gave None for an empty tag, which
        # str.join rejects; AttributeError: a tag that is not a string.
        # Anything else (including KeyboardInterrupt) now propagates.
        return ''

def strip_text(elem, arr):
    """Yield (tag path, joined text) pairs for the token paths found in elem.

    elem: the element the paths in arr are resolved against.
    arr: iterable of token paths (lists/tuples of tag names). A path
        whose last step contains '@' is read with extract_attrib, any
        other path with extract_items.

    Yields one tuple per path that produced text: the tag path built by
    _taggify and the values joined with ' | '. Paths with no non-empty
    value are skipped.
    """
    for a in arr:
        if '@' in a[-1]:
            texts = [extract_attrib(elem, a)]
        else:
            texts = extract_items(elem, a)

        # extract_attrib / extract_items are defined outside this file.
        # The attribute branch always produced a one-item list, so a
        # missing attribute (None or '' -- TODO confirm which) used to
        # slip past the emptiness check and either break the join or be
        # reported as found text. Keep only real values.
        texts = [t for t in texts or [] if t]
        if not texts:
            continue

        yield _taggify(elem, a), ' | '.join(texts)

In [27]:
# Walk every OGC response, pull the text found at the expected service
# paths and write one JSON file per response that yielded anything.

# last parsed document, left at module level for inspection afterwards
x = ''

for i, sha, url, content in session.execute(sql):
    try:
        parser = Parser(content.encode('utf-8'))
    except Exception:
        # best effort: skip responses without content or that fail to
        # parse, but no longer swallow KeyboardInterrupt/SystemExit so
        # the run can still be interrupted
        continue

    x = parser.xml

    blobs = {
        "keys": [],
        "extracted": []
    }
    for child in parser.xml:
        key = _extract_tag(child.tag)
        if key not in paths:
            continue

        elem = extract_elem(parser.xml, [key])

        extracted = [
            {"tag": tag, "values": texts}
            for tag, texts in strip_text(elem, paths[key])
        ]
        if extracted:
            blobs['keys'].append(key)
            blobs['extracted'] += extracted

    if not blobs['extracted']:
        # nothing found under any known service element
        continue

    # expected number of paths: the two OWS sections together when
    # either one was seen, otherwise the single Service section
    keys = blobs['keys']
    if 'ServiceIdentification' in keys or 'ServiceProvider' in keys:
        total = len(paths.get('ServiceIdentification', [])) + \
            len(paths.get('ServiceProvider', []))
    else:
        total = len(paths.get('Service', []))

    data = {
        "url": url,
        "total": total,
        "blobs": blobs['extracted']
    }
    # output file is named after the response id
    with open('outputs/ogc/%s.json' % i, 'w') as g:
        g.write(js.dumps(data, indent=4))


In [28]:
import glob
from sqlalchemy import (
    MetaData,
    Column,
    String,
    Integer,
    Boolean,
    DateTime,
)
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

class Ogc(Base):
    """Declarative model for rows of the 'ogc_tokens' table."""
    __tablename__ = 'ogc_tokens'
    # primary key
    id = Column(Integer, primary_key=True)
    # JSON column; the loader below stores the 'blobs' list read from
    # each output file here
    tokens = Column(JSON)
    # integer column; the loader stores the output file's 'total' here
    expected_total = Column(Integer)
    # integer column; the loader fills it from the output file's name
    # (no foreign key is declared on it)
    response_id = Column(Integer)

import os

# Load every extracted-token file into the ogc_tokens table, one row
# (and one commit) per file.
files = glob.glob('outputs/ogc/*.json')

for f in files:
    with open(f, 'r') as g:
        data = js.load(g)

    # files are named <response id>.json
    r = os.path.splitext(os.path.basename(f))[0]
    try:
        response_id = int(r)
    except ValueError:
        # not one of the per-response files; response_id is an Integer
        # column, so there is nothing sensible to store
        print('skipping %s: file name is not a response id' % f)
        continue

    o = Ogc(
        response_id=response_id,
        tokens=data.get('blobs', []),
        expected_total=data.get('total')
    )

    try:
        session.add(o)
        session.commit()
    except Exception as ex:
        # keep going with the remaining files, but report the failure
        # instead of dropping the row silently
        session.rollback()
        print('failed to store %s: %s' % (f, ex))
