quick checks of the new fgdc triples structure
and maybe the triples (yes the triples, whatevs)

In [32]:
%reload_ext autoreload
%autoreload 2

import os
import json
import glob
from semproc.parser import Parser
from semproc.preprocessors.metadata_preprocessors import FgdcItemReader
from semproc.serializers.rdfgraphs import RdfGrapher

In [33]:
# load the proto-triples example
with open('../response_examples/fgdc_proto_example_1.xml', 'r') as f:
    response = f.read()

# this shouldn't be necessary but cargo-culting here is fine by me.
response = response.replace('\\\n', '').replace('\r\n', '').replace('\\r', '').replace('\\n', '').replace('\n', '')
response = response.decode('utf-8', errors='replace').encode('unicode_escape') 
    
url = 'https://bluehub.jrc.ec.europa.eu/erddap/metadata/fgdc/xml/noaa_pfeg_d543_8870_bc7f_fgdc.xml'
identities = [
    {"protocol": "FGDC", 
     "metadata": {
            "version": ["FGDC Content Standards for Digital Geospatial Metadata, FGDC-STD-001-1998"], 
            "name": "FGDC"}
    }
]

parser = Parser(response)

In [34]:
# execute the parse (this one takes xml)
reader = FgdcItemReader(parser.xml, url, {'harvest_date': '2015-06-20T20:22:00.643Z'})
reader.parse_item()

In [35]:
reader.description

{'catalog_record': {'bcube:dateCreated': '2015-06-20T20:22:00.643Z',
  'bcube:lastUpdated': '2015-06-20T20:22:00.643Z',
  'dc:conformsTo': ['http://www.ngdc.noaa.gov/metadata/published/xsd/ngdcSchema/schema.xsd'],
  'object_id': 'urn:sha:f65fb3d1efeee860adbbb53d3a20e80e1f50fe625e4887155f196a92',
  'relationships': [{'object_id': 'urn:uuid:f6bb0d78-b3bc-406f-a8b7-58b974b5a25a',
    'relate': 'bcube:originatedFrom'},
   {'object_id': 'urn:sha:d5a3a66150264cfeca19322b996a1cbf202b15966e0f1e41b99ae991',
    'relate': 'foaf:primaryTopic'}],
  'urls': [{'bcube:HTTPStatusCodeValue': 200,
    'bcube:HTTPStatusFamilyCode': 200,
    'bcube:HTTPStatusFamilyType': 'Success message',
    'bcube:atTime': '2015-06-20T20:22:00.643Z',
    'bcube:hasConfidence': 'Good',
    'bcube:hasUrlSource': 'Harvested',
    'bcube:reasonPhrase': 'OK',
    'bcube:validatedOn': '2015-06-20T20:22:00.643Z',
    'object_id': 'urn:uuid:f6bb0d78-b3bc-406f-a8b7-58b974b5a25a',
    'vcard:hasUrl': 'https://bluehub.jrc.ec.euro

In [36]:
g = RdfGrapher(reader.description)
g.serialize()
ttl = g.emit_format()
print ttl

@prefix bcube: <http://purl.org/BCube/#> .
@prefix bibo: <http://purl.org/ontology/bibo/#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dcat: <http://www.w3.org/TR/vocab-dcat/#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix esip: <http://purl.org/esip/#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix prov: <http://purl.org/net/provenance/ns#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix vcard: <http://www.w3.org/TR/vcard-rdf/#> .
@prefix vivo: <http://vivo.ufl.edu/ontology/vivo-ufl/#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<urn:sha:8df1983f1f9573be5059558fd20d718d9550720c873eb8930cdc6735> bcube:HTTPStatusCodeValue 200 ;
    bcube:HTTPStatusFamilyCode 200 ;
    bcube:HTTPStatusFamilyType "Success message" ;
    bcube:atTime "2015-06-20T20:22:00.643Z" ;
    bc

In [120]:
# let's play with triples

import rdflib
import hashlib
from uuid import uuid4
from rdflib import Graph, Literal, RDF, RDFS, Namespace, URIRef
from rdflib.namespace import DC, DCTERMS, FOAF, XSD, OWL


class Grapher():
    def __init__(self):
        self.graph = Graph()
        self._bind_namespaces()
    
    # some faked namespaces
    _ontology_uris = {
        'bcube': 'http://purl.org/BCube/#',
        'vcard': 'http://www.w3.org/TR/vcard-rdf/#',
        'esip': 'http://purl.org/esip/#',
        'vivo': 'http://vivo.ufl.edu/ontology/vivo-ufl/#',
        'bibo': 'http://purl.org/ontology/bibo/#',
        'dcat': 'http://www.w3.org/TR/vocab-dcat/#',
        'dc': str(DC),
        'dct': str(DCTERMS),
        'foaf': str(FOAF),
        'xsd': str(XSD),
        'owl': str(OWL)
    }
    
    def _bind_namespaces(self):
        # bind our lovely fake namespaces
        for prefix, uri in self._ontology_uris.iteritems():
            self.graph.bind(prefix, uri)

    def generate_predicate(self, prefix, name):
        return Namespace(self._ontology_uris[prefix])[name]

    def identify_prefix(self, predicate):
        # this is, granted, a lesson in technical debt.
        debt = {
            "dc": ["description", "conformsTo", "relation"],
            "dcat": ["publisher"],
            "foaf": ["primaryTopic"]
        }
        
        for k, v in debt.iteritems():
            if predicate in v:
                return k
        return ''
    
    def _stringify(self, text):
        return json.dumps(text)
            
    def create_resource(self, resource_prefix, resource_type, identifier=''):
        # make a thing with a uuid as a urn
        # and just assign it to type if it's not overridden
        identifier = identifier if identifier else uuid4().urn
        resource = self.graph.resource(identifier)
        ref = Namespace(self._ontology_uris[resource_prefix])[resource_type]
        resource.add(OWL.a, URIRef(ref))
        return resource

    def _process_catalog(self, entity):
        catalog_record = self.create_resource('dcat', 'CatalogRecord', entity['object_id'])
        catalog_record.add(self.generate_predicate('vcard', 'hasURL'), Literal(entity['url']))
        catalog_record.add(self.generate_predicate('vivo', 'harvestDate'), Literal(entity['harvestDate']))
        if entity['conformsTo']:
            catalog_record.add(DC.conformsTo, Literal(entity['conformsTo']))
        
        for relationship in entity['relationships']:
            # so. current object, verb, id of object, existence unknown
            self.relates.append((catalog_record, relationship['relate'], relationship['object_id']))
            
    def _process_dataset(self, entity):
        dataset = self.create_resource('dcat', 'Dataset', entity['object_id'])
        if entity['identifier']:
            dataset.add(DCTERMS.identifier, Literal(entity['identifier']))
        dataset.add(DCTERMS.title, Literal(self._stringify(entity['title'])))
        dataset.add(DC.description, Literal(self._stringify(entity['abstract'])))
        
        if 'temporal_extent' in entity:
            # NOTE: make these iso 8601 first
            begdate = entity['temporal_extent'].get('startDate')
            enddate = entity['temporal_extent'].get('endDate')

            dataset.add(self.generate_predicate('esip', 'startDate'), Literal(begdate, datatype=XSD.date))
            dataset.add(self.generate_predicate('esip', 'endDate'), Literal(enddate, datatype=XSD.date))

        if 'spatial_extent' in entity:
            dataset.add(DC.spatial, Literal(entity['spatial_extent']['wkt']))

            # a small not good thing.
            dataset.add(self.generate_predicate('esip', 'westBound'),
                        Literal(float(entity['spatial_extent']['west']), datatype=XSD.float))

            dataset.add(self.generate_predicate('esip', 'eastBound'),
                        Literal(float(entity['spatial_extent']['east']), datatype=XSD.float))

            dataset.add(self.generate_predicate('esip', 'southBound'),
                        Literal(float(entity['spatial_extent']['south']), datatype=XSD.float))

            dataset.add(self.generate_predicate('esip', 'northBound'),
                        Literal(float(entity['spatial_extent']['north']), datatype=XSD.float))
        
        for relationship in entity['relationships']:
            self.relates.append((dataset, relationship['relate'], relationship['object_id']))
        
    def _process_keywords(self, entity):
        for keywords in entity:
            keyset = self.create_resource('bcube', 'thesaurusSubset', keywords['object_id'])
            if 'type' in keywords:
                keyset.add(DC.hasType, Literal(self._stringify(keywords['type'])))
            if 'thesaurus' in keywords:
                keyset.add(DC.partOf, Literal(self._stringify(keywords['thesaurus'])))

            try:
                for term in keywords['terms']:
                    keyset.add(self.generate_predicate('bcube', 'hasValue'), Literal(self._stringify(term)))
            except:
                print keywords
        
    def _process_publisher(self, entity):
        publisher = self.create_resource('dcat', 'publisher', entity['object_id'])
        publisher.add(DC.location, Literal(self._stringify(entity['location'])))
        publisher.add(FOAF.name, Literal(self._stringify(entity['name'])))
        
    def _process_webpages(self, entity):
        for webpage in entity:
            relation = self.create_resource('bibo', 'WebPage', webpage['object_id'])
            relation.add(self.generate_predicate('vcard', 'hasURL'), Literal(webpage['url']))
    
    def serialize(self):
        return self.graph.serialize(format='turtle')
    
    def graphalize(self, doc):
        # not a word
        # so from our json.
        
        # this. is. idk. an ordering thing. i suspect the graph borks
        # when you try to add a triple for a non-existent object.
        # years of experience. also, i am wrong - it will add anything.
        self.relates = []
        for entity_type, entity in doc.iteritems():
            if entity_type == 'catalog_record':
                self._process_catalog(entity)
            elif entity_type == 'dataset':
                self._process_dataset(entity)
            elif entity_type == 'publisher':
                self._process_publisher(entity)
            elif entity_type == 'keywords':
                self._process_keywords(entity)
            elif entity_type == 'webpages':
                self._process_webpages(entity)
            else:
                continue
        
        for resource, verb, object_id in self.relates:
            resource.add(
                self.generate_predicate(
                    self.identify_prefix(verb), verb),
                URIRef(object_id)
            )
        

grapher = Grapher()    
grapher.graphalize(description)
print grapher.serialize()  


@prefix bcube: <http://purl.org/BCube/#> .
@prefix bibo: <http://purl.org/ontology/bibo/#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dcat: <http://www.w3.org/TR/vocab-dcat/#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix esip: <http://purl.org/esip/#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix vcard: <http://www.w3.org/TR/vcard-rdf/#> .
@prefix vivo: <http://vivo.ufl.edu/ontology/vivo-ufl/#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<urn:sha:80655e27d685a3fa6e4092f0c0b53871e1d245015340f39d942faf9b> vivo:harvestDate "2014-10-29T23:09:48.26Z" ;
    owl:a dcat:CatalogRecord ;
    vcard:hasURL "http://www.usgs.gov/science/xml.php?cite=1449" ;
    foaf:primaryTopic <urn:uuid:18ce2260-f8c9-4e01-94a1-87268f105071> .

<urn:sha:dc71

#### generating local graphs for any fgdc response

i would note that we are not catching every fgdc through the identifier - not quite valid/complete responses are being dropped. this generates roughly three thousand graphs.

In [111]:
# so let's make a new temp class for the harvest + parser parts
# and this is not what we really want to do. probably.

class Fgdc():
    def __init__(self, doc):
        self.url = doc.get('url', '')
        self.response = self._prep_response(doc.get('raw_content', ''))
        self.harvested = doc.get('tstamp', '')  # except i did not carry this through in the clean task
        self._prep_parser()
        
    def _prep_parser(self):
        parser = Parser(self.response)
        self.reader = FgdcItemReader(parser.xml, self.url, self.harvested)
    
    def _prep_response(self, response):
        # this shouldn't be necessary but cargo-culting here is fine by me.
        # response = response.encode('unicode_escape')
        response = response.replace('\\\n', '').replace('\r\n', '').replace('\\r', '').replace('\\n', '').replace('\n', '')
        try:
            # the idiocy
            return response.decode('utf-8', 'replace').encode('unicode_escape')
        except:
            return response.encode('unicode_escape', 'replace')
        
    def parse(self):
        return self.reader.parse_item()

In [121]:
import csv

docs_dir = '/Users/sscott/Documents/working_bits/solr_20150707/docs'
triples_dir = '/Users/sscott/Documents/tmp/triples/'

with open('data/small_harvest_fgdc_as_md5.csv', 'rb') as csvfile:
    cr = csv.DictReader(csvfile, delimiter="|")

    for row in cr:      
        url = row['source_url'].strip().replace('"', '')
        md5 = row['raw_content_md5'].strip().replace('"', '')
        
        # go open the cleaned up version by md5
        filepath = os.path.join(docs_dir, md5 + '.json')
        if not os.path.exists(filepath):
            continue
        
        with open(filepath, 'r') as f:
            data = json.loads(f.read())
        
        try:
            fgdc = Fgdc(data)
            description = fgdc.parse()
        except:
            print 'parse error: ', row
        
        try:
            grapher = Grapher()
            grapher.graphalize(description)
            ttl = grapher.serialize()
        
            with open(os.path.join(triples_dir, md5 + '.ttl'), 'w') as f:
                f.write(ttl)
        except:
            print 'triples error: ', row

parse error:  {'raw_content_md5': '005f630392b3754e7a67034da04ea0bf', 'source_url': 'http://data.denvergov.org/download/gis/instream_sampling_sites/metadata/instream_sampling_sites.xml'}
parse error:  {'raw_content_md5': '0132063c3171520ef9a840523cdb4b6a', 'source_url': 'http://data.denvergov.org/download/gis/curb_ramps/metadata/curb_ramps.xml'}
parse error:  {'raw_content_md5': '03a5101c9b56ac15931f493128e71f8f', 'source_url': 'http://pubs.usgs.gov/ds/389/data_files/be/be_e772_n3300_15/be_e772_n3300_15.las.xml'}
parse error:  {'raw_content_md5': '03d55e4521e1e0dfc37b7f535ad73b62', 'source_url': 'http://sero.nmfs.noaa.gov/maps_gis_data/protected_resources/critical_habitat/geodata/johnsonsseagrass_critical_habitat_fgdc.xml'}
parse error:  {'raw_content_md5': '044c17a87896df27b5c9288dbc5ce9ed', 'source_url': 'http://pubs.usgs.gov/ds/389/data_files/fs/fs_e774_n3308_15/fs_e774_n3308_15.las.xml'}
parse error:  {'raw_content_md5': '04d562c7da9288c8914d0012b7922c0e', 'source_url': 'http://dat

In [109]:
## tracking down an invalid cdata tag in some invalid fgdc

from lxml import etree
from semproc.xml_utils import extract_items
from semproc.parser import Parser

with open('data/wonky_fgdc.xml', 'r') as f:
    orig_response = f.read()

# response = response.encode('unicode_escape')
response = orig_response.replace('\\\n', '').replace('\r\n', '').replace('\\r', '').replace('\\n', '').replace('\n', '')
response = response.decode('utf-8', errors='replace').encode('unicode_escape') 

parser = Parser(response)

xml = etree.fromstring(response)

extract_items(parser.xml, ['idinfo', 'keywords', 'theme', 'themekey'])

['Atmosphere > Air Quality > Emissions',
 'Human Dimensions > Environmental Impacts',
 'Atmosphere > Air Quality > Carbon Monoxide',
 'Atmosphere > Air Quality > Nitrogen Oxides',
 'Atmosphere > Air Quality > Sulfur Oxides']

In [108]:

filepath = os.path.join(docs_dir, '004cd8a41674143268af65d47043bfc3.json')
with open(filepath, 'r') as g:
    gd = json.loads(g.read())
orig_response = gd['raw_content']

fgdc = Fgdc({
        "url": "http://catalog.data.gov/harvest/object/0e11e207-c223-4953-85be-5b5f9a5132d2/original", 
        "raw_content": orig_response,
        "tstamp": "2015-07-08"
    })
#description = fgdc.parse()
# fgdc.parse()
r = fgdc._prep_response(orig_response)

parser = etree.XMLParser(
            remove_blank_text=True,
            remove_comments=True,
            #recover=True,
            remove_pis=True,
            ns_clean=True
        )

xml = etree.fromstring(r, parser=parser)

etree.tostring(xml)

XMLSyntaxError: CData section not finished
Atmosphere > Air Quality > Emissions</themekey>   , line 1, column 9577