quick checks of the new fgdc triples structure
and maybe the triples (yes the triples, whatevs)

In [70]:
%reload_ext autoreload
%autoreload 2

import os
import json
import glob
from semproc.parser import Parser
from semproc.preprocessors.metadata_preprocessors import FgdcItemReader

In [43]:
# load the proto-triples example
with open('../response_examples/fgdc_proto_example_1.xml', 'r') as f:
    response = f.read()

# this shouldn't be necessary but cargo-culting here is fine by me.
response = response.replace('\\\n', '').replace('\r\n', '').replace('\\r', '').replace('\\n', '').replace('\n', '')
response = response.decode('utf-8', errors='replace').encode('unicode_escape') 
    
url = 'https://bluehub.jrc.ec.europa.eu/erddap/metadata/fgdc/xml/noaa_pfeg_d543_8870_bc7f_fgdc.xml'
identities = [
    {"protocol": "FGDC", 
     "metadata": {
            "version": ["FGDC Content Standards for Digital Geospatial Metadata, FGDC-STD-001-1998"], 
            "name": "FGDC"}
    }
]

parser = Parser(response)

In [44]:
# execute the parse (this one takes xml)
reader = FgdcItemReader(parser.xml, url, '2015-06-20T20:22:00.643Z')
description = reader.parse_item()

In [45]:
description

{'catalog_record': {'conformsTo': 'http://www.ngdc.noaa.gov/metadata/published/xsd/ngdcSchema/schema.xsd',
  'harvestDate': '2015-06-20T20:22:00.643Z',
  'object_id': 'urn:sha:f65fb3d1efeee860adbbb53d3a20e80e1f50fe625e4887155f196a92',
  'relationships': [{'object_id': 'urn:sha:d5a3a66150264cfeca19322b996a1cbf202b15966e0f1e41b99ae991',
    'relate': 'primaryTopic'}],
  'url': 'https://bluehub.jrc.ec.europa.eu/erddap/metadata/fgdc/xml/noaa_pfeg_d543_8870_bc7f_fgdc.xml'},
 'dataset': {'abstract': 'Navy Global Environmental Model (NAVGEM) is a global numerical weather prediction computer model. It replaced NOGAPS as the prime model in the middle of February 2013 at the Navy Fleet Numerical Meteorology and Oceanography Center (FNMOC) Weather model synoptic site. [Wikipedia]',
  'identifier': 'https://bluehub.jrc.ec.europa.eu:noaa_pfeg_d543_8870_bc7f',
  'object_id': 'urn:sha:d5a3a66150264cfeca19322b996a1cbf202b15966e0f1e41b99ae991',
  'relationships': [{'object_id': 'urn:sha:f65fb3d1efeee86

In [63]:
# let's play with triples

import rdflib
import hashlib
from uuid import uuid4
from rdflib import Graph, Literal, RDF, RDFS, Namespace, URIRef
from rdflib.namespace import DC, DCTERMS, FOAF, XSD, OWL


class Grapher():
    def __init__(self):
        self.graph = Graph()
        self._bind_namespaces()
    
    # some faked namespaces
    _ontology_uris = {
        'bcube': 'http://purl.org/BCube/#',
        'vcard': 'http://www.w3.org/TR/vcard-rdf/#',
        'esip': 'http://purl.org/esip/#',
        'vivo': 'http://vivo.ufl.edu/ontology/vivo-ufl/#',
        'bibo': 'http://purl.org/ontology/bibo/#',
        'dcat': 'http://www.w3.org/TR/vocab-dcat/#',
        'dc': str(DC),
        'dct': str(DCTERMS),
        'foaf': str(FOAF),
        'xsd': str(XSD),
        'owl': str(OWL)
    }
    
    def _bind_namespaces(self):
        # bind our lovely fake namespaces
        for prefix, uri in self._ontology_uris.iteritems():
            self.graph.bind(prefix, uri)

    def generate_predicate(self, prefix, name):
        return Namespace(self._ontology_uris[prefix])[name]

    def identify_prefix(self, predicate):
        # this is, granted, a lesson in technical debt.
        debt = {
            "dc": ["description", "conformsTo", "relation"],
            "dcat": ["publisher"],
            "foaf": ["primaryTopic"]
        }
        
        for k, v in debt.iteritems():
            if predicate in v:
                return k
        return ''
            
    def create_resource(self, resource_prefix, resource_type, identifier=''):
        # make a thing with a uuid as a urn
        # and just assign it to type if it's not overridden
        identifier = identifier if identifier else uuid4().urn
        resource = self.graph.resource(identifier)
        ref = Namespace(self._ontology_uris[resource_prefix])[resource_type]
        resource.add(OWL.a, URIRef(ref))
        return resource

    def _process_catalog(self, entity):
        catalog_record = self.create_resource('dcat', 'CatalogRecord', entity['object_id'])
        catalog_record.add(self.generate_predicate('vcard', 'hasURL'), Literal(entity['url']))
        catalog_record.add(self.generate_predicate('vivo', 'harvestDate'), Literal(entity['harvestDate']))
        if entity['conformsTo']:
            catalog_record.add(DC.conformsTo, Literal(entity['conformsTo']))
        
        for relationship in entity['relationships']:
            # so. current object, verb, id of object, existence unknown
            self.relates.append((catalog_record, relationship['relate'], relationship['object_id']))
            
    def _process_dataset(self, entity):
        dataset = self.create_resource('dcat', 'Dataset', entity['object_id'])
        dataset.add(DCTERMS.identifier, Literal(entity['identifier']))
        dataset.add(DCTERMS.title, Literal(entity['title']))
        dataset.add(DC.description, Literal(entity['abstract']))
        
        if 'temporal_extent' in entity:
            # NOTE: make these iso 8601 first
            begdate = entity['temporal_extent'].get('startDate')
            enddate = entity['temporal_extent'].get('endDate')

            dataset.add(self.generate_predicate('esip', 'startDate'), Literal(begdate, datatype=XSD.date))
            dataset.add(self.generate_predicate('esip', 'endDate'), Literal(enddate, datatype=XSD.date))

        if 'spatial_extent' in entity:
            dataset.add(DC.spatial, Literal(entity['spatial_extent']['wkt']))

            # a small not good thing.
            dataset.add(self.generate_predicate('esip', 'westBound'),
                        Literal(float(entity['spatial_extent']['west']), datatype=XSD.float))

            dataset.add(self.generate_predicate('esip', 'eastBound'),
                        Literal(float(entity['spatial_extent']['east']), datatype=XSD.float))

            dataset.add(self.generate_predicate('esip', 'southBound'),
                        Literal(float(entity['spatial_extent']['south']), datatype=XSD.float))

            dataset.add(self.generate_predicate('esip', 'northBound'),
                        Literal(float(entity['spatial_extent']['north']), datatype=XSD.float))
        
        for relationship in entity['relationships']:
            self.relates.append((dataset, relationship['relate'], relationship['object_id']))
        
    def _process_keywords(self, entity):
        for keywords in entity:
            keyset = self.create_resource('bcube', 'thesaurusSubset', keywords['object_id'])
            if 'type' in keywords:
                keyset.add(DC.hasType, Literal(keywords['type']))
            if 'thesaurus' in keywords:
                keyset.add(DC.partOf, Literal(keywords['thesaurus']))

            try:
                for term in keywords['terms']:
                    keyset.add(self.generate_predicate('bcube', 'hasValue'), Literal(term))
            except:
                print keywords
        
    def _process_publisher(self, entity):
        publisher = self.create_resource('dcat', 'publisher', entity['object_id'])
        publisher.add(DC.location, Literal(entity['location']))
        publisher.add(FOAF.name, Literal(entity['name']))
        
    def _process_webpages(self, entity):
        for webpage in entity:
            relation = self.create_resource('bibo', 'WebPage', webpage['object_id'])
            relation.add(self.generate_predicate('vcard', 'hasURL'), Literal(webpage['url']))
    
    def serialize(self):
        return self.graph.serialize(format='turtle')
    
    def graphalize(self, doc):
        # not a word
        # so from our json.
        
        # this. is. idk. an ordering thing. i suspect the graph borks
        # when you try to add a triple for a non-existent object.
        # years of experience. also, i am wrong - it will add anything.
        self.relates = []
        for entity_type, entity in doc.iteritems():
            if entity_type == 'catalog_record':
                self._process_catalog(entity)
            elif entity_type == 'dataset':
                self._process_dataset(entity)
            elif entity_type == 'publisher':
                self._process_publisher(entity)
            elif entity_type == 'keywords':
                self._process_keywords(entity)
            elif entity_type == 'webpages':
                self._process_webpages(entity)
            else:
                continue
        
        for resource, verb, object_id in self.relates:
            resource.add(
                self.generate_predicate(
                    self.identify_prefix(verb), verb),
                URIRef(object_id)
            )
        

grapher = Grapher()    
grapher.graphalize(description)
print grapher.serialize()  


{'type': 'theme', 'thesaurus': 'GCMD Keyword Thesaurus', 'object_id': 'urn:uuid:571dbe03-82a3-45a5-a0b8-192ec92b5002'}
@prefix bcube: <http://purl.org/BCube/#> .
@prefix bibo: <http://purl.org/ontology/bibo/#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dcat: <http://www.w3.org/TR/vocab-dcat/#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix esip: <http://purl.org/esip/#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix vcard: <http://www.w3.org/TR/vcard-rdf/#> .
@prefix vivo: <http://vivo.ufl.edu/ontology/vivo-ufl/#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<urn:sha:1646ee4d39fef9103635b9fcced8c474c560b673eb4813fd3ed0cba6> owl:a bibo:WebPage ;
    vcard:hasURL "http://datafedwiki.wustl.edu/index.php/RETRO_ANTHRO" .

<urn:sha:c88e51

#### generating local graphs for any fgdc response

i would note that we are not catching every fgdc through the identifier - not quite valid/complete responses are being dropped. this generates roughly three thousand graphs.

In [61]:
# so let's make a new temp class for the harvest + parser parts
# and this is not what we really want to do. probably.

class Fgdc():
    def __init__(self, doc):
        self.url = doc.get('url', '')
        self.response = self._prep_response(doc.get('raw_content', ''))
        self.harvested = doc.get('tstamp', '')  # except i did not carry this through in the clean task
        self._prep_parser()
        
    def _prep_parser(self):
        parser = Parser(self.response)
        self.reader = FgdcItemReader(parser.xml, self.url, self.harvested)
    
    def _prep_response(self, response):
        # this shouldn't be necessary but cargo-culting here is fine by me.
        response = response.encode('unicode_escape')
        response = response.replace('\\\n', '').replace('\r\n', '').replace('\\r', '').replace('\\n', '').replace('\n', '')
        return response.decode('utf-8', errors='replace').encode('unicode_escape') 
        
    def parse(self):
        return self.reader.parse_item()

In [71]:
import csv

docs_dir = '/Users/sscott/Documents/working_bits/solr_20150707/docs'
triples_dir = '/Users/sscott/Documents/tmp/triples/'

with open('data/small_harvest_fgdc_as_md5.csv', 'rb') as csvfile:
    cr = csv.DictReader(csvfile, delimiter="|")

    cnt = 0
    for row in cr:
        print row
        
        if cnt > 5:
            break
        
        url = row['source_url'].strip().replace('"', '')
        md5 = row['raw_content_md5'].strip().replace('"', '')
        
        # NOTE: this is obviously not correct but time constraints
        # harvest_date = '2015-07-07T00:00:00.000Z'
        
        # go open the cleaned up version by md5
        filepath = os.path.join(docs_dir, md5 + '.json')
        if not os.path.exists(filepath):
            continue
        
        with open(filepath, 'r') as f:
            data = json.loads(f.read())
            
        fgdc = Fgdc(data)
        description = fgdc.parse()
        
        grapher = Grapher()
        grapher.graphalize(description)
        ttl = grapher.serialize()
        
        with open(os.path.join(triples_dir, md5 + '.ttl'), 'w') as f:
            f.write(ttl)
        
        cnt += 1

{'raw_content_md5': '001165c4b4bb8c02389f1a216a71ab69', 'source_url': 'http://hdsc.nws.noaa.gov/hdsc/pfds/meta/na14_vol5_gu_grid_metadata.xml'}
{'raw_content_md5': '001346b24aca9c7d741c5cc0e3186681', 'source_url': 'http://catalog.data.gov/harvest/object/9a4ecbd7-643d-49f6-805a-0c6b7928783b/original'}
{'raw_content_md5': '043cdbc3ef789a66713ed7d49d5f6a7f', 'source_url': 'http://www.ncddc.noaa.gov/approved_recs/nos_de/csc/rcsd/Chlorophyll/CMECS_Eupohotic_metadata.xml'}
{'raw_content_md5': '00305fb6ea2c40eab991e028575627f8', 'source_url': 'http://catalog.data.gov/harvest/object/c7572f49-2da3-4bcf-a5a8-38003a55aad4/original'}
{'raw_content_md5': '00396a0e800619e08f206376cabae988', 'source_url': 'http://www.coris.noaa.gov/metadata/records/xml/Grammanik_45el_45az_10m.xml'}
{'raw_content_md5': '004831165a780e1ba35eebd9a39c2acf', 'source_url': 'http://gstore.unm.edu/apps/rgis/datasets/41b157be-ae1f-4011-97d1-d975618fe7fd/metadata/FGDC-STD-001-1998.xml'}
{'raw_content_md5': '004cd8a41674143268a

In [67]:
from lxml import etree
from semproc.xml_utils import extract_items

xml = etree.fromstring('''<metadata xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:smw="http://smw.ontoware.org/2005/smw#" xmlns:attribute="http://wiki.esipfed.org/index.php/Special:URIResolver/Attribute-3A">
<keywords>
            <theme>
                <themekt>GCMD Keyword Thesaurus</themekt>
                <themekey><![CDATA[Atmosphere > Air Quality > Emissions]]></themekey>
                <themekey><![CDATA[Human Dimensions > Environmental Impacts]]></themekey>
                <themekey><![CDATA[Atmosphere > Air Quality > Carbon Monoxide]]></themekey>
                <themekey><![CDATA[Atmosphere > Air Quality > Nitrogen Oxides]]></themekey>
                <themekey><![CDATA[Atmosphere > Air Quality > Sulfur Oxides]]></themekey>
            </theme>
            <place>
                <placekt>Uncontrolled Keywords</placekt>
                <placekey>US</placekey>
            </place>
        </keywords></metadata>
''')

extract_items(xml, ['keywords', 'theme', 'themekey'])

['Atmosphere > Air Quality > Emissions',
 'Human Dimensions > Environmental Impacts',
 'Atmosphere > Air Quality > Carbon Monoxide',
 'Atmosphere > Air Quality > Nitrogen Oxides',
 'Atmosphere > Air Quality > Sulfur Oxides']