For easy reference, documentation, non-developer access, this will unpack the contents of the identification configuration files into a table.

The table structure:

| Protocol | Name | Type | Signatures | Version Signatures |
|:---------|:-----|:-----|:-----------|:-------------------|
| - | - | metadata/service/dataset | substring or XPath filter for XML identification | substring or XPath filter for Version identification |

For ease of use, particularly related to minor version changes, the XPaths do not include prefixed element names.

To reduce dependencies (squishy, that), some code is ported from the semantics-preprocessing repo. Making every attempt to keep this tied to python deps only.

Filters will be repacked as "AND(filter, filter)" and those, unfortunately, can also be nested. 

For versions, there are two structures: a default (if filter matches, it is the specified version) or an extraction (return value at this XPath). Defaults will be repacked as "DEFAULT(filter == value)" and extractions as "AND(filter, filter)".

In [1]:
import yaml
import glob
import pandas as pd

def import_yaml_configs(config_paths):
    '''
    merge a set of yaml config files so we can maintain
    one set of identify structures for a protocol
    '''

    def _read(config_path):
        with open(config_path, 'r') as f:
            y = yaml.load(f.read())

        return y

    # the configs are just big lists
    config = []
    for config_path in config_paths:
        config.append((config_path, _read(config_path)))

    return config

yamls = glob.glob('../identification/configs/*_identifier.yaml')
configs = import_yaml_configs(yamls)

print configs[0]

('../identification/configs/dc_identifier.yaml', [{'name': 'DC', 'metadata': [{'name': 'DublinCore', 'filters': {'ands': [{'object': 'content', 'type': 'simple', 'value': 'http://www.openarchives.org/OAI/2.0/oai_dc/'}, {'object': 'content', 'type': 'xpath', 'value': '/*[local-name() = "dc"]'}]}, 'versions': {'defaults': {'ors': [{'text': '2.0', 'object': 'content', 'type': 'simple', 'value': 'http://www.openarchives.org/OAI/2.0/oai_dc/'}]}}}]}])


In [2]:
# let's see if we can recursively repack the things
def stringify_filters(operator, filters, strings):
    for f in filters:
        ft = f['type']
        
        if ft == 'complex':
            fo = f['operator']
            strings.append(stringify_filters(fo, f['filters'], []))
        elif ft in ['simple', 'regex', 'xpath']:
            # just append some junk
            strings.append('"%s"' % f['value'])
    return {operator: strings}

def repack_filters(filters):
    blobs = []
    for i, j in filters.iteritems():
        blob = stringify_filters(i, j, [])
        for k, v in blob.iteritems():
            blobs.append('%s(%s)' % (k.upper(), ' | '.join(v)))
    return '\n'.join(blobs)

In [3]:
# convert the yaml into a dataframe via the from_tuples route
# where each config can have multiple metadata/service/dataset options
rows = []
for path, config in configs:
    print path
    
    for c in config:
        protocol = c.get('name')
        
        print '\t', protocol

        for response_type, type_options in c.iteritems():
            if response_type == 'name':
                continue

            for type_option in type_options:
                # nesting the heck out of things
                filters = type_option.get('filters', [])
                versions = type_option.get('versions', {})

                rows.append((
                    protocol,
                    type_option.get('name'),
                    response_type,
                    repack_filters(filters),
                    '-'  # versions
                ))


../identification/configs/dc_identifier.yaml
	DC
../identification/configs/dif_identifier.yaml
	DIF
../identification/configs/extra_identifier.yaml
	PDS
	CAP-ALERT
	MARC 21 Format
	DDI
	KML
	ArcGISExplorerDocument
	ESRI MapServer Info
	Microsoft
	WordPress
	LOC-METS
	DataCite
	EML
	QuakeML
	NIEM
	Disco
	SOAP
	Sitemap
	waterML
	ECHO
	MODAPS
	IOOS
	Zenodo
	Ferret ToolsUi
	HDF-XML
../identification/configs/feed_identifier.yaml
	RSS
	ATOM
../identification/configs/fgdc_identifier.yaml
	FGDC
../identification/configs/iso_identifier.yaml
	ISO


TypeError: sequence item 0: expected string, dict found

In [25]:
print rows[0]

('DC', 'DublinCore', 'metadata', 'ANDS("http://www.openarchives.org/OAI/2.0/oai_dc/" | "/*[local-name() = "dc"]")', {'defaults': {'ors': [{'text': '2.0', 'object': 'content', 'type': 'simple', 'value': 'http://www.openarchives.org/OAI/2.0/oai_dc/'}]}})


In [4]:
df = pd.DataFrame(rows, columns=['protocol', 'name', 'type', 'filters', 'versions'])
df

Unnamed: 0,protocol,name,type,filters,versions
0,DC,DublinCore,metadata,"ANDS(""http://www.openarchives.org/OAI/2.0/oai_...",-
1,DIF,DIF,metadata,"ANDS(""http://gcmd.gsfc.nasa.gov/Aboutus/xml/di...",-
2,PDS,PDS,service,"ORS(""http://pds.nasa.gov/pds4/pds/v1"")",-
3,CAP-ALERT,CAP-ALERT,service,"ORS(""http://www.incident.com/cap/1.0"" | ""urn:o...",-
4,MARC 21 Format,MARC 21 Format,metadata,"ORS(""http://www.loc.gov/MARC21/slim"")",-
5,DDI,DDI,metadata,"ANDS(""http://www.ddialliance.org/Specification...",-
6,KML,KML,dataset,"ORS(""http://earth.google.com/kml"")",-
7,ArcGISExplorerDocument,ArcGISExplorerDocument,service,"ORS(""http://www.esri.com/schemas/ArcGIS/"")",-
8,ESRI MapServer Info,ESRI MapServer Info,service,"ANDS(""EsriPropertyType="" | ""ArcGISFormat"" | ""<...",-
9,Microsoft,Microsoft Document,dataset,"ORS(""urn:schemas-microsoft-com:office:office"")",-


In [7]:
df.to_csv('outputs/xml_signatures.csv')

In [14]:
test_filters = {
    'ands': [
        {
            'object': 'content',
            'type': 'simple',
            'value': 'http://www.openarchives.org/OAI/2.0/oai_dc/'
        },
        {
            'object': 'content', 'type': 'xpath', 'value': '/*[local-name() = "dc"]'
        }
    ]
}

for i, j in test_filters.iteritems():
    blob = stringify_filters(i, j, [])
    for k, v in blob.iteritems():
        print '%s(%s)' % (k.upper(), ' | '.join(v))

ANDS("http://www.openarchives.org/OAI/2.0/oai_dc/" | "/*[local-name() = "dc"]")
