# Extract basic term metadata from the EFG XML schema


In [64]:
# Import libraries and configuration
import requests
import pandas as pd
import xml.etree.ElementTree as et

schema_at_github = 'https://raw.githubusercontent.com/tdwg/efg/master/efg.xsd'
namespaces = {'xs': 'http://www.w3.org/2001/XMLSchema'}


In [68]:
# Define functions
def generate_label(local_name):
    out_string = ''
    # Step through each character to see if it's capitalized
    for character in local_name:
        if character < 'a':
            out_string += ' '
        out_string += character.lower()
    return out_string.strip()

# Extract examples from the documentation element values
def extract_examples(documentation_string):
    pieces = documentation_string.split('e.g.')
    if len(pieces) > 1:
        definition = pieces[0].strip()
        examples = pieces[1].strip()
    else:
        definition = documentation_string
        examples = ''
    return definition, examples

In [69]:
# Retrieve XSD from GitHub and parse XML structure
response_object = requests.get(schema_at_github)
data = response_object.text
root = et.fromstring(data)


In [71]:
# Find all of the elements with xs:element tags
elements = root.findall('.//xs:element', namespaces)
rows = []
for element in elements:
    # If the element has a "name" attribute, get its value
    if 'name' in element.attrib:
        local_name = element.attrib['name']
        
        # get the text contents of the nested xs:documentation node
        def_element = element.findall('.//xs:documentation', namespaces)
        if len(def_element) > 0:
            documentation_string = def_element[0].text
        else:
            documentation_string = ''

        # Build a dict for the output row
        column_dict = {}
        #print(local_name)
        #print(definition)
        
        column_dict['term_localName'] = local_name

        # Create human-readable label by replacing dashes with spaces
        column_dict['label'] = generate_label(local_name)
        
        # Split the documentation field into definition and examples (if any)
        definition, examples = extract_examples(documentation_string)

        column_dict['definition'] = definition
        column_dict['usage'] = ''
        column_dict['notes'] = ''
        column_dict['examples'] = examples
        column_dict['type'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'
        column_dict['tdwgutility_organizedInClass'] = ''

        #print(column_dict)
        rows.append(column_dict)

data_frame = pd.DataFrame(rows)
data_frame

Unnamed: 0,term_localName,label,definition,usage,notes,examples,type,tdwgutility_organizedInClass
0,UnitSize,unit size,Free text fiekd for describing the overall siz...,,,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
1,UnitWeight,unit weight,Free text field to enter gross weight of geolo...,,,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
2,Hazard,hazard,Container for describing hazards that may be a...,,,"radioactive, toxic, carcinogens etc.",http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
3,HazardText,hazard text,Free text describing any hazards that may be a...,,,"is it radioactive, poisonous, carcinogenic?",http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
4,HazardKeywords,hazard keywords,,,,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
...,...,...,...,...,...,...,...,...
206,AssociatedMineralList,associated mineral list,A single mineral name at any level of accuracy,,,"feldspar, pyroxene etc. or orthoclase, olivine...",http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
207,AssociatedMineralName,associated mineral name,A single mineral name at any level of accuracy,,,"feldspar, pyroxene etc. or orthoclase, olivine...",http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
208,AssociatedMineralComment,associated mineral comment,Free text for describing the occurrence of thi...,,,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,
209,AssociatedMineralAssemblageText,associated mineral assemblage text,Free text for describing the features or chara...,,,typical North Yorkshire primary copper-rich su...,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,


In [72]:
sorted_df = data_frame.sort_values(by ='term_localName', ascending = 1)
sorted_df.to_csv('efg.csv', index = False)
print('done')

done
