In [1]:
import re
from datetime import datetime, timezone
from html import escape

from bs4 import BeautifulSoup

In [2]:
# Path of the XML document to validate. The MAIN cell also derives the name of
# the HTML report from this value.
PATH_TO_XML_FILE = 'example.xml'

## Fields from the Materials Citations Dialog

```
collectionCode, specimenCount, specimenCode, accessionNumber
typestatus, collectingCountry, collectingRegion, collectingMunicipality
collectingCounty, location, locationDeviation, originalDetermination
determinerName, collectorName, collectingDate, collectedFrom
collectingMethod, collectingPermit, geoCoordinate, elevation
geologicalTimeScale, backReference
```

## Mapping Material Citation Fields to DwCA

The following table was extracted from the DwCA meta.xml file. It shows how GGI terms are mapped to DwCA terms.

dwca | GGI|
-----|:---|
http://rs.tdwg.org/dwc/terms/taxonID|treatment ID + ".taxon"
http://rs.tdwg.org/dwc/terms/catalogNumber|mc@specimenCode (explode to one record per specimen code if possible)
http://rs.tdwg.org/dwc/terms/collectionCode|mc@collectionCode (explode to one record per collection code if possible)
http://rs.tdwg.org/dwc/terms/institutionCode|blank
http://rs.tdwg.org/dwc/terms/typeStatus|mc@typeStatus (blank if none given)
http://rs.gbif.org/terms/1.0/verbatimLabel|mc text
http://rs.tdwg.org/dwc/terms/sex|mc@sex (also other specimen types like "queen", "worker", etc.)
http://rs.tdwg.org/dwc/terms/individualCount|mc@specimenCount (explode things like "5 workers, 2 females" to one record per typified specimen count if possible)
http://rs.tdwg.org/dwc/terms/eventDate|mc@collectingDate
http://rs.tdwg.org/dwc/terms/recordedBy|mc@collectorName
http://rs.tdwg.org/dwc/terms/recordNumber|blank
http://rs.tdwg.org/dwc/terms/decimalLatitude|mc@latitude
http://rs.tdwg.org/dwc/terms/decimalLongitude|mc@longitude
http://rs.tdwg.org/dwc/terms/minimumElevationInMeters|mc@elevation, or mc@elevationMin if given
http://rs.tdwg.org/dwc/terms/maximumElevationInMeters|mc@elevationMax if given
http://rs.tdwg.org/dwc/terms/country|mc@collectingCountry
http://rs.tdwg.org/dwc/terms/stateProvince|mc@stateProvince or mc@collectingRegion
http://rs.tdwg.org/dwc/terms/municipality|mc@collectingMunicipality
http://rs.tdwg.org/dwc/terms/locality|mc@location

Here's my idea of which fields are required:

* collectingDate
* collectorName
* collectingCountry
* collectingMunicipality OR location

In [3]:
# Attribute names regarded as regular on a material citation element. They
# follow the "mc@..." entries of the DwCA mapping table above.
# check_for_unlisted_attribites() lower-cases these names and reports every
# attribute that is not among them as INFO.
mat_cit_attr_fields = [
    'specimenCode', 'collectionCode', 'typeStatus',
    'sex', 'specimenCount', 'collectingDate',
    'collectorName', 'latitude', 'longitude',
    'elevation', 'elevationMin', 'elevationMax',
    'collectingCountry', 'stateProvince',
    # The mapping table names mc@collectingRegion as the alternative source
    # for dwc:stateProvince, so it must not be flagged as irregular.
    'collectingRegion',
    'collectingMunicipality', 'location',
]

In [4]:
# Element names regarded as regular direct children of a material citation
# (the same names as the dialog field list above).
# check_for_unlisted_child_fields() lower-cases these names and reports every
# other direct child as INFO.
mat_cit_child_fields = [
    'collectionCode',
    'specimenCount',
    'specimenCode',
    'accessionNumber',
    'typestatus',
    'collectingCountry',
    'collectingRegion',
    'collectingMunicipality',
    'collectingCounty',
    'location',
    'locationDeviation',
    'originalDetermination',
    'determinerName',
    'collectorName',
    'collectingDate',
    'collectedFrom',
    'collectingMethod',
    'collectingPermit',
    'geoCoordinate',
    'elevation',
    'geologicalTimeScale',
    'backReference',
]

In [5]:
# Collector names. Not referenced by any of the checks visible in this notebook.
collector_list = [
    'O.H. Swezey',
    'E. H. Bryan',
    'R. L. Usinger',
]

# Place names. Not referenced by any of the checks visible in this notebook.
place_list = [
    'Ritidian Point',
    'Agana',
    'Barrigada',
    'Piti',
]

In [6]:
def check_required_fields(matcit):
    """Report required material citation attributes that are absent or empty.

    Args:
        matcit: Object offering a dict-style ``get(key, default)``.

    Returns:
        The concatenated error notifications, in the order collectorname,
        collectingcountry, collectingmunicipality/location; '' when nothing
        is missing. ``collectingdate`` is not examined here.
    """
    def is_blank(key):
        # An attribute counts as missing when it is absent or the empty string.
        return matcit.get(key, '') == ''

    single_field_errors = (
        ('collectorname',
         '<div class="notification is-danger">ERROR: no collectorname</div>\n'),
        ('collectingcountry',
         '<div class="notification is-danger">ERROR: no collectingcountry</div>\n'),
    )
    problems = [message for key, message in single_field_errors if is_blank(key)]

    # One of the two locality attributes is enough.
    if is_blank('collectingmunicipality') and is_blank('location'):
        problems.append(
            '<div class="notification is-danger">ERROR: no collectingmunicipality or location</div>\n'
        )
    return ''.join(problems)

def check_for_unlisted_child_fields(matcit):
    """Flag direct child elements that are not regular material citation child fields.

    Args:
        matcit: Normally the parsed ``materialscitation`` element, which is
            inspected directly. Any other value is converted with ``str()``,
            parsed, and searched for its first ``materialscitation`` element.

    Returns:
        One INFO notification per direct child whose name is not among the
        lower-cased entries of ``mat_cit_child_fields``; '' when there is
        nothing to report or no ``materialscitation`` element is found.
    """
    if getattr(matcit, 'name', None) == 'materialscitation':
        # Already the parsed element: no need to serialise and re-parse it.
        citation = matcit
    else:
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and bs4 does not warn about guessing).
        citation = BeautifulSoup(str(matcit), 'lxml').find('materialscitation')
    if citation is None:
        return ''

    # Built once instead of once per child.
    known = {field.lower() for field in mat_cit_child_fields}
    return ''.join(
        '<div class="notification is-info">'
        f'INFO: <b>{child.name}</b> is not a regular material citations child field'
        '</div>\n'
        for child in citation.find_all(recursive=False)
        if child.name not in known
    )

def check_for_unlisted_attribites(matcit):
    """Flag attributes that are not regular material citation attributes.

    The misspelt function name is kept because check_material_citation()
    calls it under this name.

    Args:
        matcit: Object whose ``attrs`` mapping holds the attribute names.

    Returns:
        One INFO notification per attribute name that is not among the
        lower-cased entries of ``mat_cit_attr_fields``, in ``attrs`` order;
        '' when every attribute is known. Attribute names are compared
        as-is, i.e. they are expected to be lower case already.
    """
    # Built once instead of once per attribute.
    known = {field.lower() for field in mat_cit_attr_fields}
    return ''.join(
        '<div class="notification is-info">'
        f'INFO: <b>{attr}</b> is not a regular material citations attribute'
        '</div>\n'
        for attr in matcit.attrs
        if attr not in known
    )

def check_date(matcit, doc_attrs):
    """Validate the ``collectingdate`` attribute of a material citation.

    Args:
        matcit: Object offering a dict-style ``get``; ``collectingdate`` is
            read from it.
        doc_attrs: Mapping of document attributes; ``docdate`` is read from it.

    Returns:
        An error notification when ``collectingdate`` is missing or empty, or
        when its year is later than the year found in ``docdate``; otherwise
        ''. The year comparison is skipped when either value contains no
        four-digit year.
    """
    collectingdate = matcit.get('collectingdate') or ''
    if collectingdate == '':
        return '<div class="notification is-danger">ERROR: no collectingdate</div>\n'

    # The first run of four digits is taken as the year in both values.
    collected = re.search(r'\d{4}', collectingdate)
    published = re.search(r'\d{4}', str(doc_attrs.get('docdate') or ''))
    if not collected or not published:
        # Without a publication year there is nothing to compare against; a
        # missing docdate must not turn every citation into an error.
        return ''

    # Compare numerically rather than as strings.
    if int(collected.group()) > int(published.group()):
        return '<div class="notification is-danger">ERROR: collectingdate year is greater than publication date</div>\n'
    return ''

def check_location(matcit):
    """Placeholder for location checks; currently reports nothing.

    ``matcit`` is accepted but not inspected. Always returns ''.
    """
    # NOTE(review): place_list is defined in this notebook but unused;
    # presumably it is meant for this check. Confirm before implementing.
    no_findings = ''
    return no_findings

def check_collector(matcit):
    """Placeholder for collector checks; currently reports nothing.

    ``matcit`` is accepted but not inspected. Always returns ''.
    """
    # NOTE(review): collector_list is defined in this notebook but unused;
    # presumably it is meant for this check. Confirm before implementing.
    no_findings = ''
    return no_findings

def check_material_citation(matcit, doc_attrs):
    """Run every active check on one material citation and merge the findings.

    Args:
        matcit: The material citation, passed on to each check.
        doc_attrs: Document attributes, passed on to check_date().

    Returns:
        The HTML produced by the checks, concatenated in call order.
        check_required_fields() is currently not part of this sequence.
    """
    findings = (
        check_date(matcit, doc_attrs),
        check_location(matcit),
        check_collector(matcit),
        check_for_unlisted_attribites(matcit),
        check_for_unlisted_child_fields(matcit),
    )
    return ''.join(findings)

In [7]:
def check_material_citations(path=None):
    """Build the HTML body of the validation report for one XML document.

    Args:
        path: XML file to check. Defaults to ``PATH_TO_XML_FILE``.

    Returns:
        An HTML fragment (str) containing the report header, the document
        attributes and, for every ``materialscitation`` inside every
        ``treatment``, its text, attributes, direct child elements and the
        findings of check_material_citation().
    """
    if path is None:
        path = PATH_TO_XML_FILE

    def esc(value):
        # Values come from the input document; escape them so a '<' or '&'
        # in the data cannot break (or inject into) the report markup.
        return escape(str(value), quote=False)

    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the input files are UTF-8; confirm for your data.
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    # Document header and attributes. A missing <document> element or a
    # missing doctitle/docid yields empty values instead of an exception.
    document = soup.find('document')
    doc_attrs = document.attrs if document is not None else {}
    doctitle = esc(doc_attrs.get('doctitle', ''))
    docid = esc(doc_attrs.get('docid', ''))
    generated = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    parts = [
        f'<p class="title is-1">{doctitle}</p>\n',
        f'<p class="subtitle is-3">uuid: {docid}</p>\n',
        f'<p class="subtitle is-3">Report generated by MatCit-Validator at {generated} UTC</p>\n',
        '<hr>\n',
        '<p class="title is-2">Document attributes</p>\n',
    ]
    for key, value in doc_attrs.items():
        parts.append(f'<b>{esc(key)}:</b> {esc(value)}<br>\n')
    parts.append('<hr>')

    # Material citations, grouped by treatment.
    for treatment in soup.find_all('treatment'):
        # The first four words of the treatment text serve as its heading.
        extract = ' '.join(treatment.text.split()[:4])
        parts.append(f'<p class="title is-2">treatment: {esc(extract)} ...</p>\n')

        for materialcitation in treatment.find_all('materialscitation'):
            parts.append(
                f'<div class="notification is-info is-light">{esc(materialcitation.text)}</div>\n'
            )
            parts.append('<p class="title is-6">Attributes</p>\n')
            for key, value in materialcitation.attrs.items():
                parts.append(f'<b>{esc(key)}:</b> {esc(value)}<br>\n')
            parts.append('<br>\n')
            parts.append('<p class="title is-6">Child nodes</p>\n')
            for child in materialcitation.find_all(recursive=False):
                parts.append(f'<b>{esc(child.name)}:</b> {esc(child.text)}<br>')

            # The checks return ready-made markup, so it is added unescaped.
            parts.append(check_material_citation(materialcitation, doc_attrs))
            parts.append('<br><br><hr>\n')
    return ''.join(parts)

In [8]:
def generate_html_report(mat_cit_chk_html, output_file):
    """Wrap a report fragment in a complete HTML page and write it to a file.

    Args:
        mat_cit_chk_html: HTML fragment placed inside the page's container
            div. It is inserted as-is, without escaping.
        output_file: Path of the file to create or overwrite.
    """
    page = f'''<!DOCTYPE html>
        <html>
            <head>
                <meta charset="utf-8">
                <meta name="viewport" content="width=device-width, initial-scale=1">
                <title>mat_cit_chk</title>
                <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css">
            </head>
            <body>
                <section class="section">
                    <div class="container">
                        {mat_cit_chk_html}
                    </div>
                </section>
            </body>
        </html>
        '''
    # The page declares charset utf-8, so write it as UTF-8 regardless of the
    # platform default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(page)

In [9]:
# MAIN

# Derive the report name from the input name. Only a trailing ".xml" (any case)
# is swapped for ".html"; any other name gets ".html" appended, so the report
# can never end up overwriting the input file.
if PATH_TO_XML_FILE.lower().endswith('.xml'):
    report_path = PATH_TO_XML_FILE[:-len('.xml')] + '.html'
else:
    report_path = PATH_TO_XML_FILE + '.html'

mat_cit_chk_html = check_material_citations()
generate_html_report(mat_cit_chk_html, report_path)