# validator2

This notebook validates materials citation annotations using Darwin Core archives downloaded from the GGI server.

If my laptop is online, this Jupyter notebook runs at minute 2 of every hour (for example 09:02, 10:02) under this crontab entry:
```
2 * * * * ~/Desktop/data-mining-insects-of-guam/validator2/run_validator2.sh >~/Desktop/data-mining-insects-of-guam/validator2/run_validator2.log 2>&1
```

The bash file (run_validator2.sh) runs the notebook using **papermill**:
```
#!/bin/bash
cd ~/Desktop/data-mining-insects-of-guam/validator2/
/home/aubrey/.local/bin/papermill validator2.ipynb output.ipynb
```

In [1]:
import time
import pandas as pd
import subprocess
import re
from datetime import datetime
import os

In [2]:
# This list is currently maintained "manually".
# Controlled vocabulary for the recordedBy field: validate_dwca() treats a
# recordedBy value as valid only when it exactly matches one of these strings,
# and generate_status_report() prints the sorted list in the status report.

author_list = [
    'A. Cruz',
    'E. H. Bryan',
    'E. H. Bryan and O. H. Swezey',
    'T. E. Esaki',
    'D. T. Fullaway',
    'H. G. Hornbostel',
    'R. G. Oakley',
    'Z. Ono',
    'O. H. Swezey',
    'O. H. Swezey and R. L. Usinger',
    'Rowley',
    'R. L. Usinger',
    'R. L. Usinger and O. H. Swezey',
    'unknown',
]

In [3]:
# This list is currently maintained "manually".
# Controlled vocabulary for the locality field: validate_dwca() treats a
# locality value as valid only when it exactly matches one of these strings,
# and generate_status_report() prints the sorted list in the status report.
# Some places appear in both abbreviated and spelled-out form
# (e.g. 'Mt. Alifan' and 'Mount Alifan'); each spelling is a separate entry.

locality_list = [
    'Agana',
    'Agana Swamp',
    'Agat',
    'Agfayan',
    'Asan',
    'Atao Beach',
    'Barrigada',
    'Dandan',
    'Dededo',
    'Fadian',
    'Government House, Agana',
    'Guam',
    'Inarajan',
    'Machanao',
    'Mata',
    'Merizo',
    'Mogfog',
    'Mt. Alifan',
    'Mount Alifan',
    'Mount Chachao',
    'Mt. Sasalaguan',
    'Mount Sasalaguan', 
    'Mount Tenjo',
    'Orote Peninsula',
    'Orote Point',
    'Passan',
    'Piti',
    'Ritidian Point',
    'Rota Island',
    'Root School Farm',
    'Santa Rosa Peak',
    'Sumay Road',
    'Tarague',
    'Tarague Beach',
    'Tumon',
    'Umatac',
    'Upi Trail',
    'Yigo',
    'Yona',
    'Atantano',
    'Talofofo',
    'Libugon Farm',
    'Sumay',
    'Fonte Valley',
    'Ponape',
    'Ponape, Mount Nanalaut',
    'Ponape, Nipit-Ninoani',
    'Ponape, Kolonia-Nat',
    'Babelthaup, Marukyoku',
    'Kusaie, Mount Wakapp',
    'Tiyan',
    'Libugon',
    'Palae',
    'Magua',
    'Saipan'
]

In [4]:
# Valid collection years, stored as four-character strings.
# validate_dwca() compares the first four characters of each eventDate
# against this list; generate_status_report() prints the sorted list under
# "eventDate (valid years)".
date_list = [
    '1911',
    '1925',
    '1936',
    '1937',
    '1938',
    '1939',
]

In [5]:
# Controlled vocabulary for the country field: validate_dwca() treats a
# country value as valid only when it exactly matches one of these strings,
# and generate_status_report() prints the sorted list in the status report.
country_list = [
    'Guam',
    'Northern Mariana Islands',
    'Palau',
    'Micronesia (Federated States of)',
]

In [6]:
def read_dataset_list(path='../dataset-list.md'):
    '''
    Reads a markdown pipe table of datasets and returns a pandas dataframe.

    Parameters
    ----------
    path : str, optional
        Location of the markdown table. Defaults to ../dataset-list.md,
        the file this notebook has always read.

    Returns
    -------
    pandas.DataFrame
        One row per table row, with whitespace stripped from the column
        headers and from every string cell. The markdown header underline
        row and rows whose title is 'no title' are removed.
    '''

    df = pd.read_table(path, sep="|", header=0, skipinitialspace=True)

    # The leading and trailing "|" of every markdown row produce two columns
    # that contain nothing but nulls; drop them.

    df = df.dropna(axis=1, how='all')

    # Drop the header underline row (the first row after the header)

    df = df.iloc[1:]

    # Strip whitespace from both ends of every string cell. Series.map is
    # applied column by column instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.

    df = df.apply(
        lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
    )

    # Strip whitespace from both ends of the column headers

    df.columns = df.columns.str.strip()

    # Drop datasets with no title - we don't need to process these

    df = df[df['title'] != 'no title']
    return df

# read_dataset_list()

In [7]:
%%time

def update_github():
    """
    Runs the ./update_github.sh script from the current working directory.

    Raises AssertionError if the script's exit status is not zero.
    """
    script = './update_github.sh'
    exit_status = os.system(script)
    assert exit_status == 0, script + ' failed'

# update_github()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.53 µs


In [8]:
def get_datestamp():
    """
    Returns the dateStamp value of the first gbif metadata element in eml.xml.

    eml.xml is read from the current working directory.
    """
    gbif_metadata = pd.read_xml(
        'eml.xml',
        xpath=".//additionalMetadata/metadata/gbif",
    )
    return gbif_metadata['dateStamp'][0]

# get_datestamp()

In [9]:
%%time

def _run_command(args):
    """
    Runs an external command, given as a list of strings, without a shell.

    Raises AssertionError (the same exception type the earlier os.system
    based checks produced) when the command exits with a non-zero status.
    """
    completed = subprocess.run(args)
    if completed.returncode != 0:
        raise AssertionError(f'{" ".join(args)} failed')


def _validation_cell(value, is_valid):
    """
    Returns one HTML table cell for value.

    The text is HTML-escaped; cells that are not valid carry the
    "is-selected" class so they stand out in the rendered table.
    """
    from html import escape  # local import: html is not imported at notebook level

    text = escape(str(value))
    if is_valid:
        return f'<td>{text}</td>\n'
    return f'<td class="is-selected">{text}</td>\n'


def validate_dwca(uuid, dataset_df=None):
    """
    Downloads a Darwin core archive from the GGI server, unzips it into the
    current working directory and checks every materials citation against
    the controlled vocabularies (country_list, locality_list, author_list,
    date_list).

    Summary saved to <uuid>.html

    Parameters
    ----------
    uuid : str
        Identifier of the dataset; used to build the download URL and the
        name of the report file.
    dataset_df : pandas.DataFrame, optional
        Table with 'uuid' and 'title' columns used to look up the dataset
        title. When omitted, the notebook-level variable df is used.

    Returns
    -------
    dict
        Keys: title, uuid, n_materials_citations, n_treatments,
        n_invalid_country, n_invalid_locality, n_invalid_recordedBy,
        n_invalid_eventDate, datestamp.

    Raises
    ------
    AssertionError
        If wget or unzip fails, or if merging taxa with occurrences changes
        the number of occurrence records.
    """
    from html import escape  # local import: html is not imported at notebook level

    if dataset_df is None:
        # Fall back to the dataset table that the MAIN cell stores in df.
        dataset_df = df
    title = dataset_df[dataset_df['uuid'] == uuid]['title'].to_list()[0]

    dwca_url = f'http://tb.plazi.org/GgServer/dwca/{uuid}.zip'
    dwca_file = f'{uuid}.zip'

    # download the DwCA into the current working directory,
    # overwriting any previous DwCA with same uuid

    _run_command(['wget', '-O', dwca_file, dwca_url])

    # unzip the DwCA, overwriting files:
    #   meta.xml, eml.xml, taxa.txt, occurrences.txt, multimedia.txt, description.txt, distribution.txt,
    #   media.txt, references.txt, vernaculars.txt

    _run_command(['unzip', '-o', dwca_file])

    # delete zip file

    os.remove(dwca_file)

    taxon_df = pd.read_csv('taxa.txt', sep='\t')
    occ_df = pd.read_csv('occurrences.txt', sep='\t')
    occ_df['eventDate'] = occ_df['eventDate'].astype(str)

    merged_df = taxon_df.merge(right=occ_df, on='taxonID')
    assert merged_df.shape[0]==occ_df.shape[0],'merged_df does not have same number of records as occ_df'

    # Work on an explicit copy so that adding the valid_* columns below never
    # writes into a view of the merged frame.
    report_columns = ['canonicalName', 'country', 'locality', 'recordedBy', 'eventDate']
    merged_df = merged_df[report_columns].copy()

    # Only the year (first four characters of eventDate) is validated.
    merged_df['valid_eventDate'] = merged_df['eventDate'].str[:4].isin(date_list)
    merged_df['valid_recordedBy'] = merged_df['recordedBy'].isin(author_list)
    merged_df['valid_locality'] = merged_df['locality'].isin(locality_list)
    merged_df['valid_country'] = merged_df['country'].isin(country_list)

    # title_html

    safe_url = escape(dwca_url)
    title_html = ''.join([
        '<p class="title is-1">Insects of Guam Datamining Project</p>\n',
        f'<p class="subtitle is-3">{escape(str(title))}</p>\n',
        f'<p><b>Darwin Core Archive:</b> <a href="{safe_url}">{safe_url}</a></p>\n',
        f'<p>Generated by <b>validator2.ipynb</b> at {datetime.utcnow()} UTC</p>\n',
    ])

    # results_html: one row per materials citation, invalid fields marked

    parts = ['<table class="table">\n', '<thead>\n', '<tr>\n']
    parts += [f'<th>{name}</th>\n' for name in report_columns]
    parts += ['</tr>\n', '</thead>\n']

    for row in merged_df.itertuples(index=False):
        parts.append('<tr>\n')
        parts.append(f'<td><i>{escape(str(row.canonicalName))}</i></td>\n')
        parts.append(_validation_cell(row.country, row.valid_country))
        parts.append(_validation_cell(row.locality, row.valid_locality))
        parts.append(_validation_cell(row.recordedBy, row.valid_recordedBy))
        parts.append(_validation_cell(row.eventDate, row.valid_eventDate))
        parts.append('</tr>\n')
    parts.append('</table>\n')
    results_html = ''.join(parts)

    # summary_html

    summary_dict = {}
    summary_dict['title'] = title
    summary_dict['uuid'] = uuid
    summary_dict['n_materials_citations'] = merged_df.shape[0]
    summary_dict['n_treatments'] = len(pd.unique(merged_df['canonicalName']))
    summary_dict['n_invalid_country'] = int((~merged_df['valid_country']).sum())
    summary_dict['n_invalid_locality'] = int((~merged_df['valid_locality']).sum())
    summary_dict['n_invalid_recordedBy'] = int((~merged_df['valid_recordedBy']).sum())
    summary_dict['n_invalid_eventDate'] = int((~merged_df['valid_eventDate']).sum())
    summary_dict['datestamp'] = get_datestamp()
    print(summary_dict)

    summary_headers = [
        'treatments', 'materials_citations', 'invalid_country', 'invalid_locality',
        'invalid_recordedBy', 'invalid_eventDate', 'datestamp',
    ]
    parts = ['<table class="table">\n', '<thead>\n', '<tr>\n']
    parts += [f'<th>{name}</th>\n' for name in summary_headers]
    parts += ['</tr>\n', '</thead>\n', '<tr>\n']
    parts.append(f'<td>{summary_dict["n_treatments"]}</td>\n')
    parts.append(f'<td>{summary_dict["n_materials_citations"]}</td>\n')

    # Only the error counts are marked when non-zero. The datestamp is plain
    # information; comparing it with 0 used to mark it as invalid every time.
    for key in ['n_invalid_country', 'n_invalid_locality', 'n_invalid_recordedBy', 'n_invalid_eventDate']:
        parts.append(_validation_cell(summary_dict[key], summary_dict[key] == 0))
    parts.append(f'<td>{escape(str(summary_dict["datestamp"]))}</td>\n')

    parts.append('</tr>\n')
    parts.append('</table>')
    summary_html = ''.join(parts)

    # Write the validation report

    report_html = f'''
        <html>
            <head>
                <meta charset="utf-8">
                <meta name="viewport" content="width=device-width, initial-scale=1">
                <title>validator2</title>
                <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css">
            </head>
            <body>
                <section class="section">
                    <div class="container">
                        {title_html}
                        {summary_html}
                        {results_html}
                    </div>
                </section>
            </body>
        </html>        
        '''
    # The page declares charset utf-8, so write it as utf-8 regardless of locale.
    with open(f'{uuid}.html', 'w', encoding='utf-8') as f:
        f.write(report_html)

    return summary_dict

# validate_dwca('FE566D11FFD2FFF5383F9056FFE3FFEF')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [10]:
%%time

def generate_status_report(summary_list):
    """
    Writes status_report.html in the current working directory.

    The report contains one table row per entry of summary_list, each
    linking to <uuid>.html, followed by the sorted contents of the
    controlled vocabularies (country_list, locality_list, author_list,
    date_list).

    Parameters
    ----------
    summary_list : list of dict
        Each dict must provide the keys uuid, title, n_treatments,
        n_materials_citations, n_invalid_country, n_invalid_locality,
        n_invalid_recordedBy and n_invalid_eventDate.
    """
    from html import escape  # local import: html is not imported at notebook level

    def vocabulary_section(heading, entries):
        """Returns the HTML for one vocabulary: a heading and its sorted entries."""
        lines = [f'<p class="title is-4">{heading}</p>\n']
        lines += [f'<p>{escape(str(entry))}</p>\n' for entry in sorted(entries)]
        lines.append('<br><br>')
        return ''.join(lines)

    # controlled_vocabularies_html (the section heading is now properly closed)

    controlled_vocabularies_html = ''.join([
        '<p class="title is-3">Controlled vocabularies</p>\n',
        vocabulary_section('Country', country_list),
        vocabulary_section('Locality', locality_list),
        vocabulary_section('recordedBy', author_list),
        vocabulary_section('eventDate (valid years)', date_list),
    ])

    # title_html

    title_html = ''.join([
        '<p class="title is-1">Insects of Guam Datamining Project</p>\n',
        '<p class="subtitle is-3">Status report</p>\n',
        f'<p>Generated by <b>validator2.ipynb</b> at {datetime.utcnow()} UTC</p>',
        '<p>Data are checked against controlled vocabularies listed at the bottom of this report. Currently, these lists are maintained within <b>validator2.ipynb</b></p>\n',
        '<p>Click on a <b>uuid</b> to see validation results for the corresponding chapter.</p>',
    ])

    # table_html

    headers = [
        'uuid', 'title', 'treatments', 'materials_citations', 'invalid_country',
        'invalid_locality', 'invalid_recordedBy', 'invalid_eventDate',
    ]
    count_keys = ['n_invalid_country', 'n_invalid_locality', 'n_invalid_recordedBy', 'n_invalid_eventDate']

    parts = ['<table class="table">\n', '<thead>\n', '<tr>\n']
    parts += [f'<th>{name}</th>\n' for name in headers]
    parts += ['</tr>\n', '</thead>\n']

    for d in summary_list:
        uuid = escape(str(d['uuid']))
        parts.append('<tr>\n')
        # link to validation report for this uuid (the anchor is now closed)
        parts.append(f'<td><a href="{uuid}.html">{uuid}</a></td>\n')
        parts.append(f'<td>{escape(str(d["title"]))}</td>\n')
        parts.append(f'<td>{d["n_treatments"]}</td>\n')
        parts.append(f'<td>{d["n_materials_citations"]}</td>\n')

        # Non-zero error counts are marked with the is-selected class
        for x in count_keys:
            if d[x] == 0:
                parts.append(f'<td>{d[x]}</td>\n')
            else:
                parts.append(f'<td class="is-selected">{d[x]}</td>\n')

        parts.append('</tr>\n')
    parts.append('</table>')
    table_html = ''.join(parts)

    # Write the status report

    report_html = f'''
        <html>
            <head>
                <meta charset="utf-8">
                <meta name="viewport" content="width=device-width, initial-scale=1">
                <title>validator2</title>
                <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css">
            </head>
            <body>
                <section class="section">
                    <div class="container">
                        {title_html}
                        {table_html}
                        {controlled_vocabularies_html}
                    </div>
                </section>
            </body>
        </html>        
        '''
    # The page declares charset utf-8, so write it as utf-8 regardless of locale.
    with open('status_report.html', 'w', encoding='utf-8') as f:
        f.write(report_html)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


In [11]:
def cleanup():
    """
    Removes the files left behind by unzipping a Darwin core archive.

    Only names from the fixed list below are considered, and only those
    that currently exist in the working directory are deleted.
    """
    extracted_names = (
        'meta.xml',
        'eml.xml',
        'taxa.txt',
        'occurrences.txt',
        'multimedia.txt',
        'description.txt',
        'distribution.txt',
        'media.txt',
        'references.txt',
        'vernaculars.txt',
    )
    # filter() is lazy, so each existence check happens right before its removal
    for leftover in filter(os.path.exists, extracted_names):
        os.remove(leftover)


# Start from a clean working directory
cleanup()

In [None]:
%%time

# MAIN

# Load the dataset table. The name df must stay as is: validate_dwca()
# reads this notebook-level variable to look up dataset titles.
df = read_dataset_list()

# Validate each DwCA, keeping one summary dict per dataset

print('Validating')

summary_list = []
for dataset_uuid in df['uuid'].tolist():
    summary_list.append(validate_dwca(dataset_uuid))

# Build the overview page from the collected summaries

print('Generating status report')
generate_status_report(summary_list)

# Remove the files extracted from the last archive

print('Cleaning up')
cleanup()

# Publish the generated reports

print('Updating GitHub')
update_github()

print('FINISHED')

Validating
{'title': 'Strepsiptera of Guam', 'uuid': 'FFF07216FFA41642FFBAFFE7FFCA482A', 'n_materials_citations': 2, 'n_treatments': 1, 'n_invalid_country': 0, 'n_invalid_locality': 0, 'n_invalid_recordedBy': 0, 'n_invalid_eventDate': 0, 'datestamp': '2022-07-10T07:33:36+0000'}
{'title': 'Halictine Bees from Rota Island', 'uuid': 'A676FD1EF22D3F34FF8F8907FFDAFC58', 'n_materials_citations': 3, 'n_treatments': 3, 'n_invalid_country': 0, 'n_invalid_locality': 0, 'n_invalid_recordedBy': 0, 'n_invalid_eventDate': 0, 'datestamp': '2022-05-03T23:10:30+0000'}
{'title': 'Aphididae and Aleurodidae Of Guam', 'uuid': 'FF8CA776FF947F25FF920C4AFFF5FF0F', 'n_materials_citations': 9, 'n_treatments': 4, 'n_invalid_country': 0, 'n_invalid_locality': 0, 'n_invalid_recordedBy': 0, 'n_invalid_eventDate': 0, 'datestamp': '2022-07-10T06:41:44+0000'}
{'title': 'Isoptera of Guam', 'uuid': 'FFDEFF89B713A955FFD59822FF8CFF82', 'n_materials_citations': 6, 'n_treatments': 3, 'n_invalid_country': 0, 'n_invalid_local