In [81]:
# Hack of Script to build the Audubon Core term list page using Markdown.
# see https://github.com/tdwg/ac/blob/master/code/build_page.py
# Steve Baskauf 2020-02-10
# This script merges static Markdown header and footer documents with term information tables (in Markdown) generated from data in the rs.tdwg.org repo from the TDWG Github site

# Note: this script calls a function from http_library.py, which requires importing the requests, csv, and json modules
import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub
githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/practice/'

headerFileName = 'termlist-header.md'
footerFileName = 'termlist-footer.md'
outFileName = '../docs/doe.md'

# This is a Python list of the IRIs of the term lists to be included in the document.
termLists = ['degreeOfEstablishment', 'establishmentMeans']

vocab_type = 2 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy
has_versions = True

displayOrder = [''] # in this hack, the value is irrelevant. There just needs to be one item in the list.
displayLabel = ['Vocabulary'] # these are the section labels for the page
displayComments = [''] # these are the comments to be appended following the section labels
displayId = ['Vocabulary'] # these are the fragment identifiers for the associated sections

# ---------------
# Function definitions
# ---------------

# replace URL with link
#
def createLinks(text):
    def repl(match):
        if match.group(1)[-1] == '.':
            return '<a href="' + match.group(1)[:-1] + '">' + match.group(1)[:-1] + '</a>.'
        return '<a href="' + match.group(1) + '">' + match.group(1) + '</a>'

    pattern = '(https?://[^\s,;\)"]*)'
    result = re.sub(pattern, repl, text)
    return result

In [40]:
# don't run

# performs a generic HTTP GET
def httpGet(baseUri,acceptMime):
    if acceptMime == '':
        acceptMime = '*.*'                         # if no mime type specified, accept anything
    headerDict = {'Accept' : acceptMime}           # headers are sent as a dictionary
    r = requests.get(baseUri, headers=headerDict)
    return [r.status_code, r.text]                 # status code is an integer, response body is a string

# requests tabular data from an API and returns a table consisting of a list of lists.  If GET fails, the list is empty
def retrieveData(baseUri, responseType, param1):    # For CSV, param1 is the delimiter character.  For JSON, param1 is the key of the data array.
    if responseType == 'csv':
        acceptMime = 'text/csv'
    elif responseType == 'json':
        acceptMime = 'application/json'
    elif responseType == 'xml':
        acceptMime = 'text/xml'
    else:
        acceptMime = '*.*'

    response = httpGet(baseUri, acceptMime)
    table = list()                                 # create an empty table
    if response[0] == 200:                         # process data only if GET is successful

        if responseType == 'csv':
            tableRows = response[1].split('\n')    # split the response string into lines at the newline character to make a list
            tableReader = csv.reader(tableRows, delimiter=param1, quotechar='"')  # csv.reader can operate on any iteratible object including a list
            for row in tableReader:                # need to convert the _csv.reader object into an actual list
                if len(row) != 0:
                    table.append(row)
                    
        if responseType == 'json':
            responseDict = json.loads(response[1]) # response string converted to a dictionary
            dataArray = responseDict[param1]       # param1 is the key of the dictionary item that contains the data array
            aggregationDict = {}
            for item in dataArray:                 # this loop updates a dictionary with every dictionary in the data array, resulting in a dictionary that conatins all keys used in any of the individual dictionaries
                aggregationDict.update(item)
            keyList = list(aggregationDict.keys()) # now generate a list of all of the keys that were found
            table.append(keyList)                  # create the header list (item 0 in list of lists)
            for item in dataArray:                 # step through each of the dictionaries in the data array and find the value for each key in the key list
                tableRowList = list()
                for key in keyList:
                    try:                           # need to error trap the case where a dictionary is missing one of the keys
                        tableRowList.append(item[key])
                    except:
                        tableRowList.append('')
                table.append(tableRowList)
            
        return table                               # returns a table consisting of a list of rows that consist of a list of fields

# outputs a list-of-lists table as a fielded text (CSV) file
def outputTableToFile(table, fileName, delimCharacter):
    outObject = open(fileName, 'wt', newline = '', encoding = 'utf-8') # explicitly specify utf-8 encoding so that it doesn't default to the system preferred encoding (cp1252 on PCs)
    csvOutput = csv.writer(outObject, delimiter = delimCharacter)
    for row in table:
        csvOutput.writerow(row)
    outObject.close()

# replace URL with link
#
def createLinks(text):
    def repl(match):
        if match.group(1)[-1] == '.':
            return '<a href="' + match.group(1)[:-1] + '">' + match.group(1)[:-1] + '</a>.'
        return '<a href="' + match.group(1) + '">' + match.group(1) + '</a>'

    pattern = '(https?://[^\s,;\)"]*)'
    result = re.sub(pattern, repl, text)
    return result

In [None]:
#termLists = retrieveVocabularyInfo(githubBaseUri)

#listMetadata = retrieveTermListMetadata(githubBaseUri)

#table = createMasterMetadataTable(termLists, listMetadata)

#localnameSortedTable = sorted(table, key = lambda term: term[2].lower() ) # perform sort on lowercase of the third column: localNameColumn
#labelSortedTable = sorted(table, key = lambda term: term[3].lower() ) # perform sort on lowercase of the fourth column: labelColumn

#indexByName = buildIndexByTermName(localnameSortedTable, displayOrder, displayLabel, displayId)
#indexByLabel = buildIndexByTermLabel(labelSortedTable, displayOrder, displayLabel, displayId)
#termTable = buildMarkdown(localnameSortedTable, displayOrder, displayLabel, displayComments, displayId)

#text = indexByName + indexByLabel + termTable

#outputMarkdown(text, headerFileName, footerFileName, outFileName)


In [None]:
# DON'T RUN

# create dictionaries of metadata about term lists
#def retrieveTermListMetadata(githubBaseUri):
if 1==1:
    # retrieve term list metadata from Github
    dataUrl = githubBaseUri + 'term-lists/term-lists.csv'
    table = retrieveData(dataUrl, 'csv', ',')
#    table = http_library.retrieveData(dataUrl, 'csv', ',')
    header = table[0]

    # determine which columns contain the namespace info
    for column in range(len(header)):
        if header[column] == 'list':
            listColumn = column
        if header[column] == 'vann_preferredNamespacePrefix':
            prefixColumn = column
        if header[column] == 'vann_preferredNamespaceUri':
            uriColumn = column

    listFilename = {}
    listNamespace = {}
    listUri = {}

    for row in range(1,len(table)):    #skip the header row
        for termList in termLists:
            if termList == table[row][listColumn]:
                print(table[row][listColumn])
                listNamespace[termList] = table[row][prefixColumn] # make a dictionary of namespaces
                listUri[termList] = table[row][uriColumn] # make a dictionary of URIs
                if table[row][prefixColumn] == 'dwcdoe':
                    listFilename[termList] = 'degreeOfEstablishment'
#                else:
#                    listFilename[termList] = table[row][prefixColumn] + '-for-ac' # make a dictionary of filenames
#    return [listFilename, listNamespace, listUri]
    listMetadata = [listFilename, listNamespace, listUri]

In [82]:
term_lists_info = []

frame = pd.read_csv(githubBaseUri + 'term-lists/term-lists.csv', na_filter=False)
for termList in termLists:
    term_list_dict = {'list_iri': termList}
    term_list_dict = {'database': termList}
    for index,row in frame.iterrows():
        if row['database'] == termList:
            term_list_dict['pref_ns_prefix'] = row['vann_preferredNamespacePrefix']
            term_list_dict['pref_ns_uri'] = row['vann_preferredNamespaceUri']
            term_list_dict['list_iri'] = row['list']
    term_lists_info.append(term_list_dict)
print(term_lists_info)

[{'database': 'degreeOfEstablishment', 'pref_ns_prefix': 'dwcdoe', 'pref_ns_uri': 'http://rs.tdwg.org/dwc/doe/', 'list_iri': 'http://rs.tdwg.org/dwc/doe/'}, {'database': 'establishmentMeans', 'pref_ns_prefix': 'dwcem', 'pref_ns_uri': 'http://rs.tdwg.org/dwc/em/', 'list_iri': 'http://rs.tdwg.org/dwc/em/'}]


In [83]:
# Create column list
column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'definition', 'usage', 'notes']
if vocab_type == 2:
    column_list += ['controlled_value_string']
elif vocab_type == 3:
    column_list += ['controlled_value_string', 'skos_broader']
if has_versions:
    column_list.append('version_iri')

# Create list of lists metadata table
table_list = []
for term_list in term_lists_info:
    # retrieve versions metadata for term list
    versions_url = githubBaseUri + term_list['database'] + '-versions/' + term_list['database'] + '-versions.csv'
    versions_df = pd.read_csv(versions_url, na_filter=False)
    
    # retrieve current term metadata for term list
    data_url = githubBaseUri + term_list['database'] + '/' + term_list['database'] + '.csv'
    frame = pd.read_csv(data_url, na_filter=False)
    for index,row in frame.iterrows():
        row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['definition'], row['usage'], row['notes']]
        if vocab_type == 2:
            row_list += [row['controlled_value_string']]
        elif vocab_type == 3:
            row_list += [row['controlled_value_string'], term_list['pref_ns_prefix'] + ':' + row['skos_broader']]
        if has_versions:
            found = False
            for vindex, vrow in versions_df.iterrows():
                if vrow['term_localName']==row['term_localName'] and vrow['version_status']=='recommended':
                    found = True
                    version_iri = vrow['version']
                    # NOTE: the current hack for non-TDWG terms without a version is to append # to the end of the term IRI
                    if version_iri[len(version_iri)-1] == '#':
                        version_iri = ''
            if not found:
                version_iri = ''
            row_list.append(version_iri)
        table_list.append(row_list)

# Turn list of lists into dataframe
terms_df = pd.DataFrame(table_list, columns = column_list)
terms_df.head()

Unnamed: 0,pref_ns_prefix,pref_ns_uri,term_localName,label,definition,usage,notes,controlled_value_string,version_iri
0,dwcdoe,http://rs.tdwg.org/dwc/doe/,d001,native (category A),Not transported beyond limits of native range,,Considered native and naturally occuring. See ...,native,http://rs.tdwg.org/dwc/doe/version/d001-2020-0...
1,dwcdoe,http://rs.tdwg.org/dwc/doe/,d002,captive (category B1),Individuals in captivity or quarantine (i.e. i...,Only for cases where specific actions have bee...,See also Blackburn et al. 2011 https://doi.org...,captive,http://rs.tdwg.org/dwc/doe/version/d002-2020-0...
2,dwcdoe,http://rs.tdwg.org/dwc/doe/,d003,cultivated (category B2),Individuals in cultivation (i.e. individuals p...,,"Examples include gardens, parks and farms. See...",cultivated,http://rs.tdwg.org/dwc/doe/version/d003-2020-0...
3,dwcdoe,http://rs.tdwg.org/dwc/doe/,d004,released (category B3),Individuals directly released into novel envir...,,"For example, fish stocked for angling, birds f...",released,http://rs.tdwg.org/dwc/doe/version/d004-2020-0...
4,dwcdoe,http://rs.tdwg.org/dwc/doe/,d005,failing (category C0),Individuals released outside of captivity or c...,,Such as frost tender plants sown or planted in...,failing,http://rs.tdwg.org/dwc/doe/version/d005-2020-0...


In [None]:
# don't run

localnameSortedTable = sorted(masterTable, key = lambda term: term[2].lower() ) # perform sort on lowercase of the third column: localNameColumn
labelSortedTable = sorted(masterTable, key = lambda term: term[3].lower() ) # perform sort on lowercase of the fourth column: labelColumn
print(labelSortedTable)

In [84]:
terms_sorted_by_label = terms_df.sort_values(by='label')
terms_sorted_by_localname = terms_df.sort_values(by='term_localName')
terms_sorted_by_label

Unnamed: 0,pref_ns_prefix,pref_ns_uri,term_localName,label,definition,usage,notes,controlled_value_string,version_iri
1,dwcdoe,http://rs.tdwg.org/dwc/doe/,d002,captive (category B1),Individuals in captivity or quarantine (i.e. i...,Only for cases where specific actions have bee...,See also Blackburn et al. 2011 https://doi.org...,captive,http://rs.tdwg.org/dwc/doe/version/d002-2020-0...
5,dwcdoe,http://rs.tdwg.org/dwc/doe/,d006,casual (category C1),Individuals surviving outside of captivity or ...,,Trees planted in the wild for forestry or orna...,casual,http://rs.tdwg.org/dwc/doe/version/d006-2020-0...
8,dwcdoe,http://rs.tdwg.org/dwc/doe/,d009,colonising (category D1),Self-sustaining population outside of captivit...,,The population is maintained by reproduction a...,colonising,http://rs.tdwg.org/dwc/doe/version/d009-2020-0...
2,dwcdoe,http://rs.tdwg.org/dwc/doe/,d003,cultivated (category B2),Individuals in cultivation (i.e. individuals p...,,"Examples include gardens, parks and farms. See...",cultivated,http://rs.tdwg.org/dwc/doe/version/d003-2020-0...
7,dwcdoe,http://rs.tdwg.org/dwc/doe/,d008,established (category C3),Individuals surviving outside of captivity or ...,,"The population is maintained by reproduction, ...",established,http://rs.tdwg.org/dwc/doe/version/d008-2020-0...
4,dwcdoe,http://rs.tdwg.org/dwc/doe/,d005,failing (category C0),Individuals released outside of captivity or c...,,Such as frost tender plants sown or planted in...,failing,http://rs.tdwg.org/dwc/doe/version/d005-2020-0...
13,dwcem,http://rs.tdwg.org/dwc/em/,e003,"introduced (alien, exotic, non-native, nonindi...",Establishment of a taxon by human agency into ...,,Organisms can be introduced to novel areas and...,introduced,http://rs.tdwg.org/dwc/em/version/e003-2020-06-15
14,dwcem,http://rs.tdwg.org/dwc/em/,e004,introduced: assisted colonisation,Establishment of a taxon specifically with the...,,In the event of environmental change and habit...,introducedAssistedColonisation,http://rs.tdwg.org/dwc/em/version/e004-2020-06-15
9,dwcdoe,http://rs.tdwg.org/dwc/doe/,d010,invasive (category D2),Self-sustaining population outside of captivit...,,"The population is maintained by reproduction, ...",invasive,http://rs.tdwg.org/dwc/doe/version/d010-2020-0...
0,dwcdoe,http://rs.tdwg.org/dwc/doe/,d001,native (category A),Not transported beyond limits of native range,,Considered native and naturally occuring. See ...,native,http://rs.tdwg.org/dwc/doe/version/d001-2020-0...


In [None]:
'''
# generate the index of terms grouped by category and sorted alphabetically by lowercase term local name
if 1==1:
#def buildIndexByTermName(table, displayOrder, displayLabel, displayId):
    table = localnameSortedTable
    text = '### 6.1 Index By Term Name\n\n'
    text += '(See also [6.2 Index By Label](#62-index-by-label))\n\n'
    for category in range(0,len(displayOrder)):
        text += '**' + displayLabel[category] + '**\n'
        text += '\n'
        filteredTable = [x for x in table if x[10] == displayOrder[category]]
        for row in range(0,len(filteredTable)):    #no header row
            curie = filteredTable[row][0] + ":" + filteredTable[row][2]
            curieAnchor = curie.replace(':','_')
            text += '[' + curie + '](#' + curieAnchor + ')'
            if row < len(filteredTable) - 1:
                text += ' |'
            text += '\n'
        text += '\n'
#    return text
    indexByName = text
'''

In [85]:
if 1==1:
#def buildIndexByTermLabel(table, displayOrder, displayLabel, displayId):
    text = '\n\n'
#    text = '### 6.2 Index By Label\n\n'
#    text += '(See also [6.1 Index By Term Name](#61-index-by-term-name))\n\n'
    for category in range(0,len(displayOrder)):
#        text += '**' + displayLabel[category] + '**\n'
#        text += '\n'
        filteredTable = terms_sorted_by_label
#        filteredTable = [x for x in table if x[10] == displayOrder[category]]
        for row_index,row in filteredTable.iterrows():
            if row_index == 0 or (row_index != 0 and row['label'] != filteredTable.iloc[row_index - 1].loc['label']): # this is a hack to prevent duplicate labels
                curieAnchor = row['pref_ns_prefix'] + "_" + row['term_localName']
                text += '[' + row['label'] + '](#' + curieAnchor + ')'
                if row_index < len(filteredTable) - 2 or (row_index == len(filteredTable) - 2 and row['label'] != filteredTable.iloc[row_index + 1].loc['label']):
                    text += ' |'
                text += '\n'
        text += '\n'
#    return text
    indexByLabel = text
print(indexByLabel)



[casual (category C1)](#dwcdoe_d006) |
[colonising (category D1)](#dwcdoe_d009) |
[cultivated (category B2)](#dwcdoe_d003) |
[established (category C3)](#dwcdoe_d008) |
[failing (category C0)](#dwcdoe_d005) |
[introduced (alien, exotic, non-native, nonindigenous)](#dwcem_e003) |
[introduced: assisted colonisation](#dwcem_e004) |
[native (category A)](#dwcdoe_d001) |
[released (category B3)](#dwcdoe_d004) |
[reproducing (category C2)](#dwcdoe_d007) |
[uncertain (unknown, cryptogenic)](#dwcem_e006)
[vagrant (casual)](#dwcem_e005) |
[widespread invasive (category E)](#dwcdoe_d011) |




In [86]:
# generate a table for each term, with terms grouped by category
if 1==1:
#def buildMarkdown(table, displayOrder, displayLabel, displayComments, displayId):

    # generate the Markdown for the terms table
    text = '## 4 Vocabulary\n'
    for category in range(0,len(displayOrder)):
#        text += '### 7.' + str(category + 1) + ' ' + displayLabel[category] + '\n'
#        text += '\n'
#        text += displayComments[category] # insert the comments for the category, if any.
        for row_index,row in terms_sorted_by_localname.iterrows():
        #for row in range(0,len(table)):    #no header row
            if 1==1:
#            if displayOrder[category] == table[row][10]:
                text += '<table>\n'
                curie = row['pref_ns_prefix'] + ":" + row['term_localName']
                curieAnchor = curie.replace(':','_')
                text += '\t<thead>\n'
                text += '\t\t<tr>\n'
                text += '\t\t\t<th colspan="2"><a id="' + curieAnchor + '"></a>Term Name: ' + curie + '</th>\n'
                text += '\t\t</tr>\n'
                text += '\t</thead>\n'
                text += '\t<tbody>\n'
                text += '\t\t<tr>\n'
                text += '\t\t\t<td>Normative IRI:</td>\n'
                uri = row['pref_ns_uri'] + row['term_localName']
                text += '\t\t\t<td><a href="' + uri + '">' + uri + '</a></td>\n'
                text += '\t\t</tr>\n'

                if row['version_iri'] != '':
                    text += '\t\t<tr>\n'
                    text += '\t\t\t<td>Version</td>\n'
                    text += '\t\t\t<td><a href="' + row['version_iri'] + '">' + row['version_iri'] + '</a></td>\n'
                    text += '\t\t</tr>\n'

                text += '\t\t<tr>\n'
                text += '\t\t\t<td>Label</td>\n'
                text += '\t\t\t<td>' + row['label'] + '</td>\n'
                text += '\t\t</tr>\n'
                text += '\t\t<tr>\n'
                text += '\t\t\t<td>Definition</td>\n'
                text += '\t\t\t<td>' + row['definition'] + '</td>\n'
                text += '\t\t</tr>\n'
                if row['usage'] != '':
                    text += '\t\t<tr>\n'
                    text += '\t\t\t<td>Usage</td>\n'
                    text += '\t\t\t<td>' + createLinks(row['usage']) + '</td>\n'
                    #text += '\t\t\t<td>' + row['usage'] + '</td>\n'
                    text += '\t\t</tr>\n'
                if row['notes'] != '':
                    text += '\t\t<tr>\n'
                    text += '\t\t\t<td>Notes</td>\n'
                    text += '\t\t\t<td>' + createLinks(row['notes']) + '</td>\n'
                    #text += '\t\t\t<td>' + row['notes'] + '</td>\n'
                    text += '\t\t</tr>\n'

                if vocab_type == 2 or vocab_type ==3: # controlled vocabulary
                    text += '\t\t<tr>\n'
                    text += '\t\t\t<td>Controlled Value</td>\n'
                    text += '\t\t\t<td>' + row['controlled_value_string'] + '</td>\n'
                    text += '\t\t</tr>\n'

                if vocab_type == 3: # controlled vocabulary with skos:broader relationships
                    text += '\t\t<tr>\n'
                    text += '\t\t\t<td>Has broader concept</td>\n'
                    curieAnchor = row['skos_broader'].replace(':','_')
                    text += '\t\t\t<td><a href="#' + curieAnchor + '">' + row['skos_broader'] + '</a></td>\n'
                    text += '\t\t</tr>\n'

                text += '\t</tbody>\n'
                text += '</table>\n'
                text += '\n'
        text += '\n'
#    return text
    termTable = text
print(termTable)

## 4 Vocabulary
<table>
	<thead>
		<tr>
			<th colspan="2"><a id="dwcdoe_d001"></a>Term Name: dwcdoe:d001</th>
		</tr>
	</thead>
	<tbody>
		<tr>
			<td>Normative IRI:</td>
			<td><a href="http://rs.tdwg.org/dwc/doe/d001">http://rs.tdwg.org/dwc/doe/d001</a></td>
		</tr>
		<tr>
			<td>Version</td>
			<td><a href="http://rs.tdwg.org/dwc/doe/version/d001-2020-06-15">http://rs.tdwg.org/dwc/doe/version/d001-2020-06-15</a></td>
		</tr>
		<tr>
			<td>Label</td>
			<td>native (category A)</td>
		</tr>
		<tr>
			<td>Definition</td>
			<td>Not transported beyond limits of native range</td>
		</tr>
		<tr>
			<td>Notes</td>
			<td>Considered native and naturally occuring. See also Blackburn et al. 2011 <a href="https://doi.org/10.1016/j.tree.2011.03.023">https://doi.org/10.1016/j.tree.2011.03.023</a> category A</td>
		</tr>
		<tr>
			<td>Controlled Value</td>
			<td>native</td>
		</tr>
	</tbody>
</table>

<table>
	<thead>
		<tr>
			<th colspan="2"><a id="dwcdoe_d002"></a>Term Name: dwcdoe:d002</th>

In [87]:
text = indexByLabel + termTable
#text = indexByName + indexByLabel + termTable

In [88]:
# read in header and footer, merge with terms table, and output
if 1==1:
#def outputMarkdown(text, headerFileName, footerFileName, outFileName):
    headerObject = open(headerFileName, 'rt', encoding='utf-8')
    header = headerObject.read()
    headerObject.close()

    footerObject = open(footerFileName, 'rt', encoding='utf-8')
    footer = footerObject.read()
    footerObject.close()

    output = header + text + footer
    outputObject = open(outFileName, 'wt', encoding='utf-8')
    outputObject.write(output)
    outputObject.close()
print('done')

done
