In [None]:
# Hack of Script to build the Audubon Core term list page using Markdown.
# see https://github.com/tdwg/ac/blob/master/code/build_page.py
# Steve Baskauf 2020-02-10
# This script merges static Markdown header and footer documents with term information tables (in Markdown) generated from data in the rs.tdwg.org repo from the TDWG Github site

# Note: the HTTP helper functions defined in the next cell (apparently copied in from http_library.py) require the requests, csv, and json modules
import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures

In [None]:
# performs a generic HTTP GET
def httpGet(baseUri,acceptMime):
    """Issue an HTTP GET and return the status code and body.

    Parameters:
        baseUri: the URL to dereference.
        acceptMime: value for the Accept request header; an empty string
            means "accept anything".

    Returns:
        A two-item list: [integer status code, response body as a string].
    """
    if acceptMime == '':
        acceptMime = '*/*'                         # '*/*' is the wildcard media range; '*.*' is not valid Accept syntax
    headerDict = {'Accept' : acceptMime}           # headers are sent as a dictionary
    r = requests.get(baseUri, headers=headerDict)
    return [r.status_code, r.text]                 # status code is an integer, response body is a string

# requests tabular data from an API and returns a table consisting of a list of lists.  If GET fails, the list is empty
def retrieveData(baseUri, responseType, param1):    # For CSV, param1 is the delimiter character.  For JSON, param1 is the key of the data array.
    """Fetch tabular data over HTTP and return it as a list of rows.

    Parameters:
        baseUri: the URL to retrieve.
        responseType: 'csv', 'json' or 'xml'; selects the Accept header sent.
            Only 'csv' and 'json' bodies are parsed; any other value yields
            an empty table even when the GET succeeds.
        param1: for CSV, the delimiter character; for JSON, the key of the
            top-level item holding the array of record dictionaries.

    Returns:
        A list of lists. For JSON the first row is the list of every key seen
        in any record, and missing values are filled with ''. The list is
        empty when the status code is not 200.
    """
    import io  # local import so this function does not depend on the notebook's import cell

    mimeTypes = {'csv': 'text/csv', 'json': 'application/json', 'xml': 'text/xml'}
    acceptMime = mimeTypes.get(responseType, '*/*')   # unknown type: accept anything

    response = httpGet(baseUri, acceptMime)
    table = list()                                 # create an empty table
    if response[0] != 200:                         # process data only if GET is successful
        return table                               # always a list, so callers never receive None

    if responseType == 'csv':
        # Feed the reader a text stream with newline='' so that line breaks
        # inside quoted fields are preserved rather than lost by a manual split.
        tableReader = csv.reader(io.StringIO(response[1], newline=''), delimiter=param1, quotechar='"')
        for row in tableReader:                    # need to convert the _csv.reader object into an actual list
            if len(row) != 0:                      # skip blank lines
                table.append(row)

    elif responseType == 'json':
        responseDict = json.loads(response[1])     # response string converted to a dictionary
        dataArray = responseDict[param1]           # param1 is the key of the dictionary item that contains the data array
        aggregationDict = {}
        for item in dataArray:                     # merge every record so the result contains all keys used in any record
            aggregationDict.update(item)
        keyList = list(aggregationDict.keys())     # now generate a list of all of the keys that were found
        table.append(keyList)                      # create the header list (item 0 in list of lists)
        for item in dataArray:                     # one output row per record, in key-list order
            table.append([item.get(key, '') for key in keyList])  # a record missing a key gets an empty string

    return table                                   # returns a table consisting of a list of rows that consist of a list of fields

# outputs a list-of-lists table as a fielded text (CSV) file
def outputTableToFile(table, fileName, delimCharacter):
    """Write a list-of-lists table to a delimited text file.

    Parameters:
        table: iterable of rows, each row an iterable of field values.
        fileName: path of the file to create or overwrite.
        delimCharacter: single character used to separate fields.
    """
    # explicitly specify utf-8 encoding so that it doesn't default to the system preferred encoding (cp1252 on PCs);
    # the with-block guarantees the file is closed even if writing a row fails
    with open(fileName, 'wt', newline = '', encoding = 'utf-8') as outObject:
        csvOutput = csv.writer(outObject, delimiter = delimCharacter)
        csvOutput.writerows(table)

# replace URL with link
#
def createLinks(text):
    """Wrap every http/https URL found in text in an HTML anchor element.

    A URL runs until the first whitespace character, comma, semicolon,
    closing parenthesis or double quote. If the matched URL ends in a period,
    the period is treated as sentence punctuation and placed after the
    closing tag instead of inside the link.

    Parameters:
        text: the string to scan.

    Returns:
        The string with each URL replaced by <a href="URL">URL</a>.
    """
    def repl(match):
        url = match.group(1)
        if url[-1] == '.':                          # trailing period belongs to the sentence, not the URL
            return '<a href="' + url[:-1] + '">' + url[:-1] + '</a>.'
        return '<a href="' + url + '">' + url + '</a>'

    # raw string so that \s and \) reach the regex engine without relying on
    # Python passing unknown string escapes through (which raises a warning)
    pattern = r'(https?://[^\s,;\)"]*)'
    return re.sub(pattern, repl, text)

In [None]:
# constants
# Root URL that every data file request in this notebook is built from
# (presumably the raw content of the "cv" branch of tdwg/rs.tdwg.org -- confirm the branch is still current).
githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/cv/'
# Static Markdown files that are placed before and after the generated text, and the file the merged page is written to.
headerFileName = 'termlist-header.md'
footerFileName = 'termlist-footer.md'
outFileName = '../docs/termlist.md'

# Category lists inherited from the original script. In the active cells below only
# len(displayOrder) is used (as the number of passes over the table); the category
# filters that compared against displayOrder are commented out, and displayLabel,
# displayComments and displayId are referenced only from commented-out code.
displayOrder = ['http://rs.tdwg.org/dwc/terms/attributes/Management']
displayLabel = ['Vocabulary']
displayComments = ['']
displayId = ['Vocabulary']

#termLists = retrieveVocabularyInfo(githubBaseUri)

#listMetadata = retrieveTermListMetadata(githubBaseUri)

#table = createMasterMetadataTable(termLists, listMetadata)

#localnameSortedTable = sorted(table, key = lambda term: term[2].lower() ) # perform sort on lowercase of the third column: localNameColumn
#labelSortedTable = sorted(table, key = lambda term: term[3].lower() ) # perform sort on lowercase of the fourth column: labelColumn

#indexByName = buildIndexByTermName(localnameSortedTable, displayOrder, displayLabel, displayId)
#indexByLabel = buildIndexByTermLabel(labelSortedTable, displayOrder, displayLabel, displayId)
#termTable = buildMarkdown(localnameSortedTable, displayOrder, displayLabel, displayComments, displayId)

#text = indexByName + indexByLabel + termTable

#outputMarkdown(text, headerFileName, footerFileName, outFileName)


In [None]:
# Download the vocabulary-membership table from Github and collect the term
# list identifiers that belong to the http://rs.tdwg.org/dwc/ vocabulary.
# (Inline form of what was formerly retrieveVocabularyInfo(githubBaseUri).)
dataUrl = githubBaseUri + 'vocabularies/vocabularies-members.csv'
table = retrieveData(dataUrl, 'csv', ',')
header = table[0]

# locate the columns holding the vocabulary and term list ids
for position, columnName in enumerate(header):
    if columnName == 'termList':
        termListColumn = position
    elif columnName == 'vocabulary':
        vocabularyColumn = position

# keep the term list id of every data row (header skipped) in the target vocabulary
termLists = [
    record[termListColumn]
    for record in table[1:]
    if record[vocabularyColumn] == 'http://rs.tdwg.org/dwc/'
]
print(termLists)

In [None]:
# Restrict processing to a single term list. This assignment replaces whatever
# value termLists held from an earlier cell.
termLists = ['http://rs.tdwg.org/dwc/doe/']

In [None]:
# Build three dictionaries, each keyed by term list identifier, describing the
# term lists named in termLists: data file name, namespace prefix, namespace URI.
# (Inline form of what was formerly retrieveTermListMetadata(githubBaseUri).)

# retrieve term list metadata from Github
dataUrl = githubBaseUri + 'term-lists/term-lists.csv'
table = retrieveData(dataUrl, 'csv', ',')
header = table[0]

# locate the columns that carry the list id and the namespace info
for position, columnName in enumerate(header):
    if columnName == 'list':
        listColumn = position
    elif columnName == 'vann_preferredNamespacePrefix':
        prefixColumn = position
    elif columnName == 'vann_preferredNamespaceUri':
        uriColumn = position

listFilename, listNamespace, listUri = {}, {}, {}

for record in table[1:]:               # data rows only; the header is row 0
    for termList in termLists:
        if record[listColumn] != termList:
            continue
        print(record[listColumn])
        prefix = record[prefixColumn]
        listNamespace[termList] = prefix             # namespace prefix for this list
        listUri[termList] = record[uriColumn]        # namespace URI for this list
        # Only the 'dwcdoe' prefix is mapped to a file name; any other list
        # gets no listFilename entry (the generic fallback is disabled).
        if prefix == 'dwcdoe':
            listFilename[termList] = 'degreeOfEstablishment'

listMetadata = [listFilename, listNamespace, listUri]

In [None]:
# Inspect the metadata: a list of three dictionaries (file names, namespace
# prefixes, namespace URIs), each keyed by term list identifier.
print(listMetadata)

In [None]:
# Merge the term metadata of every list in termLists into one table. Each output
# row is: [namespace prefix, namespace URI, local name, label, definition,
# usage, notes, rdf_value].
# (Inline form of what was formerly createMasterMetadataTable(termLists, listMetadata).)
fileNameDict, namespaceDict, uriDict = listMetadata
masterTable = []

for termList in termLists:
    # retrieve term metadata for a particular list from Github;
    # the file lives at <name>/<name>.csv under the base URI
    termListFile = fileNameDict[termList]
    dataUrl = githubBaseUri + termListFile + '/' + termListFile + '.csv'
    table = retrieveData(dataUrl, 'csv', ',')
    header = table[0]

    # determine which columns contain specified metadata fields
    for position, columnName in enumerate(header):
        if columnName == 'term_localName':
            localNameColumn = position
        elif columnName == 'label':
            labelColumn = position
        elif columnName == 'definition':
            definitionColumn = position
        elif columnName == 'usage':
            scopeNoteColumn = position
        elif columnName == 'notes':
            notesColumn = position
        elif columnName == 'rdf_value':
            valueColumn = position

    # one master row per data row (header skipped)
    masterTable.extend(
        [
            namespaceDict[termList],
            uriDict[termList],
            record[localNameColumn],
            record[labelColumn],
            record[definitionColumn],
            record[scopeNoteColumn],
            record[notesColumn],
            record[valueColumn],
        ]
        for record in table[1:]
    )

# later cells read the combined result through the name `table`
table = masterTable
#print(table)

In [None]:
# Two independently sorted copies of the master table; `table` itself is left untouched.
localnameSortedTable = list(table)
localnameSortedTable.sort(key = lambda term: term[2].lower())   # case-insensitive by third column: local name
labelSortedTable = list(table)
labelSortedTable.sort(key = lambda term: term[3].lower())       # case-insensitive by fourth column: label
#print(labelSortedTable)

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string, so none of it
# executes and indexByName is never defined. Note before re-enabling: it filters
# on x[10], but the rows appended to the master table in this notebook have only
# eight fields (indices 0-7), so that filter would need to be adapted.
'''
# generate the index of terms grouped by category and sorted alphabetically by lowercase term local name
if 1==1:
#def buildIndexByTermName(table, displayOrder, displayLabel, displayId):
    table = localnameSortedTable
    text = '### 6.1 Index By Term Name\n\n'
    text += '(See also [6.2 Index By Label](#62-index-by-label))\n\n'
    for category in range(0,len(displayOrder)):
        text += '**' + displayLabel[category] + '**\n'
        text += '\n'
        filteredTable = [x for x in table if x[10] == displayOrder[category]]
        for row in range(0,len(filteredTable)):    #no header row
            curie = filteredTable[row][0] + ":" + filteredTable[row][2]
            curieAnchor = curie.replace(':','_')
            text += '[' + curie + '](#' + curieAnchor + ')'
            if row < len(filteredTable) - 1:
                text += ' |'
            text += '\n'
        text += '\n'
#    return text
    indexByName = text
'''

In [None]:
# Build the index of terms by label: one Markdown link per distinct label, each
# on its own line, with ' |' after every entry except the last.
# (Inline form of what was formerly buildIndexByTermLabel(...).)
table = labelSortedTable
text = '\n\n'
for category in range(len(displayOrder)):      # one pass per category; rows are not filtered by category
    filteredTable = list(table)
    indexEntries = []
    for position, termRow in enumerate(filteredTable):    #no header row
        # The table is sorted by label, so repeated labels are adjacent:
        # emit a link only for the first row of each run.
        if position > 0 and termRow[3] == filteredTable[position - 1][3]:
            continue
        curieAnchor = termRow[0] + "_" + termRow[2]
        indexEntries.append('[' + termRow[3] + '](#' + curieAnchor + ')')
    # Joining the collected entries puts the separator strictly between entries,
    # so no trailing ' |' appears even when the final label is repeated many times.
    if indexEntries:
        text += ' |\n'.join(indexEntries) + '\n'
    text += '\n'
indexByLabel = text
#print(indexByLabel)

In [None]:
# Render one HTML table per term, in local-name order, under a "4 Vocabulary"
# heading. Rows are not filtered by category, so the whole table is emitted
# once for every entry in displayOrder.
# (Inline form of what was formerly buildMarkdown(...).)
table = localnameSortedTable

# collect the output fragments in order and concatenate them once at the end
pieces = ['## 4 Vocabulary\n']
for category in range(len(displayOrder)):
    for term in table:    #no header row
        curie = term[0] + ":" + term[2]
        curieAnchor = curie.replace(':','_')
        uri = term[1] + term[2]

        # heading row plus the rows every term has
        pieces.extend([
            '<table>\n',
            '\t<thead>\n',
            '\t\t<tr>\n',
            '\t\t\t<th colspan="2"><a id="' + curieAnchor + '"></a>Term Name: ' + curie + '</th>\n',
            '\t\t</tr>\n',
            '\t</thead>\n',
            '\t<tbody>\n',
            '\t\t<tr>\n',
            '\t\t\t<td>Normative IRI:</td>\n',
            '\t\t\t<td><a href="' + uri + '">' + uri + '</a></td>\n',
            '\t\t</tr>\n',
            '\t\t<tr>\n',
            '\t\t\t<td>Label</td>\n',
            '\t\t\t<td>' + term[3] + '</td>\n',
            '\t\t</tr>\n',
            '\t\t<tr>\n',
            '\t\t\t<td>Definition</td>\n',
            '\t\t\t<td>' + term[4] + '</td>\n',
            '\t\t</tr>\n',
        ])

        # optional rows: shown only when the field is non-empty; URLs become links
        if term[5] != '':
            pieces.extend([
                '\t\t<tr>\n',
                '\t\t\t<td>Usage</td>\n',
                '\t\t\t<td>' + createLinks(term[5]) + '</td>\n',
                '\t\t</tr>\n',
            ])
        if term[6] != '':
            pieces.extend([
                '\t\t<tr>\n',
                '\t\t\t<td>Notes</td>\n',
                '\t\t\t<td>' + createLinks(term[6]) + '</td>\n',
                '\t\t</tr>\n',
            ])

        pieces.extend([
            '\t\t<tr>\n',
            '\t\t\t<td>Controlled Value</td>\n',
            '\t\t\t<td>' + term[7] + '</td>\n',
            '\t\t</tr>\n',
            '\t</tbody>\n',
            '</table>\n',
            '\n',
        ])
    pieces.append('\n')

text = ''.join(pieces)
termTable = text
#print(termTable)

In [None]:
# Page body: the label index followed by the per-term tables. The index by term
# name is not included in this version.
text = indexByLabel + termTable
#text = indexByName + indexByLabel + termTable

In [None]:
# read in header and footer, merge with terms table, and output
# (Inline form of what was formerly outputMarkdown(text, headerFileName, footerFileName, outFileName).)
# Each file is handled in a with-block so it is closed even if a read or write fails.
with open(headerFileName, 'rt', encoding='utf-8') as headerObject:
    header = headerObject.read()

with open(footerFileName, 'rt', encoding='utf-8') as footerObject:
    footer = footerObject.read()

# the generated text goes between the static header and footer
output = header + text + footer
with open(outFileName, 'wt', encoding='utf-8') as outputObject:
    outputObject.write(output)