In [46]:
# Script to build JSON-LD pages that provide multilingual labels and definitions for controlled vocabularies
# Steve Baskauf 2020-09-26 CC0

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub. In this example,
# the branch is named "format_cv"
github_base_url = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/format_cv/'
database_name = 'format'

# The translations file and the output file are derived from the two settings above so that
# changing the branch or the database name cannot leave them pointing at stale locations.
translations_url = github_base_url + database_name + '/' + database_name + '-translations.csv'

out_filename = database_name + '.json'

has_broader = True # set to True if any terms have skos:broader values
has_exactMatch = True # set to True if any terms have skos:exactMatch values

# Column-name conventions of the translations spreadsheet: label and definition columns are
# formed from these prefixes followed by a language code.
label_col_prefix = 'label_'
def_col_prefix = 'definition_'
localname_column_header = 'term_localName'

# ---------------
# Function definitions
# ---------------

# replace each URL in the text with an HTML link to that URL
#
def createLinks(text):
    """Return ``text`` with every http(s) URL wrapped in an HTML anchor element.

    A URL runs from ``http://`` or ``https://`` up to (not including) the next
    whitespace character, comma, semicolon, closing parenthesis or double quote.
    When the matched URL ends with a period, the period is treated as sentence
    punctuation: it is left out of the link and re-emitted after the closing tag.

    Parameters:
        text: the string to scan for URLs.

    Returns:
        A new string in which each URL is replaced by
        ``<a href="URL">URL</a>``; text containing no URL is returned unchanged.
    """
    def repl(match):
        url = match.group(1)
        if url[-1] == '.':
            # Trailing period belongs to the sentence, not to the URL.
            url = url[:-1]
            return '<a href="' + url + '">' + url + '</a>.'
        return '<a href="' + url + '">' + url + '</a>'

    # Raw string: \s and \) are regex escapes, not (invalid) Python string escapes.
    pattern = r'(https?://[^\s,;\)"]*)'
    return re.sub(pattern, repl, text)

In [48]:
# Load the vocabulary constants; the namespace IRI is taken from the domainRoot column (row label 0).
frame = pd.read_csv(github_base_url + database_name + '/constants.csv', na_filter=False)
namespace_iri = frame.domainRoot[0]

translations_frame = pd.read_csv(translations_url, na_filter=False)
columns = translations_frame.columns

# Extract the list of languages from the translations spreadsheet column headers.
# Only headers that START with the label prefix count; the language code is whatever follows
# the prefix. English is excluded from the list of translation languages.
languages = []
for column_header in columns:
    if column_header.startswith(label_col_prefix):
        language_code = column_header[len(label_col_prefix):]
        if language_code != 'en':
            languages.append(language_code)
print(languages)

['es', 'zh-Hans']


In [49]:
# Build a lookup keyed by term local name. Each entry maps a language code to a small
# dictionary holding that language's label and definition for the term.
translations_dictionary = {}
for row_label, translation_row in translations_frame.iterrows():
    translations_dictionary[translation_row[localname_column_header]] = {
        language_code: {
            'label': translation_row[label_col_prefix + language_code],
            'definition': translation_row[def_col_prefix + language_code],
        }
        for language_code in languages
    }
print(json.dumps(translations_dictionary, indent = 2))

{
  "m": {
    "es": {
      "label": "Nombre aceptado en uso",
      "definition": "El nombre completo, con autor\u00eda e informaci\u00f3n de fecha si se conoce, del tax\u00f3n actualmente v\u00e1lido (zool\u00f3gico) o aceptado (bot\u00e1nico). "
    },
    "zh-Hans": {
      "label": "\u516c\u8ba4\u4f7f\u7528\u540d\u79f0",
      "definition": "\u76ee\u524d\u6709\u6548\uff08\u52a8\u7269\u5b66\uff09\u7684\u6216\u516c\u8ba4\uff08\u690d\u7269\u5b66\uff09\u7684\u5206\u7c7b\u5355\u5143\u5168\u79f0\uff0c\u5982\u5df2\u77e5\u6765\u6e90\u548c\u65e5\u671f\u4fe1\u606f\u5219\u9700\u6ce8\u660e\u3002"
    }
  },
  "e": {
    "es": {
      "label": "Nombre aceptado en uso - ID",
      "definition": "Un identificador para el nombre en uso (significado del nombre, documentado de acuerdo con alguna fuente) del tax\u00f3n actualmente v\u00e1lido (zool\u00f3gico) o aceptado (bot\u00e1nico).   "
    },
    "zh-Hans": {
      "label": "\u516c\u8ba4\u4f7f\u7528\u540d\u79f0\u7f16\u53f7",
      "definition"

In [75]:
# Collect one plain dictionary per term of the vocabulary, combining the English label and
# definition from the main term table with the translations looked up by term local name.
term_info = []
term_dict = {}
frame = pd.read_csv(github_base_url + database_name + '/' + database_name + '.csv', na_filter=False)
for row_label, term_row in frame.iterrows():
    local_name = term_row['term_localName']
    term_dict = {
        'localname': local_name,
        'iri': namespace_iri + local_name,
        # English first, then every translation language in the order of the languages list.
        'label': [{'language': 'en', 'value': term_row['label']}] + [
            {'language': language_code, 'value': translations_dictionary[local_name][language_code]['label']}
            for language_code in languages
        ],
        'definition': [{'language': 'en', 'value': term_row['definition']}] + [
            {'language': language_code, 'value': translations_dictionary[local_name][language_code]['definition']}
            for language_code in languages
        ],
        'cv_string': term_row['controlled_value_string'],
        # Local names are expanded to full IRIs; an empty cell stays an empty string.
        'scheme': (namespace_iri + term_row['skos_inScheme']) if term_row['skos_inScheme'] != '' else '',
        'type': term_row['type'],
    }
    # The broader/exactMatch columns are only read when the configuration says they are in use.
    if has_broader:
        broader_name = term_row['skos_broader']
        term_dict['broader'] = (namespace_iri + broader_name) if broader_name != '' else ''
    if has_exactMatch:
        match_name = term_row['skos_exactMatch']
        term_dict['match'] = (namespace_iri + match_name) if match_name != '' else ''
    # 'value' carries the same controlled value string as 'cv_string'.
    term_dict['value'] = term_row['controlled_value_string']
    term_info.append(term_dict)
print(json.dumps(term_info, indent = 2))

[
  {
    "localname": "m",
    "iri": "http://rs.tdwg.org/format/values/m",
    "label": [
      {
        "language": "en",
        "value": "media types and physical media concept scheme"
      },
      {
        "language": "es",
        "value": "Nombre aceptado en uso"
      },
      {
        "language": "zh-Hans",
        "value": "\u516c\u8ba4\u4f7f\u7528\u540d\u79f0"
      }
    ],
    "definition": [
      {
        "language": "en",
        "value": "concept scheme for IANA media types and various physical media items"
      },
      {
        "language": "es",
        "value": "El nombre completo, con autor\u00eda e informaci\u00f3n de fecha si se conoce, del tax\u00f3n actualmente v\u00e1lido (zool\u00f3gico) o aceptado (bot\u00e1nico). "
      },
      {
        "language": "zh-Hans",
        "value": "\u76ee\u524d\u6709\u6548\uff08\u52a8\u7269\u5b66\uff09\u7684\u6216\u516c\u8ba4\uff08\u690d\u7269\u5b66\uff09\u7684\u5206\u7c7b\u5355\u5143\u5168\u79f0\uff0c\u5982\u5df2\u7

In [81]:
# Namespace prefixes used by the compact property names in the graph below.
context_dict = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
  }

def _language_tagged(entries):
    """Convert {'language', 'value'} dictionaries into JSON-LD language-tagged value objects."""
    return [{'@language': entry['language'], '@value': entry['value']} for entry in entries]

# (JSON-LD property, key in the term dictionary, whether the property is enabled).
# Order matters: it fixes the key order of each node in the serialized output.
optional_properties = [
    ('rdf:value', 'value', True),
    ('skos:inScheme', 'scheme', True),
    ('skos:broader', 'broader', has_broader),
    ('skos:exactMatch', 'match', has_exactMatch),
]

graph_list = []
for term in term_info:
    term_dict = {'@id': term['iri'], '@type': term['type']}
    # Optional properties are emitted only when enabled and non-empty.
    for property_name, term_key, enabled in optional_properties:
        if enabled and term[term_key] != '':
            term_dict[property_name] = term[term_key]
    term_dict['skos:prefLabel'] = _language_tagged(term['label'])
    term_dict['skos:definition'] = _language_tagged(term['definition'])
    graph_list.append(term_dict)

output = {"@context": context_dict, "@graph": graph_list}
jsonld_output = json.dumps(output, indent = 2)
print(jsonld_output)

{
  "@context": {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
  },
  "@graph": [
    {
      "@id": "http://rs.tdwg.org/format/values/m",
      "@type": "http://www.w3.org/2004/02/skos/core#ConceptScheme",
      "skos:prefLabel": [
        {
          "@language": "en",
          "@value": "media types and physical media concept scheme"
        },
        {
          "@language": "es",
          "@value": "Nombre aceptado en uso"
        },
        {
          "@language": "zh-Hans",
          "@value": "\u516c\u8ba4\u4f7f\u7528\u540d\u79f0"
        }
      ],
      "skos:definition": [
        {
          "@language": "en",
          "@value": "concept scheme for IANA media types and various physical media items"
        },
        {
          "@language": "es",
          "@value": "El nombre completo, con autor\u00eda e 

In [82]:
# Write the serialized JSON-LD to disk. The context manager closes the file even if the
# write raises, so the handle is never leaked.
with open(out_filename, 'wt', encoding='utf-8') as outputObject:
    outputObject.write(jsonld_output)

print('done')

done


In [44]:
decisions_df = pd.read_csv('https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/decisions/decisions-links.csv', na_filter=False)

# Index the decisions once by the IRI of the affected resource (preserving file order), so the
# per-term lookup below is a dictionary access instead of a scan of the whole decisions table.
decisions_by_resource = {}
for affected_iri, decision_name in zip(decisions_df['linked_affected_resource'], decisions_df['decision_localName']):
    decisions_by_resource.setdefault(affected_iri, []).append(decision_name)

# Short display names for the common term types; any other type IRI is shown verbatim.
type_display_names = {
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property': 'Property',
    'http://www.w3.org/2000/01/rdf-schema#Class': 'Class',
    'http://www.w3.org/2004/02/skos/core#Concept': 'Concept',
}

def _table_row(name, content):
    """Return one two-cell HTML table row: ``name`` in the left cell, ``content`` in the right."""
    return ('\t\t<tr>\n' +
            '\t\t\t<td>' + name + '</td>\n' +
            '\t\t\t<td>' + content + '</td>\n' +
            '\t\t</tr>\n')

def _link(href, link_text):
    """Return an HTML anchor element pointing at ``href`` and displaying ``link_text``."""
    return '<a href="' + href + '">' + link_text + '</a>'

# generate a table for each term, with terms grouped by category

# generate the Markdown for the terms table
# The pieces are collected in a list and joined once at the end.
parts = ['## 4 Vocabulary\n']
for category in range(0,len(display_order)):
    if organized_in_categories:
        parts.append('### 4.' + str(category + 1) + ' ' + display_label[category] + '\n')
        parts.append('\n')
        parts.append(display_comments[category]) # insert the comments for the category, if any.
        in_category = terms_sorted_by_localname['tdwgutility_organizedInClass']==display_order[category]
        filtered_table = terms_sorted_by_localname[in_category].reset_index(drop=True)
    else:
        # reset_index returns a new frame here, so terms_sorted_by_localname itself is not modified
        filtered_table = terms_sorted_by_localname.reset_index(drop=True)

    for row_index,row in filtered_table.iterrows():
        curie = row['pref_ns_prefix'] + ":" + row['term_localName']
        curieAnchor = curie.replace(':','_')
        uri = row['pref_ns_uri'] + row['term_localName']

        # Table header: the anchor lets other tables link to this term by its CURIE.
        parts.append('<table>\n' +
                     '\t<thead>\n' +
                     '\t\t<tr>\n' +
                     '\t\t\t<th colspan="2"><a id="' + curieAnchor + '"></a>Term Name  ' + curie + '</th>\n' +
                     '\t\t</tr>\n' +
                     '\t</thead>\n' +
                     '\t<tbody>\n')

        parts.append(_table_row('Term IRI', _link(uri, uri)))
        parts.append(_table_row('Modified', row['term_modified']))

        if row['version_iri'] != '':
            parts.append(_table_row('Term version IRI', _link(row['version_iri'], row['version_iri'])))

        parts.append(_table_row('Label', row['label']))

        if row['term_deprecated'] != '':
            parts.append(_table_row('', '<strong>This term is deprecated and should no longer be used.</strong>'))

        parts.append(_table_row('Definition', row['definition']))

        # Free-text fields: shown only when non-empty, with any URLs turned into links.
        for field_name, row_title in (('usage', 'Usage'), ('notes', 'Notes'), ('examples', 'Examples')):
            if row[field_name] != '':
                parts.append(_table_row(row_title, createLinks(row[field_name])))

        if (vocab_type == 2 or vocab_type == 3) and row['controlled_value_string'] != '': # controlled vocabulary
            parts.append(_table_row('Controlled value', row['controlled_value_string']))

        if vocab_type == 3 and row['skos_broader'] != '': # controlled vocabulary with skos:broader relationships
            curieAnchor = row['skos_broader'].replace(':','_')
            parts.append(_table_row('Has broader concept', _link('#' + curieAnchor, row['skos_broader'])))

        # An unrecognized type falls through to the raw IRI; this should rarely happen.
        parts.append(_table_row('Type', type_display_names.get(row['type'], row['type'])))

        # Look up decisions related to this term
        for decision_name in decisions_by_resource.get(uri, []):
            decision_iri = 'http://rs.tdwg.org/decisions/' + decision_name
            parts.append(_table_row('Executive Committee decision', _link(decision_iri, decision_iri)))

        parts.append('\t</tbody>\n' +
                     '</table>\n' +
                     '\n')
    parts.append('\n')
text = ''.join(parts)
term_table = text

print(term_table)

NameError: name 'display_order' is not defined

Modify the following cell to choose which indices are included in the output.

In [10]:
# Assemble the document body: the selected index section(s) followed by the term tables.
# To also include the index by name, use the commented-out assignment below instead.
text = index_by_label + term_table
#text = index_by_name + index_by_label + term_table

In [11]:
# read in header and footer, merge with terms table, and output
# Each file is handled in a with-block so it is closed even if reading or writing fails.

with open(headerFileName, 'rt', encoding='utf-8') as headerObject:
    header = headerObject.read()

with open(footerFileName, 'rt', encoding='utf-8') as footerObject:
    footer = footerObject.read()

output = header + text + footer
with open(outFileName, 'wt', encoding='utf-8') as outputObject:
    outputObject.write(output)

print('done')

done
