In [2]:
# Script to build JSON-LD pages that provide multilingual labels and definitions for controlled vocabularies
# Steve Baskauf 2020-09-26 CC0

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# Base URL for raw files in the GitHub repository. As written, it points at the
# "master" branch of tdwg/rs.tdwg.org; edit the branch segment to read from another branch.
github_base_url = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/'
# Vocabulary name; used below as both the directory name and the file-name stem.
database_name = 'establishmentMeans'

# Location of the translations spreadsheet: <base>/<name>/<name>-translations.csv
translations_url = f'{github_base_url}{database_name}/{database_name}-translations.csv'

has_broader = False     # set to True if any terms have skos:broader values
has_exactMatch = False  # set to True if any terms have skos:exactMatch values

# Column-header conventions: label and definition columns are the prefix
# followed by a language code; the local-name column has a fixed header.
label_col_prefix, def_col_prefix = 'label_', 'definition_'
localname_column_header = 'term_localName'

# ---------------
# Function definitions
# ---------------

# replace URL with link
#
def createLinks(text):
    """Wrap every http(s) URL found in ``text`` in an HTML anchor element.

    A URL is ``http://`` or ``https://`` followed by every character up to
    (but not including) the first whitespace character, comma, semicolon,
    closing parenthesis, or double quote. If the matched URL ends with a
    period, that period is treated as sentence punctuation: it is left out
    of the link and emitted immediately after the closing ``</a>``.

    Parameters
    ----------
    text : str
        Text that may contain bare URLs.

    Returns
    -------
    str
        ``text`` with each URL replaced by ``<a href="URL">URL</a>``.
        The URL is inserted as-is; no HTML escaping is applied.
    """
    def repl(match):
        url = match.group(1)
        trailing = ''
        # Only a single trailing period is moved outside the link.
        if url[-1] == '.':
            url, trailing = url[:-1], '.'
        return '<a href="' + url + '">' + url + '</a>' + trailing

    # Raw string so that \s and \) reach the regex engine untouched; in a plain
    # string literal they are invalid escape sequences and trigger a
    # DeprecationWarning/SyntaxWarning on current Python versions.
    pattern = r'(https?://[^\s,;\)"]*)'
    return re.sub(pattern, repl, text)

In [3]:
# Load the vocabulary's constants table and take the namespace IRI from row 0.
# na_filter=False keeps empty cells as '' instead of converting them to NaN.
frame = pd.read_csv(github_base_url + database_name + '/constants.csv', na_filter=False)
namespace_iri = frame.domainRoot[0]

translations_frame = pd.read_csv(translations_url, na_filter=False)
columns = translations_frame.columns

# Extract the list of non-English languages from the translations spreadsheet
# column headers. The prefix must appear at the START of the header, and the
# language code is everything after it; a substring test plus split() would
# also pick up headers that merely contain the prefix and would truncate a
# code in which the prefix text happened to recur.
languages = [
    column_header[len(label_col_prefix):]
    for column_header in columns
    if column_header.startswith(label_col_prefix)
    and column_header[len(label_col_prefix):] != 'en'
]
print(languages)

['es', 'nl']


In [4]:
# Build a lookup of translations keyed by term local name:
#   {local_name: {language: {'label': ..., 'definition': ...}}}
translations_dictionary = {}
for _, translation_row in translations_frame.iterrows():
    # One entry per non-English language, read from the prefixed columns.
    per_language = {
        language: {
            'label': translation_row[label_col_prefix + language],
            'definition': translation_row[def_col_prefix + language],
        }
        for language in languages
    }
    translations_dictionary[translation_row[localname_column_header]] = per_language
print(json.dumps(translations_dictionary, indent = 2))

{
  "e": {
    "es": {
      "label": "esquema de conceptos controlado de establishmentMeans",
      "definition": "Un Esquema de Conceptos de SKOS para ser utilizado como vocabulario controlado para los t\u00e9rminos de Darwin Core dwc:establishmentMeans y dwciri:establishmentMeans"
    },
    "nl": {
      "label": "establishmentMeans gecontroleerd conceptschema",
      "definition": "Een SKOS Conceptschema te gebruiken als een gecontoleerde woordenschat voor de Darwin Core termen dwc:establishmentMeans en dwciri:establishmentMeans"
    }
  },
  "e001": {
    "es": {
      "label": "nativo (aut\u00f3ctono)",
      "definition": "Un tax\u00f3n que se encuentra dentro de su \u00e1rea de distribuci\u00f3n natural"
    },
    "nl": {
      "label": "inheems",
      "definition": "Een taxon dat voorkomt binnen zijn natuurlijk verspreidingsgebied"
    }
  },
  "e002": {
    "es": {
      "label": "nativo: reintroducido",
      "definition": "Un tax\u00f3n reestablecido por introducci\u00f3

In [5]:
# Collect one dictionary per row of the vocabulary CSV, combining the English
# values from that file with the translations gathered in translations_dictionary.
term_info = []
term_dict = {}
frame = pd.read_csv(f'{github_base_url}{database_name}/{database_name}.csv', na_filter=False)
for index, row in frame.iterrows():
    local_name = row['term_localName']

    # English comes first, followed by the translations in `languages` order.
    labels = [{'language': 'en', 'value': row['label']}]
    labels += [
        {'language': language, 'value': translations_dictionary[local_name][language]['label']}
        for language in languages
    ]
    definitions = [{'language': 'en', 'value': row['definition']}]
    definitions += [
        {'language': language, 'value': translations_dictionary[local_name][language]['definition']}
        for language in languages
    ]

    # Key order matters: it is the order in which json.dumps prints the keys.
    term_dict = {
        'localname': local_name,
        'iri': namespace_iri + local_name,
        'label': labels,
        'definition': definitions,
        'cv_string': row['controlled_value_string'],
        # Empty cells stay empty; otherwise the local name is expanded to a full IRI.
        'scheme': (namespace_iri + row['skos_inScheme']) if row['skos_inScheme'] != '' else '',
        'type': row['type'],
    }
    # The optional columns are only read when the matching config flag is set.
    if has_broader:
        term_dict['broader'] = (namespace_iri + row['skos_broader']) if row['skos_broader'] != '' else ''
    if has_exactMatch:
        term_dict['match'] = (namespace_iri + row['skos_exactMatch']) if row['skos_exactMatch'] != '' else ''
    term_dict['value'] = row['controlled_value_string']
    term_info.append(term_dict)
print(json.dumps(term_info, indent = 2))

[
  {
    "localname": "e",
    "iri": "http://rs.tdwg.org/dwcem/values/e",
    "label": [
      {
        "language": "en",
        "value": "establishmentMeans controlled concept scheme"
      },
      {
        "language": "es",
        "value": "esquema de conceptos controlado de establishmentMeans"
      },
      {
        "language": "nl",
        "value": "establishmentMeans gecontroleerd conceptschema"
      }
    ],
    "definition": [
      {
        "language": "en",
        "value": "A SKOS Concept Scheme to be used as a controlled vocabulary for the Darwin Core terms dwc:establishmentMeans and dwciri:establishmentMeans"
      },
      {
        "language": "es",
        "value": "Un Esquema de Conceptos de SKOS para ser utilizado como vocabulario controlado para los t\u00e9rminos de Darwin Core dwc:establishmentMeans y dwciri:establishmentMeans"
      },
      {
        "language": "nl",
        "value": "Een SKOS Conceptschema te gebruiken als een gecontoleerde woordensch

In [6]:
# JSON-LD context: namespace prefixes, plus type coercion so that the value of
# skos:inScheme is interpreted as an IRI reference rather than a string literal.
context_dict = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "skos:inScheme": {"@type": "@id"}
  }
# skos:broader and skos:exactMatch also carry IRIs. Without the same coercion
# they would be serialized as plain strings, so declare them whenever the
# corresponding properties can be emitted below.
if has_broader:
    context_dict["skos:broader"] = {"@type": "@id"}
if has_exactMatch:
    context_dict["skos:exactMatch"] = {"@type": "@id"}

# Build one JSON-LD node per term. Empty values are omitted entirely.
graph_list = []
for term in term_info:
    node = {'@id': term['iri'], '@type': term['type']}
    if term['value'] != '':
        node['rdf:value'] = term['value']
    if term['scheme'] != '':
        node['skos:inScheme'] = term['scheme']
    # The 'broader' / 'match' keys exist only when the matching flag is set.
    if has_broader and term['broader'] != '':
        node['skos:broader'] = term['broader']
    if has_exactMatch and term['match'] != '':
        node['skos:exactMatch'] = term['match']

    # Language-tagged literals for labels and definitions.
    node['skos:prefLabel'] = [
        {'@language': lang_label['language'], '@value': lang_label['value']}
        for lang_label in term['label']
    ]
    node['skos:definition'] = [
        {'@language': lang_definition['language'], '@value': lang_definition['value']}
        for lang_definition in term['definition']
    ]

    graph_list.append(node)

output = {"@context": context_dict, "@graph": graph_list}
# ensure_ascii=False emits non-ASCII characters as UTF-8 text; drop that
# argument to get \uXXXX escape sequences instead.
jsonld_output = json.dumps(output, indent = 2, ensure_ascii=False)
print(jsonld_output)

{
  "@context": {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
  },
  "@graph": [
    {
      "@id": "http://rs.tdwg.org/dwcem/values/e",
      "@type": "http://www.w3.org/2004/02/skos/core#ConceptScheme",
      "skos:prefLabel": [
        {
          "@language": "en",
          "@value": "establishmentMeans controlled concept scheme"
        },
        {
          "@language": "es",
          "@value": "esquema de conceptos controlado de establishmentMeans"
        },
        {
          "@language": "nl",
          "@value": "establishmentMeans gecontroleerd conceptschema"
        }
      ],
      "skos:definition": [
        {
          "@language": "en",
          "@value": "A SKOS Concept Scheme to be used as a controlled vocabulary for the Darwin Core terms dwc:establishmentMeans and dwciri:establishmentMeans"
      

In [7]:
# Write the same serialization under both the .json and .jsonld extensions.
# The encoding is given explicitly so that non-ASCII characters in
# jsonld_output are written as UTF-8 regardless of the platform default.
for out_filename in [database_name + '.json', database_name + '.jsonld']:
    # The context manager closes the file even if the write raises.
    with open(out_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(jsonld_output)

print('done')

done
