# Script to generate the Darwin Core "normative" CSV document

Steve Baskauf 2020-07-08

This script uses the data in the TDWG rs.tdwg.org repo to build the "normative document" CSV file, which is in turn used to generate the Quick Reference Guide.

In [60]:
# -----------------------------
# file import and configuration
# -----------------------------

import pandas as pd

# Base URL for raw files on the GitHub-hosted branch of the repo that the data are read from
github_baseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/'

# Database names of the term version lists to be included in the document.
# (A single-list alternative such as ['iri'] can be substituted here.)
term_lists = [
    'terms',
    'iri',
    'dc-for-dwc',
    'dcterms-for-dwc',
    'curatorial',
    'dwcore',
    'dwctype',
    'geospatial',
]

# Each mapping pairs a column name in the normative document ('norm') with the
# column name in the accumulated versions DataFrame ('accum') that supplies it.
# A ('flags', '') pair is not currently included.
column_mappings = [
    {'norm': norm_name, 'accum': accum_name}
    for norm_name, accum_name in (
        ('iri', 'version'),
        ('label', 'label'),
        ('definition', 'rdfs_comment'),
        ('comments', 'dcterms_description'),
        ('examples', 'examples'),
        ('organized_in', 'tdwgutility_organizedInClass'),
        ('issued', 'version_issued'),
        ('status', 'version_status'),
        ('replaces', 'replaces_version'),
        ('rdf_type', 'rdf_type'),
        ('term_iri', 'term_iri'),
        ('abcd_equivalence', 'tdwgutility_abcdEquivalence'),
    )
]

Load the term version data for all of the term lists that are included in Darwin Core (including obsolete ones)

In [61]:
# Collect the versions DataFrame of every term list and combine them in a
# single step at the end. DataFrame.append() was removed in pandas 2.0, so
# pd.concat() is used instead.
versions_frames = []
for term_list in term_lists:
    # retrieve configuration metadata for term list
    config_url = github_baseUri + term_list + '/constants.csv'
    config_df = pd.read_csv(config_url, na_filter=False)
    term_namespace = config_df.iloc[0].loc['domainRoot']
    print(term_namespace)

    # Retrieve versions metadata for term list
    versions_url = github_baseUri + term_list + '-versions/' + term_list + '-versions.csv'
    print(versions_url)
    versions_df = pd.read_csv(versions_url, na_filter=False)

    # Add a column for the term IRI by concatenating the term namespace with the local name value for each row
    versions_df['term_iri'] = term_namespace + versions_df['term_localName']
    versions_frames.append(versions_df)

# Special procedure for obsolete terms: their term IRIs are looked up in a
# term/version join table rather than built from a namespace and a local name.
# Retrieve versions metadata
versions_url = github_baseUri + 'dwc-obsolete-versions/dwc-obsolete-versions.csv'
print(versions_url)
versions_df = pd.read_csv(versions_url, na_filter=False)

# Retrieve term/version join data
join_url = github_baseUri + 'dwc-obsolete/dwc-obsolete-versions.csv'
join_df = pd.read_csv(join_url, na_filter=False)

# Map each version IRI to its term IRI. setdefault() keeps the first match
# when a version occurs more than once in the join data.
version_to_term = {}
for version_iri, term_iri in zip(join_df['version'], join_df['term']):
    version_to_term.setdefault(version_iri, term_iri)

# Look up the term IRI for every obsolete version; fail loudly (instead of
# producing a column of the wrong length) if any version has no join row.
term_iri_list = versions_df['version'].map(version_to_term)
unmatched_versions = versions_df.loc[term_iri_list.isna(), 'version'].tolist()
if unmatched_versions:
    raise ValueError(
        'No term IRI found in the join data for obsolete version(s): '
        + ', '.join(str(version_iri) for version_iri in unmatched_versions)
    )

# Add the current term IRIs to the DataFrame as the term_iri column
versions_df['term_iri'] = term_iri_list
# Add the obsolete terms DataFrame to the frames to be accumulated
versions_frames.append(versions_df)

# Combine all frames; sort=True orders the union of columns when the frames'
# columns differ, and ignore_index=True renumbers rows consecutively from zero.
accumulated_frame = pd.concat(versions_frames, sort=True, ignore_index=True)
accumulated_frame = accumulated_frame.fillna('')  # replace all missing values with empty strings
print('done')

http://rs.tdwg.org/dwc/terms/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/terms-versions/terms-versions.csv
http://rs.tdwg.org/dwc/iri/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/iri-versions/iri-versions.csv
http://purl.org/dc/elements/1.1/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/dc-for-dwc-versions/dc-for-dwc-versions.csv
http://purl.org/dc/terms/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/dcterms-for-dwc-versions/dcterms-for-dwc-versions.csv
http://rs.tdwg.org/dwc/curatorial/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/curatorial-versions/curatorial-versions.csv
http://rs.tdwg.org/dwc/dwcore/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/dwcore-versions/dwcore-versions.csv
http://rs.tdwg.org/dwc/dwctype/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/dwctype-versions/dwctype-versions.csv
http://rs.tdwg.org/dwc/geospatial/
https://raw.githubusercontent.com/tdwg/rs.tdwg.org/dwc/geospatial-versions/geospatial-versions.csv

In [62]:
# Display the accumulated versions DataFrame as this cell's output
accumulated_frame

Unnamed: 0,dcterms_description,document_modified,examples,label,rdf_type,rdfs_comment,rdfs_seeAlso,replaces1_version,replaces2_version,replaces_version,tdwgutility_abcdEquivalence,tdwgutility_decision,tdwgutility_organizedInClass,term_iri,term_localName,version,versionLocalName,version_isDefinedBy,version_issued,version_status
0,"Example: ""Tamias minimus"" valid name for ""Euta...",2020-06-30T18:34:51-05:00,,Accepted Name Usage,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,"The full name, with authorship and date inform...",,,,http://rs.tdwg.org/dwc/terms/version/acceptedS...,not in ABCD,,http://rs.tdwg.org/dwc/terms/Taxon,http://rs.tdwg.org/dwc/terms/acceptedNameUsage,acceptedNameUsage,http://rs.tdwg.org/dwc/terms/version/acceptedN...,acceptedNameUsage-2009-09-21,http://rs.tdwg.org/dwc/terms/version/,2009-09-21,superseded
1,"Example: ""8fa58e08-08de-4ac1-b69c-1235340b7001""",2020-06-30T18:34:51-05:00,,Accepted Name Usage ID,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,An identifier for the name usage (documented m...,,,,http://rs.tdwg.org/dwc/terms/version/acceptedT...,not in ABCD,,http://rs.tdwg.org/dwc/terms/Taxon,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID,acceptedNameUsageID,http://rs.tdwg.org/dwc/terms/version/acceptedN...,acceptedNameUsageID-2009-09-21,http://rs.tdwg.org/dwc/terms/version/,2009-09-21,superseded
2,"Example: ""Tamias minimus"" valid name for ""Euta...",2017-09-23T06:53:00-05:00,,Accepted Scientific Name,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,The currently valid (zoological) or accepted (...,,,,http://rs.tdwg.org/dwc/terms/version/acceptedT...,not in ABCD,,http://rs.tdwg.org/dwc/terms/Taxon,http://rs.tdwg.org/dwc/terms/acceptedScientifi...,acceptedScientificName,http://rs.tdwg.org/dwc/terms/version/acceptedS...,acceptedScientificName-2009-07-06,http://rs.tdwg.org/dwc/terms/version/,2009-07-06,deprecated
3,,2017-09-23T06:53:00-05:00,,Accepted Scientific Name ID,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,A unique identifier for the acceptedScientific...,,,,http://rs.tdwg.org/dwc/terms/version/acceptedT...,not in ABCD,,http://rs.tdwg.org/dwc/terms/Taxon,http://rs.tdwg.org/dwc/terms/acceptedScientifi...,acceptedScientificNameID,http://rs.tdwg.org/dwc/terms/version/acceptedS...,acceptedScientificNameID-2009-07-06,http://rs.tdwg.org/dwc/terms/version/,2009-07-06,deprecated
4,,2017-09-23T06:53:00-05:00,,Accepted Taxon,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,The currently valid (zoological) or accepted (...,,,,,not in ABCD,,http://rs.tdwg.org/dwc/terms/Taxon,http://rs.tdwg.org/dwc/terms/AcceptedTaxon,AcceptedTaxon,http://rs.tdwg.org/dwc/terms/version/AcceptedT...,AcceptedTaxon-2008-11-19,http://rs.tdwg.org/dwc/terms/version/,2008-11-19,deprecated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,,2017-09-23T06:53:00-05:00,,Latest Period Or Highest System,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,The full name of the latest possible geochrono...,,,,,,,,http://digir.net/schema/conceptual/darwin/exte...,LatestPeriodOrHighestSystem,http://digir.net/schema/conceptual/darwin/exte...,LatestPeriodOrHighestSystem-2005-07-03,http://digir.net/schema/conceptual/darwin/exte...,2005-07-03,deprecated
794,,2017-09-23T06:53:00-05:00,,Lithostratigraphic Terms,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,The combination of all litho-stratigraphic nam...,,,,,,,,http://digir.net/schema/conceptual/darwin/exte...,LithostratigraphicTerms,http://digir.net/schema/conceptual/darwin/exte...,LithostratigraphicTerms-2005-07-03,http://digir.net/schema/conceptual/darwin/exte...,2005-07-03,deprecated
795,,2017-09-23T06:53:00-05:00,,Lowest Biostratigraphic Zone,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,The full name of the lowest possible geologica...,,,,,,,,http://digir.net/schema/conceptual/darwin/exte...,LowestBiostratigraphicZone,http://digir.net/schema/conceptual/darwin/exte...,LowestBiostratigraphicZone-2005-07-03,http://digir.net/schema/conceptual/darwin/exte...,2005-07-03,deprecated
796,,2017-09-23T06:53:00-05:00,,Member,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,The full name of the lithostratigraphic member...,,,,,,,,http://digir.net/schema/conceptual/darwin/exte...,Member,http://digir.net/schema/conceptual/darwin/exte...,Member-2005-07-03,http://digir.net/schema/conceptual/darwin/exte...,2005-07-03,deprecated


Build the rows of the normative document as a list of lists (one inner list per row), then convert it to a DataFrame and save it as a CSV file.

In [63]:
# Create column header list for the normative document from the 'norm' key of each mapping
column_headers = [column_mapping['norm'] for column_mapping in column_mappings]

# Columns that can hold the IRI of a replaced version. Only those actually
# present in the accumulated DataFrame are used, so a reduced set of term
# lists that lacks the extra columns does not raise a KeyError.
replaces_columns = [
    column_name
    for column_name in ('replaces_version', 'replaces1_version', 'replaces2_version')
    if column_name in accumulated_frame.columns
]

# Create the rows of the normative document
normative_doc_list = []
for row_index, row in accumulated_frame.iterrows():
    normative_doc_row = []
    for column_mapping in column_mappings:
        if column_mapping['norm'] == 'replaces':
            # Concatenate all versions that were replaced; pipe separated.
            # Empty values are skipped individually, so no leading or doubled
            # pipe is produced and no replaced version is dropped.
            replaced_versions = [
                row[column_name]
                for column_name in replaces_columns
                if row[column_name] != ''
            ]
            normative_doc_row.append('|'.join(replaced_versions))
        else:
            # Add the value from the accumulation DataFrame column whose name is the value of the 'accum' key for the column
            normative_doc_row.append(row[column_mapping['accum']])
    normative_doc_list.append(normative_doc_row)

# Turn list of lists into dataframe
normative_doc_df = pd.DataFrame(normative_doc_list, columns=column_headers)

# Save the normative document DataFrame as a CSV
normative_doc_df.to_csv('generated_normative_document.csv', index=False)

# Display the normative document DataFrame as this cell's output
normative_doc_df

Unnamed: 0,iri,label,definition,comments,examples,organized_in,issued,status,replaces,rdf_type,term_iri,abcd_equivalence
0,http://rs.tdwg.org/dwc/terms/version/acceptedN...,Accepted Name Usage,"The full name, with authorship and date inform...","Example: ""Tamias minimus"" valid name for ""Euta...",,http://rs.tdwg.org/dwc/terms/Taxon,2009-09-21,superseded,http://rs.tdwg.org/dwc/terms/version/acceptedS...,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://rs.tdwg.org/dwc/terms/acceptedNameUsage,not in ABCD
1,http://rs.tdwg.org/dwc/terms/version/acceptedN...,Accepted Name Usage ID,An identifier for the name usage (documented m...,"Example: ""8fa58e08-08de-4ac1-b69c-1235340b7001""",,http://rs.tdwg.org/dwc/terms/Taxon,2009-09-21,superseded,http://rs.tdwg.org/dwc/terms/version/acceptedT...,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID,not in ABCD
2,http://rs.tdwg.org/dwc/terms/version/acceptedS...,Accepted Scientific Name,The currently valid (zoological) or accepted (...,"Example: ""Tamias minimus"" valid name for ""Euta...",,http://rs.tdwg.org/dwc/terms/Taxon,2009-07-06,deprecated,http://rs.tdwg.org/dwc/terms/version/acceptedT...,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://rs.tdwg.org/dwc/terms/acceptedScientifi...,not in ABCD
3,http://rs.tdwg.org/dwc/terms/version/acceptedS...,Accepted Scientific Name ID,A unique identifier for the acceptedScientific...,,,http://rs.tdwg.org/dwc/terms/Taxon,2009-07-06,deprecated,http://rs.tdwg.org/dwc/terms/version/acceptedT...,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://rs.tdwg.org/dwc/terms/acceptedScientifi...,not in ABCD
4,http://rs.tdwg.org/dwc/terms/version/AcceptedT...,Accepted Taxon,The currently valid (zoological) or accepted (...,,,http://rs.tdwg.org/dwc/terms/Taxon,2008-11-19,deprecated,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://rs.tdwg.org/dwc/terms/AcceptedTaxon,not in ABCD
...,...,...,...,...,...,...,...,...,...,...,...,...
793,http://digir.net/schema/conceptual/darwin/exte...,Latest Period Or Highest System,The full name of the latest possible geochrono...,,,,2005-07-03,deprecated,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://digir.net/schema/conceptual/darwin/exte...,
794,http://digir.net/schema/conceptual/darwin/exte...,Lithostratigraphic Terms,The combination of all litho-stratigraphic nam...,,,,2005-07-03,deprecated,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://digir.net/schema/conceptual/darwin/exte...,
795,http://digir.net/schema/conceptual/darwin/exte...,Lowest Biostratigraphic Zone,The full name of the lowest possible geologica...,,,,2005-07-03,deprecated,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://digir.net/schema/conceptual/darwin/exte...,
796,http://digir.net/schema/conceptual/darwin/exte...,Member,The full name of the lithostratigraphic member...,,,,2005-07-03,deprecated,,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://digir.net/schema/conceptual/darwin/exte...,
