# Update TDWG documents metadata

This script is a companion to the script that updates the vocabularies metadata. Run it after that script has finished and after any new list-of-terms documents have been created.

## Configuration and function definitions

In [None]:
import pandas as pd
import yaml
import json
import sys
import copy
from os.path import exists

def csv_read(path, **kwargs):
    """Read a CSV table into a Pandas DataFrame made up entirely of strings.

    Every cell is loaded as a string, and blank cells come back as empty strings instead of NaN.

    Keyword argument:
    rows -- when supplied, only that many leading rows are returned (as a deep copy); intended for testing.
            When omitted, the whole table is returned.
    """
    table = pd.read_csv(path, dtype=str, na_filter=False)
    if 'rows' not in kwargs:
        return table
    row_limit = kwargs['rows']
    return table.head(row_limit).copy(deep=True)
            

## Load document data

If the document already exists, its data are retrieved from the current documents CSV. A `document_configuration.yaml` file provides new data, which either replace the existing values or are used to create a new record.

In [None]:
# ---- Run configuration: edit these values before running the notebook ----
version_date = '2022-05-26'
doc_config_path = 'document_configuration.yaml'
format_config_path = 'format_configuration.yaml'
repo_path = '../../rs.tdwg.org/'
author_config_path = 'authors_configuration.yaml'
#doc_iri = 'http://rs.tdwg.org/sds/doc/specification/'
doc_iri = 'http://rs.tdwg.org/ac/doc/blah/'
standard_iri = 'http://www.tdwg.org/standards/638'
new_accessUrl = '' # Stays empty unless the document configuration file supplies an accessUrl

current_docs_df = csv_read(repo_path + 'docs/docs.csv')

# Find the row index if the document already exists
row_matches = current_docs_df.index[current_docs_df['current_iri']==doc_iri].tolist()
if not row_matches:
    print('Document IRI not found in existing data.')
    new_document = True
elif len(row_matches) > 1:
    sys.exit('Multiple rows match the document IRI:' + str(row_matches))
else:
    row_index = row_matches[0]
    new_document = False

    # Pull the single matching row out as a Series and turn it into a {column: value} dict.
    row_data = current_docs_df.loc[row_index].to_dict()

# Try to load new document data from a configuration file.
if exists(doc_config_path):
    # Read as UTF-8 to match the encoding used when the template configuration files are written.
    with open(doc_config_path, encoding='utf-8') as file_object:
        new_row_data = yaml.safe_load(file_object)

    # Need to stash any new accessUrl that is provided.
    # Empty YAML values are read in as None; .get() also tolerates a file that omits the key entirely.
    if new_row_data.get('accessUrl') is not None:
        new_accessUrl = new_row_data['accessUrl']

    # For new documents, the data from the file is used as the initial record.
    if new_document:
        row_data = new_row_data
    # For existing documents, any new data replaces the existing data.
    else:
        for key, value in new_row_data.items():
            if value is not None: # Empty YAML values are read in as None and must not overwrite existing data.
                row_data[key] = value

elif new_document:
    # If the document is new but there isn't a config file, there are no data to work with for the document
    sys.exit('New documents must have a document_configuration.yaml file.')

# Try to load format data from a configuration file.
if exists(format_config_path):
    with open(format_config_path, encoding='utf-8') as file_object:
        format_data = yaml.safe_load(file_object)
else:
    # No format config file: create the dict with empty defaults so later lookups of these keys succeed.
    format_data = {'mediaType': '', 'lastVersionAccessUri': ''}

#print(json.dumps(format_data, indent=2))

# Replace any existing doc_modified date with the new version date
row_data['doc_modified'] = version_date
#print(json.dumps(row_data, indent=2))


## Write new data to the current documents CSV

In [None]:
if not new_document:
    # Existing document: overwrite the cells of its row, one column at a time.
    for column, value in row_data.items():
        current_docs_df.at[row_index, column] = value
else:
    # New document: wrap the record (a single dict) in a one-row DataFrame and
    # concatenate it onto the end of the existing table.
    new_row_df = pd.DataFrame([row_data])
    current_docs_df = pd.concat([current_docs_df, new_row_df])

docs_csv_path = repo_path + 'docs/docs.csv'
current_docs_df.to_csv(docs_csv_path, index=False)
print('done')


## Update the documents versions metadata

In [None]:
# Generate a new version IRI for the document by appending version_date to the current document IRI.
doc_version_iri = row_data['current_iri'] + version_date

# Load versions list and find most recent version if not a new document.
versions_list_df = csv_read(repo_path + 'docs/docs-versions.csv')
if not new_document:
    matching_versions = versions_list_df[versions_list_df['current_iri']==doc_iri]
    if matching_versions.empty:
        # Fail with an explanation instead of an IndexError on the lookup below.
        sys.exit('No previous versions found in docs-versions.csv for existing document: ' + doc_iri)
    # Version IRIs built as above end in the version date, so a descending string sort puts the
    # latest version first (assuming earlier versions follow the same pattern).
    matching_versions = matching_versions.sort_values(by=['version_iri'], ascending=[False])
    # Look the value up by column name rather than by column position.
    most_recent_version_iri = matching_versions['version_iri'].iloc[0]
#print(most_recent_version_iri)

# Update the list of document versions in the docs folder
version_row_data = {'current_iri': row_data['current_iri'], 'version_iri': doc_version_iri}
versions_list_df = pd.concat([versions_list_df, pd.DataFrame([version_row_data])])

# Save the updated list; without this the new row added above is discarded.
versions_list_df.to_csv(repo_path + 'docs/docs-versions.csv', index = False)


In [None]:
# Derive the version record from the current document record: work on a copy, drop the two
# date fields that are deleted here, and add the version-specific fields.
versions_data = copy.deepcopy(row_data)
for current_only_key in ('doc_created', 'doc_modified'):
    del versions_data[current_only_key]
versions_data.update({
    'version_issued': version_date,
    'version_iri': doc_version_iri,
    'mediaType': format_data['mediaType'],
})

# Append the version record to the table in the docs-versions folder and save it
versions_csv_path = repo_path + 'docs-versions/docs-versions.csv'
versions_metadata_df = csv_read(versions_csv_path)
new_version_df = pd.DataFrame([versions_data])
versions_metadata_df = pd.concat([versions_metadata_df, new_version_df])
versions_metadata_df.to_csv(versions_csv_path, index=False)

print('done')


In [None]:
# A new document has no earlier version to replace, so the replacements table is only touched
# for documents that already existed.
if not new_document:
    replacements_csv_path = repo_path + 'docs-versions/docs-versions-replacements.csv'
    versions_replacements_df = csv_read(replacements_csv_path)
    # Record that the new version replaces the most recent previous version
    replacement_row_data = {
        'replacing_document': doc_version_iri,
        'replaced_document': most_recent_version_iri,
    }
    replacement_df = pd.DataFrame([replacement_row_data])
    versions_replacements_df = pd.concat([versions_replacements_df, replacement_df])
    versions_replacements_df.to_csv(replacements_csv_path, index=False)

print('done')


## Update the access URLs and media types

In [None]:
# Load format information
formats_metadata_df = csv_read(repo_path + 'docs/docs-formats.csv')

# Look for the previously used format information for this doc.
# None (rather than '') marks "no previous value", so that a blank cell in the CSV still counts as
# an existing value. All three names are always defined, whichever branch is taken below.
old_accessUrl = None
old_mediaType = None
not_found = False
if not new_document:
    # Find the row for the pre-existing document
    row_matches = formats_metadata_df.index[formats_metadata_df['doc_iri']==doc_iri].tolist()
    if not row_matches:
        not_found = True # If not previously present, we'll add it as if it were a new document and fix it.
    else:
        if len(row_matches) > 1:
            print('Warning: Multiple rows in the docs-formats.csv file match the document IRI:' + str(row_matches))
        row_index = row_matches[0] # With several matches, the first one is used
        old_accessUrl = formats_metadata_df.at[row_index, 'accessUri']
        old_mediaType = formats_metadata_df.at[row_index, 'mediaType']

#print(old_accessUrl)
#print(old_mediaType)

# If there is a newly provided access URL and media type for the current document, use it.
# Otherwise use the old one.

# NOTE: a new document (or an existing one with no row in docs-formats.csv) must have an accessUrl
# provided in the document configuration file; stop with an explanation if it is missing.
if new_accessUrl:
    current_accessUrl = new_accessUrl
elif old_accessUrl is not None:
    current_accessUrl = old_accessUrl
else:
    sys.exit('No access URL is available for ' + doc_iri + '. Provide an accessUrl in the document configuration file.')

if format_data['mediaType']:
    current_mediaType = format_data['mediaType']
elif old_mediaType is not None:
    current_mediaType = old_mediaType
else:
    # Handle the case where the creator of a new document doesn't bother to create the format config file.
    # We assume the document is in Markdown if no information is given.
    current_mediaType = 'text/markdown'

if new_document or not_found:
    # Cases where we need to add a row because the media type wasn't there before
    format_row_data = {'doc_iri': doc_iri, 'mediaType': current_mediaType, 'accessUri': current_accessUrl}
    formats_metadata_df = pd.concat([formats_metadata_df, pd.DataFrame([format_row_data])])
else:
    # For pre-existing documents, replace the values of the accessUrl and mediaType, which might change.
    formats_metadata_df.at[row_index, 'mediaType'] = current_mediaType
    formats_metadata_df.at[row_index, 'accessUri'] = current_accessUrl

# Now save the updated table
formats_metadata_df.to_csv(repo_path + 'docs/docs-formats.csv', index = False)


In [None]:
# Load format information for versions.
versions_format_metadata_df = csv_read(repo_path + 'docs-versions/docs-versions-formats.csv')

# URL for the previous version's page, from the format config file; empty when not provided.
last_version_access_uri = format_data.get('lastVersionAccessUri') or ''

# Initialise the flag unconditionally so that the check further down never depends on a value
# left over from another cell or from an earlier run (for a new document nothing below sets it).
not_found = False

# The previous version usually needs to have its access URL changed since it's not the current version webpage any more.
if not new_document:
    # Find the row for the pre-existing document
    row_matches = versions_format_metadata_df.index[versions_format_metadata_df['version_iri']==most_recent_version_iri].tolist()
    if not row_matches:
        print('no match found')
        not_found = True # If not previously present, we'll add it as if it were a new document and fix it.
    else:
        if len(row_matches) > 1:
            print('Warning: Multiple rows in the docs-versions-formats.csv file match the document IRI:' + str(row_matches))
        row_index = row_matches[0] # With several matches, the first one is used
        # Now make the replacement, using the URL provided in the format config file, if it was provided.
        # If it wasn't provided, then whatever URL was already there will remain.
        if last_version_access_uri:
            versions_format_metadata_df.at[row_index, 'accessUri'] = last_version_access_uri

# Handle the edge case where the row for the previous document is missing.
# If no previous-version access URI was provided, the added row simply gets an empty accessUri.
if not_found:
    versions_format_row_data = {'version_iri': most_recent_version_iri, 'mediaType': old_mediaType, 'accessUri': last_version_access_uri}
    versions_format_metadata_df = pd.concat([versions_format_metadata_df, pd.DataFrame([versions_format_row_data])])

# For versions, a new row is always added to the file
versions_format_row_data = {'version_iri': doc_version_iri, 'mediaType': current_mediaType, 'accessUri': current_accessUrl}
versions_format_metadata_df = pd.concat([versions_format_metadata_df, pd.DataFrame([versions_format_row_data])])

versions_format_metadata_df.to_csv(repo_path + 'docs-versions/docs-versions-formats.csv', index = False)


## Update data about authors

Behaviors:
1. If there is a configuration file, it gets used as-is.
- For new documents, the authors get added. This is also true for `docs-roles.csv`.
- For existing documents, the data from the config file replace the existing data for the current doc. This is also true for `docs-roles.csv`.
2. If there is no configuration file, the current doc data are unchanged. The previous author information gets used for the new version. No change is made to the `docs-roles.csv` file.

In [None]:
# Load existing author data
authors_df = csv_read(repo_path + 'docs/docs-authors.csv')
roles_df = csv_read(repo_path + 'docs-roles/docs-roles.csv')

# Try to load new document data from a configuration file.
# For new documents, the data from the YAML file must be used as the initial record.
if exists(author_config_path):
    # Load the new author data from the YAML file.
    # Read as UTF-8 to match the encoding used when the template configuration files are written.
    with open(author_config_path, encoding='utf-8') as file_object:
        author_data = yaml.safe_load(file_object)

    # An empty file is parsed as None. Stop before any existing author rows are removed.
    if not isinstance(author_data, list):
        sys.exit('The authors configuration file must contain a list of author records.')

    for author in author_data:
        # Need to add in the document column
        author['document'] = doc_iri

        # Need to turn None values into empty strings (empty YAML values are read in as None).
        for key, value in author.items():
            if value is None:
                author[key] = ''

    #print(json.dumps(author_data, indent=2))

    if not new_document:
        # For existing documents, any new data replaces the existing data.
        # Remove existing rows where the doc IRI matches, then add in new author data
        authors_df = authors_df[authors_df['document']!=doc_iri]
        roles_df = roles_df[roles_df['document']!=doc_iri]

    # Write the modified author DataFrame back out to the authors data file
    authors_df = pd.concat([authors_df, pd.DataFrame(author_data)])
    authors_df.to_csv(repo_path + 'docs/docs-authors.csv', index = False)

    # The new (or replacement) rows for docs-roles.csv need to be constructed.
    roles_list = []
    for author in author_data:
        roles_dict = {'document': doc_iri, 'contributor_role': author['contributor_role'], 'contributor_literal': author['contributor_literal']}
        # Put the author IRI in the column that corresponds to their role
        contributor_role_column_header = author['contributor_role'].replace(' ', '_') # column headers don't have spaces
        roles_dict[contributor_role_column_header] = author['contributor_iri']
        # Perform a check to warn if the author's role isn't one that's already represented in the columns of the CSV
        if contributor_role_column_header not in roles_df.columns:
            print('WARNING: author', author['contributor_literal'], 'has the role', author['contributor_role'], 'that is not an existing column in the docs-roles.csv file')
        roles_list.append(roles_dict)
    # Now add the generated rows to the end of the dataframe and save
    roles_df = pd.concat([roles_df, pd.DataFrame(roles_list)])
    roles_df.to_csv(repo_path + 'docs-roles/docs-roles.csv', index = False)

else: # No new author data found, use existing data. The authors of the current documents (docs-authors.csv) are unchanged.
    # Take the rows for this document from the existing table, one {column: value} dict per row
    author_data = authors_df[authors_df['document']==doc_iri].to_dict('records')

    #print(json.dumps(author_data, indent=2))

# Create author records for the new version
versions_author_metadata_df = csv_read(repo_path + 'docs-versions/docs-versions-authors.csv')

# For each author record, drop the "document" entry and add a "document_version" entry holding the
# new version IRI. New dicts are built so that the records in author_data are left untouched.
versions_author_data = []
for author_dict in author_data:
    version_author = {key: value for key, value in author_dict.items() if key != 'document'}
    version_author['document_version'] = doc_version_iri
    versions_author_data.append(version_author)

# Now add the modified versions author data to the original DataFrame
versions_author_metadata_df = pd.concat([versions_author_metadata_df, pd.DataFrame(versions_author_data)])
versions_author_metadata_df.to_csv(repo_path + 'docs-versions/docs-versions-authors.csv', index = False)

print('done')


## Update standards components with doc information

In [None]:
# Only a new document needs to be registered as a part of the standard itself
if new_document:
    stds_parts_path = repo_path + 'standards/standards-parts.csv'
    stds_parts_df = csv_read(stds_parts_path)

    # Append one row describing the new document and save the table
    stds_parts_row_data = {'standard': standard_iri, 'part': doc_iri, 'rdf_type': 'foaf:Document'}
    new_part_df = pd.DataFrame([stds_parts_row_data])
    stds_parts_df = pd.concat([stds_parts_df, new_part_df])
    stds_parts_df.to_csv(stds_parts_path, index=False)

# A row for the new document version is added every time, whether or not the document is new
stds_version_parts_path = repo_path + 'standards-versions/standards-versions-parts.csv'
stds_version_parts_df = csv_read(stds_version_parts_path)

standard_version_iri = standard_iri + '/version/' + version_date
stds_version_parts_row_data = {'standard_version': standard_version_iri, 'part': doc_version_iri}
new_version_part_df = pd.DataFrame([stds_version_parts_row_data])
stds_version_parts_df = pd.concat([stds_version_parts_df, new_version_part_df])

stds_version_parts_df.to_csv(stds_version_parts_path, index=False)


# Don't run the next two cells

These cells can be used to generate template YAML configuration files from existing rows in the tables, but most of the time you won't need to do that.

In [None]:
# Sample data for documents
# Writes the existing docs.csv row for doc_iri out as a template document_configuration.yaml file.

# The doc_iri determines the row of the table to be used to generate the sample
doc_iri = 'http://rs.tdwg.org/ac/doc/termlist/'

current_docs_df = csv_read(repo_path + 'docs/docs.csv')

# Find the row index of the document; exactly one row must match to build a template
row_matches = current_docs_df.index[current_docs_df['current_iri']==doc_iri].tolist()
if not row_matches:
    new_document = True
    # Stop here: there is no row to build the template from, and continuing would either fail on
    # an undefined row_data or write out a row_data left over from another cell.
    sys.exit('Document IRI not found in existing data.')
elif len(row_matches) > 1:
    sys.exit('Multiple rows match the document IRI:' + str(row_matches))
else:
    row_index = row_matches[0]
    new_document = False

    # Pull the single matching row out as a Series and turn it into a {column: value} dict.
    row_data = current_docs_df.loc[row_index].to_dict()

# allow_unicode keeps non-ASCII characters readable; sort_keys=False keeps the CSV column order
with open('document_configuration.yaml', 'w', encoding = "utf-8") as file_object:
    dump = yaml.dump(row_data, allow_unicode=True, sort_keys=False)
    file_object.write(dump)


In [None]:
# Sample data for authors
# Writes the existing docs-authors.csv rows for doc_iri out as a template authors_configuration.yaml file.

# The doc_iri determines the rows of the table to be used to generate the sample
doc_iri = 'http://rs.tdwg.org/ac/doc/termlist/'

current_docs_df = csv_read(repo_path + 'docs/docs-authors.csv')

# One dict per author row of this document, with the "document" entry left out
rows_list = [
    {column: value for column, value in row.to_dict().items() if column != 'document'}
    for _, row in current_docs_df.iterrows()
    if row['document'] == doc_iri
]

with open('authors_configuration.yaml', 'w', encoding="utf-8") as file_object:
    dump = yaml.dump(rows_list, allow_unicode=True, sort_keys=False)
    # Put a blank line between records so the file is easier to read and edit
    file_object.write(dump.replace('\n-', '\n\n-'))

#print(json.dumps(rows_list, indent =2))
