## A workflow for automating the creation of a ScienceBase Data Release from the content of a CSDGM metadata record

#### Created by the Data Managment Team of the Fort Collins Science Center with help from the SB team

In [1]:
import os
from pathlib import Path
import glob
import requests
import pysb
import random
import smtplib
import json
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
#from requests_ntlm import HttpNtlmAuth

import getpass

from pymdwizard.core.xml_utils import XMLRecord, XMLNode
from pymdwizard.core import utils

from IPDS import ipds_utils
from usgs_datatools import doi

### Identify the items we'll need for this process:

#### The CSDGM metadata record,  A list of data files to include in the release,  the IPDS number

In [2]:
# Path to the metadata record to be posted on the ScienceBase landing page:
md_fname = r"C:\AARON\FLH_Metadata\FLH_prj_level.xml"

# Path to folder containing all other data and metadata associated with the data release:
data_dname = r"V:"

# A list of data files in data_dname that are associated with the data release:
data_files = [d for d in glob.glob(data_dname + '\**\*.*', recursive=True) if not d.startswith('V:\\zzz_')]

# Remove files in a folder that contains a file that is too big to be uploaded to SB (files between 2 and 10GB can be 
# uploaded manually):
clean_files = [f for f in data_files if not f.startswith('V:\\Terrain') and data_files if not f.startswith('V:\\Aquatic')
              and data_files if not f.startswith('V:\\Cultural')]

# Files to be uploaded to this data release's landing page:
lp_files = [clean_files[0], md_fname]

# The IPDS number of the data release, formatted as 'IP-xxxxxx':
ipds_number = 'IP-086518'

assert Path(md_fname).exists(), Path(data_dname).exists()

lp_files

['V:\\READ_ME_FLH-DataReferenceTable.pdf',
 'C:\\AARON\\FLH_Metadata\\FLH_prj_level.xml']

#### get the username and password we'll be using

In [3]:
# Enter your Active Directory username here if it's different that your current user name
username = getpass.getuser() + '@contractor.usgs.gov' 
password = getpass.getpass()
# Output of this cell is a prompt to enter your Active Directory password.

········


#### extract the information needed from the MD record

In [4]:
md = XMLRecord(md_fname)

In [5]:
# Extract the text in the Title element of md_fname
title = md.metadata.idinfo.citation.citeinfo.title.text
title

'Compilation and Assessment of Resource Values and Hazards to Inform Transportation and Associated Land-use Planning'

In [6]:
# Extract the text in the Abstract element of md_fname
abstract = md.metadata.idinfo.descript.abstract.text
abstract

'Management of transportation networks is affected by, and has effects on, natural and cultural resources through direct and indirect interactions. Until recently, the availability of such spatially explicit information has been limited; however, the data released here enables evaluation of potential impact to, or from, environmental factors across broad areas, for example, States and Agency Planning Regions. Integrated network and resource analyses can provide insights into implications of natural conditions for construction and maintenance as well as safety risks and environmental impacts during project planning and assessment.\n\nThis cooperative project was created by an agreement between Federal Lands and Highways (FLH) and USGS Fort Collins Science Center (FORT) to facilitate development of the spatially-explicit information to support transportation planning. This data represents national level (Continental United States) distributions of natural and cultural resource informatio

In [7]:
# Extract the year from the Publication Date element of md_fname
pubdate = md.metadata.idinfo.citation.citeinfo.pubdate.text
pubyear = pubdate[:4]
pubdate, pubyear

('2018', '2018')

### Pull the authors and ORCID's from IPDS

In [8]:
driver = ipds_utils.ipds_search(ipds_number=ipds_number,username=username, password=password)

Searching for IPDS entry on page 10
Searching for IPDS entry on page 9
Searching for IPDS entry on page 8
Searching for IPDS entry on page 7
Searching for IPDS entry on page 6
Searching for IPDS entry on page 5
Searching for IPDS entry on page 4


In [9]:
ipds_url = driver.current_url

In [10]:
author_list = ipds_utils.get_author_list(driver)
author_list

[{'ORCID': '000-0002-1105-1327', 'author_name': 'Manier, Daniel'},
 {'ORCID': '0000-0002-3488-003X', 'author_name': "O'Donnell, Michael"}]

In [11]:
author_email = ipds_utils.get_senior_author(driver)

### Update existing SB item with info from our metadata and upload our metadata to it

In [25]:
# Log in to ScienceBase
sb = pysb.SbSession()
sb.login(username, password)

<pysb.SbSession.SbSession at 0xa64f748>

In [13]:
# Uncomment and execute this cell if the release item already exists

item_id = '57bddf65e4b03fd6b7df58a4' # Insert SB ID of existing item 
item_json = sb.get_item(item_id)
item_json['id']

'57bddf65e4b03fd6b7df58a4'

### Specify the DOI for this data release

In [14]:
new_doi = 'doi:10.5066/F7MW2F8W'
new_doi

'doi:10.5066/F7MW2F8W'

### Update our SB item to include the IPDS link and DOI as identifiers

In [46]:
ipds_format = {'type':"IPDS", "scheme":"https://www.sciencebase.gov/vocab/category/item/identifier", 'key':ipds_number}
doi_format = {'type':"DOI", "scheme":"https://www.sciencebase.gov/vocab/category/item/identifier", 'key':new_doi}
sc_format = {'type':"USGS_ScienceCenter", "scheme":"https://www.sciencebase.gov/vocab/category/item/identifier", 'key':'Fort Collins Science Center'}
ma_format = {'type':"USGS_MissionArea", "scheme":"https://www.sciencebase.gov/vocab/category/item/identifier", 'key':'Ecosystems'}
item_json = sb.update_item({'id':item_id, 'identifiers':[ipds_format, doi_format, sc_format, ma_format]})

In [47]:
 # Give the lead PI read privlidges 
sb.add_acl_user_read(author_email, item_id)

{'parentId': '552d79bee4b0b22a157f59a3',
 'read': {'acl': ['USER:afreeman@contractor.usgs.gov',
   'USER:odonnellm@usgs.gov',
   'USER:manierd@usgs.gov'],
  'inherited': False},
 'write': {'acl': ['USER:afreeman@contractor.usgs.gov',
   'USER:odonnellm@usgs.gov'],
  'inherited': False}}

### Make proper citation for the data release

In [7]:
item_json = sb.get_item(item_id)

In [49]:
item_json['citation']

"Daniel J. Manier, and Michael S. O'Donnell, 20160824, Compilation and Assessment of Resource Values and Hazards to Inform Transportation and Associated Land-use Planning: ."

In [50]:
def format_author_name(author_name):
    '''
    Formats author names in the citation properly
    '''
    items = author_name.split()

    if ',' in author_name:
        last = items[0][:-1]
        first_i = items[1][0] + '.'
        try:
            middle_i = items[2][0] + '.'
        except:
            middle_i = ''
    else:
        last = items[-1]
        first_i = items[0][0] + '.'
        if len(items) == 3:
            middle_i = items[1][0] + '.'
        else:
            middle_i = ''

    return "{}, {}{}".format(last, first_i, middle_i)

In [51]:
if len(author_list) == 1:
    author_str = format_author_name(author_list[0]['author_name'])
else:
    author_str = ", ".join([format_author_name(a['author_name']) for a in author_list[:-1]])
    author_str += ', and ' + format_author_name(author_list[-1]['author_name'])
new_citation = ", ".join([author_str, pubyear, title])
new_citation += ": U.S. Geological Survey data release, https://doi.org/{}.".format(new_doi[4:])
item_json['citation'] = new_citation
new_citation

"Manier, D., and O'Donnell, M., 2018, Compilation and Assessment of Resource Values and Hazards to Inform Transportation and Associated Land-use Planning: U.S. Geological Survey data release, https://doi.org/10.5066/F7MW2F8W."

In [52]:
# Apply 'Data Release - In Progress' tag
item_json['browseCategories'] = ['Data Release - In Progress']

In [17]:
# Strip erroneous external sources scraped from metadata
item_json['webLinks'] = []

In [18]:
item_json = sb.update_item(item_json)

### To put data files on SB landing page:

In [15]:
item_json = sb.get_item(item_id)

In [16]:
item_json = sb.upload_files_and_update_item(item_json, lp_files)

### To put individual data files on separate SB child items:

In [None]:
# for csv in data_files:
#     print(csv)
#     csv_fname = csv
#     xml_fname = csv_fname.replace('.csv', '.xml')
#     this_md = XMLRecord(xml_fname)
#     this_md.metadata.idinfo.citation.citeinfo.onlink.text = doi_url
#     this_md.metadata.distinfo.stdorder.digform.digtopt.onlinopt.computer.networka.networkr.text = doi_url
#     this_md.save()
    
#     child_item = sb.upload_file_and_create_item(item_id, xml_fname)
#     child_id = child_item['id']
#     sb.upload_file_to_item(child_item, csv_fname)
    
#     child_item = sb.get_item(child_id)
#     child_item['citation'] = new_citation
#     child_item['webLinks'] = []
    
#     sb.update_item(child_item)

In [20]:
LandUse = []
Natural = []
Terrestrial = []
listoflists = [LandUse, Natural, Terrestrial]

for file in clean_files:
    if 'LandUse' in file:
        LandUse.append(file)
    elif 'Natural' in file:
        Natural.append(file)
    elif 'Terrestrial' in file:
        Terrestrial.append(file)

print(len(LandUse))
print(len(Natural))
print(len(Terrestrial))
print(len(listoflists))

30
37
48
3


In [31]:
LandUse

['V:\\LandUseConditions\\LC1_LC3_ImpactLvl_ProtectRare.tif',
 'V:\\LandUseConditions\\LC1_LC3_ImpactLvl_ProtectRare.tif.ovr',
 'V:\\LandUseConditions\\LC1_LC3_ImpactLvl_ProtectRare.tif.xml',
 'V:\\LandUseConditions\\LC1_NoiseLev-Use_Clss.tif',
 'V:\\LandUseConditions\\LC1_NoiseLev-Use_Clss.tif.ovr',
 'V:\\LandUseConditions\\LC1_NoiseLev-Use_Clss.tif.vat.dbf',
 'V:\\LandUseConditions\\LC1_NoiseLev-Use_Clss.tif.xml',
 'V:\\LandUseConditions\\LC2_ConstructBasement.tfw',
 'V:\\LandUseConditions\\LC2_ConstructBasement.tif',
 'V:\\LandUseConditions\\LC2_ConstructBasement.tif.ovr',
 'V:\\LandUseConditions\\LC2_ConstructBasement.tif.xml',
 'V:\\LandUseConditions\\LC2_ConstructErosion.tfw',
 'V:\\LandUseConditions\\LC2_ConstructErosion.tif',
 'V:\\LandUseConditions\\LC2_ConstructErosion.tif.ovr',
 'V:\\LandUseConditions\\LC2_ConstructErosion.tif.xml',
 'V:\\LandUseConditions\\LC2_ConstructRoad.tfw',
 'V:\\LandUseConditions\\LC2_ConstructRoad.tif',
 'V:\\LandUseConditions\\LC2_ConstructRoad.tif.

In [30]:
LandUse[0].split('.')[0]

'V:\\LandUseConditions\\LC1_LC3_ImpactLvl_ProtectRare'

In [26]:
for List in listoflists:    
    child_item = sb.upload_files_and_create_item(item_id, List)
    child_item['citation'] = new_citation
    child_item['webLinks'] = []
    print(List)
    
    sb.update_item(child_item)

OverflowError: string longer than 2147483647 bytes