## Example of Workspace Update from Spreadsheet

The example below updates workspace dataset attributes from a spreadsheet by:


1. Getting the default access token using the gcloud cli
1. Copying the data file over from the workspace bucket to the VM
1. Reading the file into memory
1. Listing workspaces the default token has write access to
1. Then, for each workspace in the update spreadsheet that we have write access to:
   1. Back it up - read it and export the json as a file to the local VM
   1. Generate the list of attributes to update
   1. Merge the current attribute set with the updates
   1. Update the workspace using firecloud API
   1. Publish the workspace using the firecloud API
          


In [None]:
import datetime
import firecloud.api as FAPI
import json
import os
import pandas as pd
import pprint
import re
import requests


# Anvil Tools
#https://github.com/broadinstitute/horsefish/blob/master/scripts/anvil_tools/utils.py
    
# Schema
# https://github.com/broadinstitute/firecloud-orchestration/blob/develop/src/main/resources/library/attribute-definitions.json
# https://github.com/broadinstitute/firecloud-orchestration/blob/219bdb1a7c4c85df5350f27be9292bb66233a355/src/main/resources/library/attribute-definitions.json

#  "consentCodes": {
#     "GRU": "For health/medical/biomedical purposes and other biological research, including the study of population origins or ancestry.",
#     "HMB": "Use of the data is limited to health/medical/biomedical purposes, does not include the study of population origins or ancestry.",
#     "DS": "Use of the data must be related to a particular disease.",
#     "NCU": "Use of the data is limited to non-commercial use.",
#     "NPU": "Use of the data is limited to not-for-profit organizations.",
#     "NMDS": "Use of the data includes methods development research (e.g., development of software or algorithms) ONLY within the bounds of other specified data use limitations.",
#     "NAGR": "Use of the data includes aggregate level analysis to address research questions (e.g., determine variant frequencies across the general population) ONLY within the bounds of other specified data use limitations.",
#     "NCTRL": "Data can be used as a control set ONLY within the bounds of other specified data use limitations (e.g. only for cancer studies).",
#     "RS-G": "Use of the data is limited to studies of particular gender.",
#     "RS-PD": "Use of the data is limited to pediatric research."

#   "required": ["library:datasetName", "library:datasetVersion", "library:datasetDescription", "library:datasetCustodian",
#     "library:datasetDepositor", "library:contactEmail", "library:datasetOwner", "library:institute", "library:indication",
#     "library:numSubjects", "library:projectName", "library:dataCategory", "library:datatype", "library:dataUseRestriction",
#     "library:studyDesign", "library:requiresExternalApproval", "library:useLimitationOption"],
#   "oneOf": [{
#     "required": ["library:GRU", "library:HMB", "library:NCU", "library:NPU", "library:NMDS", "library:NAGR",
#       "library:NCTRL", "library:RS-G", "library:RS-PD", "library:useLimitationOption", "library:IRB"],
#     "anyOf": [
#       {
#         "properties": {"library:GRU": {"enum": [true]}}
#       },
#       {
#         "properties": {"library:HMB": {"enum": [true]}}
#       },
#       {
#         "required": ["library:DS"],
#         "properties": {"library:DS": {"minItems": 1}}
#       }
#     ],
#     "properties": {
#       "library:useLimitationOption": {
#         "type": "string",
#         "enum": ["questionnaire"]
#       }
#     }
#   }, {
#     "required": ["library:orsp", "library:useLimitationOption"],
#     "properties": {
#       "library:useLimitationOption": {
#         "type": "string",
#         "enum": ["orsp"]
#       }
#     }
#   }, {
#     "required": ["library:useLimitationOption"],
#     "properties": {
#       "library:useLimitationOption": {
#         "type": "string",
#         "enum": ["skip"]
#       }
#     }
#   }],


# Show which account the firecloud client is using before anything else runs
# (presumably the identity tied to the default credentials - confirm).
print(FAPI.whoami())
# Shared pretty-printer used by the helpers and the update loop below.
pp = pprint.PrettyPrinter(indent=4)

# Details of the workspace this notebook runs in, read from the environment.
# A missing variable raises KeyError here, before any API call is made.
BILLING_PROJECT_ID = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
# Namespace of the catalog workspaces whose library metadata is read and updated.
CATALOG_WORKSPACE_NAMESPACE = "anvil-datastorage"
# CATALOG_WORKSPACE_NAMESPACE = "clever-canary-anvil-dev"

# Root of the FireCloud REST API; endpoint URLs are built on top of it.
API_ROOT = "https://api.firecloud.org/api"

def getDefaultAccessToken():
    result = !gcloud auth application-default print-access-token
    return result[0]


# Fetch the token once; the request helpers below read this module-level value
# for every call.
ACCESS_TOKEN = getDefaultAccessToken()

print("Copy files:")
# print(WORKSPACE_BUCKET)
# Copy the update spreadsheet from the workspace bucket into the current directory.
!gsutil cp $WORKSPACE_BUCKET/notebooks/AnVILCatalogWorkspacesInput-2022-03-11.csv  .
print("List all files:")
!ls 

## Read in metadata list
## The read below is deliberately commented out so the notebook cannot be run
## by accident. Until it is uncommented, the column selection that follows
## fails with a NameError (unless `metadata` is already defined in the session).
##metadata = pd.read_csv('AnVILCatalogWorkspacesInput-2022-03-11.csv',keep_default_na=False)
# Keep only the columns the update needs.
metadata = metadata[["name",
                     "consortium",
                     "phsId",
                     "library:indication",
                     "library:datatype",
                     "library:studyDesign",
                     "library:dataUseRestriction"]]
# Index by workspace name so values can be read with metadata.at[name, column].
metadata.set_index("name", inplace=True)

## List every workspace visible to the current credentials and record which
## names are read-only and which are writable (WRITER or OWNER).
workspaces = FAPI.list_workspaces().json()

reader = set()
writer = set()
for w in workspaces:
    name = w['workspace']['name']
    if w['accessLevel'] in ("WRITER", "OWNER"):
        writer.add(name)
    elif w['accessLevel'] == "READER":
        reader.add(name)
        
def getName(workspace):
    """Return the name stored under ``workspace['workspace']['name']``."""
    details = workspace['workspace']
    return details['name']

# Index the listing by workspace name; when two entries share a name the later
# one wins.
workspaceByName = {getName(entry): entry for entry in workspaces}


In [None]:
def getUpdateAttributesURL(namespace,workspace):
    """Build the library-metadata endpoint URL for a workspace.

    The result is ``<API_ROOT>/library/<namespace>/<workspace>/metadata``.
    Both the read and the update helper in this notebook use this URL.
    """
    segments = [API_ROOT, "library", namespace, workspace, "metadata"]
    return "/".join(segments)

def getAPIHeaders():
    """Return the HTTP headers for API requests.

    Asks for a JSON response and authenticates with the module-level
    ``ACCESS_TOKEN`` as a bearer token.
    """
    return {
        'accept': 'application/json',
        'Authorization': 'Bearer ' + ACCESS_TOKEN,
    }


# Sends a set of library attributes to a workspace's metadata endpoint
def updateWorkspaceAttributes(namespace,workspace,attributes,dry_run=True):
    """PUT ``attributes`` to the library-metadata endpoint of a workspace.

    The caller is expected to pass the complete attribute set to store
    (the update loop merges the current attributes with its changes first).

    Args:
        namespace: Namespace (billing project) of the workspace.
        workspace: Name of the workspace.
        attributes: JSON-serialisable attribute mapping sent as the request body.
        dry_run: When true (the default) no request is sent and only the
            target URL is printed. Pass ``dry_run=False`` to perform the
            update. The default is non-destructive because the request
            overwrites live workspace metadata.

    Returns:
        The ``requests`` response when the update was sent, ``None`` on a
        dry run.
    """
    url = getUpdateAttributesURL(namespace,workspace)
    if dry_run:
        print(f"DRY RUN: no update sent to {url}")
        return None

    resp = requests.put(url, json=attributes, headers=getAPIHeaders())
    print(resp.status_code)
    print(resp.text)
    return resp

# 
def getWorkspaceAttributes(namespace,workspace):
    """Fetch the library metadata of a workspace.

    Prints the HTTP status code and the JSON payload, then returns the
    payload parsed into Python objects. The status code is not checked, so
    an error payload is returned as-is.
    """
    resp = requests.get(
        getUpdateAttributesURL(namespace,workspace),
        headers=getAPIHeaders(),
    )
    print(resp.status_code)
    attributes = resp.json()
    pp.pprint(json.dumps(attributes))
    return attributes
    
def backupWorkspace(w):
   
    # Create the filename
    name = w['workspace']['name']
    timestampStr = datetime.datetime.now().isoformat()
    fileName = name+'_'+timestampStr+'.json'
    
    # Save the to the local cloud environment
    with open(fileName, 'w') as fp:
        json.dump(w, fp,  indent=4)
        
    # Copy the file to the workspace bucket
    !gsutil cp ./$fileName $WORKSPACE_BUCKET/backups/

    
# publish a workspace to the Data Library in FireCloud
def publish_workspace_to_data_library(workspace_name, project="anvil-datastorage"):
    """Publish workspace to Firecloud Data Library.

    Sends a POST to the ``published`` endpoint of the workspace and prints
    the outcome.

    Args:
        workspace_name: Name of the workspace to publish.
        project: Namespace of the workspace; defaults to "anvil-datastorage".

    Returns:
        A ``(success, response_text)`` tuple. ``success`` is True when the
        API answered with status 200 or 204.
    """
    # Library/publishLibraryWorkspace
    # Use the shared API root and header builder so the endpoint and the
    # authentication are defined in one place for all helpers.
    uri = f"{API_ROOT}/library/{project}/{workspace_name}/published"
    response = requests.post(uri, headers=getAPIHeaders())

    if response.status_code in (200, 204):
        print(f"Successfully published {project}/{workspace_name} to Data Library.")
        return True, response.text

    # Anything else is reported as a failure, with the full response body
    print(f"WARNING: Failed to publish workspace to Data Library: {project}/{workspace_name}.")
    print("Please see full response for error:")
    print(response.text)
    return False, response.text



## Update workspace attributes
def _attribute_changed(current, new):
    """Return True when ``new`` differs from ``current``.

    Two lists are compared as sets, so a pure reordering is not a change.
    Every other pair is compared with ``!=``; in particular strings are
    compared whole rather than as sets of characters.
    """
    if isinstance(current, list) and isinstance(new, list):
        return set(current) != set(new)
    return current != new


print(CATALOG_WORKSPACE_NAMESPACE)
length = 0  # number of workspaces processed
for name in metadata.index:
    if name not in workspaceByName:
        continue
    # Only touch workspaces the current credentials can modify.
    if name not in writer:
        print(f"Skipping {name}: no write access")
        continue

    length += 1
    print("##########################")
    print(name)
    # Back up the workspace being updated, not whatever the listing loop left in `w`.
    backupWorkspace(workspaceByName[name])

    # NOTE(review): the datatype split keeps surrounding whitespace and turns
    # an empty cell into [""] - confirm this matches the spreadsheet format.
    update_attrs = [
        {
            "name": "library:dataUseRestriction",
            "value": metadata.at[name, "library:dataUseRestriction"],
        },
        {
            "name": "library:indication",
            "value": metadata.at[name, "library:indication"],
        },
        {
            "name": "library:datatype",
            "value": metadata.at[name, "library:datatype"].split(","),
        },
        {
            "name": "library:studyDesign",
            "value": metadata.at[name, "library:studyDesign"],
        },
    ]

    current_attrs = getWorkspaceAttributes(CATALOG_WORKSPACE_NAMESPACE,name)
    # Shallow copy: only top-level keys are replaced below.
    next_attrs = current_attrs.copy()

    for update in update_attrs:
        key = update["name"]
        # An attribute missing from the current set is simply added.
        if key not in next_attrs or _attribute_changed(next_attrs[key], update["value"]):
            next_attrs[key] = update["value"]
    pp.pprint(next_attrs)

    # The write is left disabled on purpose; enable it only when ready to
    # modify live workspaces.
    #updateWorkspaceAttributes(CATALOG_WORKSPACE_NAMESPACE,name,next_attrs)

    # Publish in the same namespace the attributes were read from.
    publish_workspace_to_data_library(name, project=CATALOG_WORKSPACE_NAMESPACE)

200
# {"attributes":{"library:requiresExternalApproval":false,"library:discoverableByGroups":{"itemsType":"AttributeValue","items":["GP_DataDelivery_TaskTeam"]},"library:dulvn":4,"library:GR":true,"library:studyDesign":"Case/Control","description":"ANVIL_CMG_BROAD_BRAIN_ENGLE_WES","library:published":true,"library:indication":"Poor health","library:contactEmail":"sam@sneed.com","library:numSubjects":123,"library:datasetOwner":"Virgina Sneed","library:datatype":{"itemsType":"AttributeValue","items":["Whole Exome","Whole Genome"]},"library:datasetCustodian":"Sam Sneed","library:projectName":"TopMed","library:institute":{"itemsType":"AttributeValue","items":["Scool of Hard Knocks"]},"tag:tags":{"itemsType":"AttributeValue","items":["Access WS","DUL:GSR:notAllowed","GRCh38/hg38","Consortium: CMG","dbGaP: phs001272"]},"library:dataUseRestriction":"HMB-ABC-123","library:datasetDepositor":"Larry Sneed","library:datasetVersion":"1.1","library:datasetName":"Workspace Name","library:dataCategory":{"itemsType":"AttributeValue","items":["Simple Nucleotide Variation (SNVs), CNV"]},"library:useLimitationOption":"skip","library:datasetDescription":"Cohort Description"},"authorizationDomain":[{"membersGroupName":"AUTH_AnVIL_CMG_Broad_Brain_Engle_WES"}],"billingAccount":"billingAccounts/00C5C3-F6B804-1667EC","bucketName":"fc-secure-76700a19-3005-4f2e-a595-abed41d306d4","completedCloneWorkspaceFileTransfer":"2022-03-04T22:43:02.100Z","createdBy":"dave@clevercanary.com","createdDate":"2022-03-04T22:42:55.280Z","googleProject":"terra-51086c51","googleProjectNumber":"568298150090","isLocked":false,"lastModified":"2022-03-04T22:42:56.769Z","name":"Clever_Canary_ANVIL_CMG_BROAD_BRAIN_ENGLE_WES_TEST","namespace":"anvil-datastorage","workflowCollectionName":"76700a19-3005-4f2e-a595-abed41d306d4","workspaceId":"76700a19-3005-4f2e-a595-abed41d306d4","workspaceVersion":"v2"}


