# Imports and Common Functions

In [None]:
# !pip install --upgrade data_repo_client
# !pip install --upgrade xmltodict

In [None]:
# Imports
import requests
import json
import google.auth
import xmltodict
import data_repo_client
import pandas as pd
import re
from time import sleep
import ast
import datetime
import os
# Workspace bucket, read from the WORKSPACE_BUCKET environment variable.
# The lookup raises KeyError when that variable is not set.
ws_bucket = os.environ["WORKSPACE_BUCKET"]

# Function to refresh TDR API client
def refresh_tdr_api_client():
    """Build a TDR API client that carries a freshly refreshed access token.

    Returns:
        A data_repo_client.ApiClient configured for https://data.terra.bio,
        using the token of the refreshed default Google credentials, with
        client-side validation turned off.
    """
    # `import google.auth` by itself is not guaranteed to load the
    # transport.requests submodule, so import it explicitly before use.
    import google.auth.transport.requests

    creds, _project = google.auth.default()
    creds.refresh(google.auth.transport.requests.Request())

    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token

    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Function to pull existing AnVIL data from DUOS
def get_anvil_datasets_from_duos(duos_token, duos_env):
    """Return one summary row per DUOS dataset whose study is tagged as AnVIL.

    Every dataset listed by the DUOS v3 dataset endpoint is fetched
    individually; a dataset is kept only when it has a study whose
    description contains "Platform: AnVIL".

    Args:
        duos_token: Bearer token sent in the Authorization header of every
            DUOS request.
        duos_env: "prod" targets the production consent service; any other
            value targets the dev service.

    Returns:
        A list of 21-element lists ordered as: study ID, study name, study
        PHS ID, dataset ID, dataset identifier, dataset name, DAC ID, access
        management, the twelve data use values (general use, HMB, disease
        restrictions, population origins/ancestry, ethics approval,
        collaborator required, geographical restrictions, genetic studies
        only, publication results, methods research, non-profit use, other)
        and the snapshot ID.
    """
    # Determine the target URL from the env variable
    if duos_env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"
    headers = {"Authorization": f"Bearer {duos_token}"}
    snapshot_url_prefix = "https://data.terra.bio/snapshots/"

    # Pull a list of existing AnVIL studies and datasets from DUOS
    results = []
    datasets = requests.get(url=f"{url}/api/dataset/v3", headers=headers).json()
    datasets_to_process = len(datasets)
    for datasets_processed, dataset_entry in enumerate(datasets, start=1):
        print(f"Processing dataset {datasets_processed} of {datasets_to_process}...")
        dataset_id = dataset_entry["dataset_id"]
        dataset_details = requests.get(
            url=f"{url}/api/dataset/v2/{dataset_id}",
            headers=headers,
        ).json()

        # Skip datasets without a study or whose study is not an AnVIL study
        study = dataset_details.get("study")
        if not study:
            continue
        study_id = study["studyId"]
        description = study.get("description")
        if not description or "Platform: AnVIL" not in description:
            continue

        # Study-level fields; the properties list may be absent or null
        study_name = study["name"]
        study_phs = ""
        for prop_entry in study.get("properties") or []:
            if prop_entry["key"] == "dbGaPPhsID":
                study_phs = prop_entry["value"]
                break

        # Dataset-level fields
        dataset_name = dataset_details["name"]
        dataset_identifier = dataset_details["datasetIdentifier"]
        dac_id = dataset_details.get("dacId") or ""

        # A dataset may carry no data use object at all; treat it as empty so
        # every flag falls back to its default instead of raising.
        data_use = dataset_details.get("dataUse") or {}
        du_gru = data_use.get("generalUse") or False
        du_hmb = data_use.get("hmbResearch") or False
        du_disease = data_use.get("diseaseRestrictions") or []
        du_poa = data_use.get("populationOriginsAncestry") or False
        du_ethics = data_use.get("ethicsApprovalRequired") or False
        du_collab = data_use.get("collaboratorRequired") or False
        du_geog = data_use.get("geographicalRestrictions") or ""
        du_genetic = data_use.get("geneticStudiesOnly") or False
        du_pub = data_use.get("publicationResults") or False
        du_nmds = data_use.get("methodsResearch") or False
        du_npu = data_use.get("nonProfitUse") or False
        du_other = data_use.get("other") or ""

        # Access management and snapshot ID come from the dataset properties
        access_management = ""
        snapshot_id = ""
        for prop_entry in dataset_details.get("properties") or []:
            if prop_entry["propertyName"] == "URL":
                snapshot_url = prop_entry["propertyValue"]
                if snapshot_url and snapshot_url_prefix in snapshot_url:
                    snapshot_id = snapshot_url.replace(snapshot_url_prefix, "")
            elif prop_entry["propertyName"] == "Access Management":
                access_management = prop_entry["propertyValue"]

        results.append([
            study_id, study_name, study_phs, dataset_id, dataset_identifier,
            dataset_name, dac_id, access_management, du_gru, du_hmb,
            du_disease, du_poa, du_ethics, du_collab, du_geog, du_genetic,
            du_pub, du_nmds, du_npu, du_other, snapshot_id,
        ])

    # Return results
    return results

# Display options: show every row and column at full cell width, with a wide
# console, centred column headers and three decimal places.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    "display.max_colwidth", None,
    'display.width', 1000,
    'display.colheader_justify', 'center',
    'display.precision', 3,
)

# Step 0: Review Existing AnVIL DUOS Entries

In [None]:
#############################################
## Input Parameters
#############################################

# Token for use in DUOS (use gcloud auth print-access-token to get this)
duos_token = ""

# DUOS Environment
duos_env = "prod"

#############################################
## Execution
#############################################

# Fetch results
results = get_anvil_datasets_from_duos(duos_token, duos_env)

# Display results, ordered by study and then dataset
df_results = pd.DataFrame(results, columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "DAC ID", "Access", "GRU", "HMB", "DS", "POA", "IRB", "COL", "GS", "GSO", "PUB", "NMDS", "NPU", "OTHER", "Snapshot ID"])
df_results_sorted = df_results.sort_values(by=["Study ID", "Dataset ID"], ascending=[True, True], ignore_index=True)
print("\nResults:")
# Show the sorted frame; the unsorted one was previously displayed by mistake.
display(df_results_sorted)

# Step 1: Collect AnVIL Metadata for Review

In [None]:
#############################################
## Functions
#############################################

def coalesce(*arg):
    """Return the first argument that carries a usable value.

    Arguments are examined left to right:

    * ``False`` or an empty list is returned immediately, so an explicit
      default of ``False`` / ``[]`` can be passed as the last argument.
    * A non-empty list ends the search: it is returned with ``None`` entries
      and placeholder entries removed (the result may be an empty list).
    * Any other truthy value is returned unless its string form is a
      placeholder, in which case the next argument is tried.
    * Other falsy values (``None``, ``""``, ``0``) are skipped.

    Placeholders are compared case-insensitively: "", "NA", "N/A", "NONE",
    "TBD", "UNKNOWN", "UNSPECIFIED".

    Returns:
        The selected value, or ``None`` when no argument qualifies.
    """
    placeholders = ["", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED"]
    for input_item in arg:
        if input_item is False or input_item == []:
            return input_item
        if not input_item:
            continue
        if isinstance(input_item, list):
            # str() keeps non-string elements (e.g. numbers) from raising on
            # .upper(), matching how scalar values are handled below.
            return [
                ele for ele in input_item
                if ele is not None and str(ele).upper() not in placeholders
            ]
        if str(input_item).upper() not in placeholders:
            return input_item
    return None

def format_description(input_string):
    """Flatten a description to one line and expand relative dbGaP study links.

    Processing steps, in order:

    1. A falsy input becomes the empty string; anything else is converted
       with ``str``.
    2. Each "newline, newline, tab" sequence becomes a single space.
    3. Every remaining tab becomes a space.
    4. Every newline becomes the two characters backslash + "n".
    5. Relative links of the form ``study.cgi?study_id=``,
       ``./study.cgi?study_id=`` or ``/study.cgi?study_id=`` are rewritten
       to the absolute dbGaP URL. Links that already sit behind a path
       (such as the absolute URL itself) are left untouched.

    Args:
        input_string: Description text, or any falsy value for "no text".

    Returns:
        The cleaned-up description as a string.
    """
    dbgap_study_url = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="

    output_string = str(input_string) if input_string else ""
    output_string = output_string.replace("\n\n\t", " ")
    output_string = output_string.replace("\t", " ")
    output_string = output_string.replace("\n", "\\n")
    # The lookbehind rejects matches preceded by a word character or "/", so
    # an already-absolute ".../cgi-bin/study.cgi?study_id=" is not expanded a
    # second time (which would corrupt the URL).
    output_string = re.sub(
        r"(?<![\w/])(?:\.?/)?study\.cgi\?study_id=",
        dbgap_study_url,
        output_string,
    )
    return output_string

def format_phs_id(input_str):
    """Normalize a dbGaP accession to "phs" followed by at least six digits.

    The first case-insensitive occurrence of "phs" followed by digits is
    located, leading zeros are dropped, and the number is zero-padded back to
    six digits (longer numbers are kept as they are). Anything after the
    digits, such as a version suffix, is discarded.

    Args:
        input_str: Text that may contain a PHS accession. Non-string values
            (for example ``None``) are accepted.

    Returns:
        The normalized accession, or "" when the input is not a string or
        contains no PHS accession.
    """
    try:
        match = re.search(r"phs0*([0-9]+)", input_str, re.IGNORECASE)
    except TypeError:
        # re.search rejects non-string input such as None.
        return ""
    if match is None:
        return ""
    return "phs" + match.group(1).zfill(6)

def try_join(l):
    """Render a list as a comma-separated string; pass other values through.

    Each list element is converted with ``str`` and the pieces are joined
    with ", ". A value that is not a list is returned unchanged, as is a list
    whose joining raises TypeError.
    """
    if not isinstance(l, list):
        return l
    try:
        return ', '.join(str(element) for element in l)
    except TypeError:
        return l
    
def val_study_type_enum(l):
    """Flag a study type that is set but not one of the accepted values.

    Returns 1 when ``l`` is truthy and absent from the accepted list,
    otherwise 0 (including when ``l`` is empty or None).
    """
    accepted_study_types = [
        "Observational",
        "Interventional",
        "Descriptive",
        "Analytical",
        "Prospective",
        "Retrospective",
        "Case report",
        "Case series",
        "Cross-sectional",
        "Cohort study",
    ]
    is_invalid = bool(l) and l not in accepted_study_types
    return int(is_invalid)

def val_nih_inst_center_sub_enum(l):
    """Flag a submitting NIH institute/center code that is not recognized.

    Returns 1 when ``l`` is truthy and absent from the list of recognized
    codes, otherwise 0 (including when ``l`` is empty or None).
    """
    recognized_codes = [
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS",
        "NIGMS", "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR",
        "FIC", "NCATS", "NCCIH",
    ]
    if not l:
        return 0
    return 0 if l in recognized_codes else 1

def val_nih_ic_supp_study_enum(l):
    """Flag a list of supporting NIH institute/center codes with a bad entry.

    Returns 1 when ``l`` is a non-empty list containing at least one entry
    that is not a recognized code. Returns 0 for a list of recognized codes
    and for anything that is empty or not a list.
    """
    if not (l and isinstance(l, list)):
        return 0
    recognized_codes = [
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS",
        "NIGMS", "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR",
        "FIC", "NCATS", "NCCIH",
    ]
    has_unknown = any(code not in recognized_codes for code in l)
    return 1 if has_unknown else 0

def val_file_type_enum(l):
    """Flag a list of file types that contains an unrecognized entry.

    Returns 1 when ``l`` is a non-empty list with at least one entry outside
    the accepted file types. Returns 0 for a list of accepted types and for
    anything that is empty or not a list.
    """
    if not (l and isinstance(l, list)):
        return 0
    accepted_file_types = ["Arrays", "Genome", "Exome", "Survey", "Phenotype"]
    has_unknown = any(entry not in accepted_file_types for entry in l)
    return 1 if has_unknown else 0

def fetch_dataset_details(snapshot_id, ds_consent_map, duos_token, duos_env, match_existing):
    
    # Initialize variables
    dataset_details_records = []

    # Determine the DUOS URL from the duos_env variable
    if duos_env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"

    # Build DUOS lookups
    print(f"Building DUOS dataset and study lookups...")
    study_lookup = {}
    study_name_lookup = {}
    dataset_lookup = []
    datasets = requests.get(
        url=f"{url}/api/dataset/v3",
        headers={"Authorization": f"Bearer {duos_token}"}
    ).json()
    study_ids_processed = set()
    for dataset_entry in datasets:
        dataset_id = dataset_entry.get("dataset_id")
        dataset_name = dataset_entry.get("dataset_name")
        identifier = dataset_entry.get("identifier")
        study_id = dataset_entry.get("study_id")
        try:
            base_consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', dataset_name).group(1)
        except:
            base_consent_group_name = dataset_name
        if study_id:
            # Build dataset lookup
            dataset_lookup.append({
                "dataset_id": dataset_id,
                "consent_group_name": dataset_name, 
                "base_consent_group_name": base_consent_group_name,
                "identifier": identifier,
                "study_id": study_id
            })
            # Build study lookups
            if study_id not in study_ids_processed:
                study_ids_processed.add(study_id)
                study_details = requests.get(
                    url=f"{url}/api/dataset/registration/{identifier}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
                study_desc = study_details.get("studyDescription")
                if study_desc and "Platform: AnVIL" in study_desc:
                    study_phs = study_details.get("dbGaPPhsID")
                    if study_phs:
                        id_in_lookup = study_lookup.get(study_phs)
                        if id_in_lookup and id_in_lookup != study_id:
                            print(f"Warning: PHS ID {study_phs} tied to multiple studies in DUOS: {id_in_lookup}, {study_id}. Please review.")
                        else:
                            study_lookup[study_phs] = str(study_id)
                    study_name = study_details.get("studyName")
                    if study_name:
                        id_in_lookup = study_name_lookup.get(study_name)
                        if id_in_lookup and id_in_lookup != study_id:
                            print(f"Warning: Study Name '{study_name}' tied to multiple studies in DUOS: {id_in_lookup}, {study_id}. Please review.")
                        else:
                            study_name_lookup[study_name] = str(study_id)
                    
    # Loop through and process snapshots
    for snapshot_id in snapshot_id_list:
        
        # Initialize variables
        terra_dict = {}
        dbgap_xml_dict = {}
        dbgap_study_api_dict = {}
        dbgap_fhir_dict = {}
        final_results_dict = {}
        snapshot_phs_id = ""

        # Retrieve snapshot details
        print(f"Processing snapshot_id: {snapshot_id}...")
        final_results_dict = {}
        api_client = refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        snapshot_details = {}
        while attempt_counter <= 2:
            try:
                snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
                break
            except:
                sleep(5)
                attempt_counter += 1  
        snapshot_name = snapshot_details["name"]
        dataset_id = snapshot_details["source"][0]["dataset"]["id"]
        snapshot_phs_id = format_phs_id(snapshot_details["source"][0]["dataset"]["phs_id"])
        if snapshot_details["source"][0]["dataset"]["secure_monitoring_enabled"] == True:
            access_management = "controlled"
        else:
            access_management = "open"
        if snapshot_details["source"][0]["dataset_properties"].get("source_workspaces"):  
            source_workspace = snapshot_details["source"][0]["dataset_properties"]["source_workspaces"][0]
        else:
            source_workspace = None
        if snapshot_details["source"][0]["dataset_properties"].get("consent_name"):
            snapshot_consent_code = snapshot_details["source"][0]["dataset_properties"]["consent_name"]
        else:
            if access_management == "open":
                snapshot_consent_code = "NRES"
            else:
                snapshot_consent_code = None
        if snapshot_details["duos_firecloud_group"] != None:
            duos_id = snapshot_details["duos_firecloud_group"]["duos_id"]
        else:
            duos_id = None
        try:
            base_consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', snapshot_name).group(1)
        except:
            base_consent_group_name = snapshot_name
        if access_management == "open":
            consent_group_name = base_consent_group_name + " (NRES)"
        elif snapshot_consent_code:
            consent_group_name = base_consent_group_name + f" ({snapshot_consent_code})"
        else:
            consent_group_name = base_consent_group_name
        
        print("\tSnapshot PHS_ID: " + str(snapshot_phs_id))
        print("\tSnapshot Consent Code: " + str(snapshot_consent_code))
        print("\tSource Workspace: " + str(source_workspace))
        print("\tDUOS ID: " + str(duos_id))
        print("\tConsent Group Name: " + str(consent_group_name))
        
        # Derive study name for use in matching logic
        studyName = ""
        ws_study_name = ""
        xml_study_name = ""
        api_study_name = ""
        fhir_study_name = ""
        attempt_counter = 0
        while attempt_counter <= 2:
            creds, project = google.auth.default()
            auth_req = google.auth.transport.requests.Request()
            creds.refresh(auth_req)
            try:
                ws_attributes = requests.get(
                    url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                    headers={"Authorization": f"Bearer {creds.token}"}
                ).json()
                if ws_attributes.get("workspace"):
                    ws_study_name = coalesce(ws_attributes["workspace"]["attributes"].get("library:projectName"), source_workspace)
                break
            except:
                sleep(5)
                attempt_counter += 1
        if snapshot_phs_id:
            phs_short = snapshot_phs_id.replace("phs", "")
            study_uid = ""
            dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_url)
                    xml_data = xmltodict.parse(response.text)
                    if xml_data["dbgapss"].get("Study"):
                        if isinstance(xml_data["dbgapss"]["Study"], list):
                            study_data = xml_data["dbgapss"]["Study"][0]
                        else:
                            study_data = xml_data["dbgapss"]["Study"] 
                        study_uid = study_data.get("@uid")
                        xml_study_name = study_data["StudyInfo"].get("StudyNameEntrez")
                    break
                except:
                    sleep(5)
                    attempt_counter += 1
            if study_uid:
                dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
                attempt_counter = 0
                while attempt_counter <= 2:
                    try:
                        response = requests.get(url=dbgap_study_url)
                        study_api_data = json.loads(response.text)
                        if study_api_data.get("error") == None:
                            api_study_name = study_api_data["data"].get("report_name")
                        break
                    except:
                        sleep(5)
                        attempt_counter += 1 
            dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + snapshot_phs_id
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_fhir_url)
                    fhir_data = json.loads(response.text)
                    if fhir_data.get("entry"):
                        fhir_study_name = fhir_data["entry"][0]["resource"].get("title")
                    break
                except:
                    sleep(5)
                    attempt_counter += 1
        studyName = coalesce(fhir_study_name, xml_study_name, api_study_name, ws_study_name)
        if snapshot_phs_id and studyName and f" ({snapshot_phs_id})" not in studyName:
            studyName = studyName + f" ({snapshot_phs_id})"
        
        # Attempt to match the dataset to a DUOS Dataset ID based on consent group name
        match_duos_id = ""
        target_dataset_id = ""
        for dataset in dataset_lookup:
            if dataset["base_consent_group_name"] == base_consent_group_name or dataset["consent_group_name"] == consent_group_name:
                match_duos_id = dataset["identifier"]
                break
        match_study_id = ""
        
        # Attempt to match the dataset to a DUOS Study ID based on PHS ID or Study Name
        if snapshot_phs_id:
            match_study_id = study_lookup.get(snapshot_phs_id)
        else:
            match_study_id = study_name_lookup.get(studyName)

        # If a snapshot or match DUOS ID is present, use this to build the final result dictionary
        if match_existing and (duos_id or match_duos_id):

            # Pull existing DUOS study registration
            duos_id_to_use = coalesce(duos_id, match_duos_id)
            for dataset in dataset_lookup:
                if dataset["identifier"] == duos_id_to_use:
                    target_dataset_id = dataset["dataset_id"]
                    break
            duos_dict = {}
            duos_dict = requests.get(
                url=f"{url}/api/dataset/registration/{duos_id_to_use}",
                headers={"Authorization": f"Bearer {duos_token}"}
            ).json()
            #print(duos_dict)

            # Pull dataset details from DUOS (to get data use info) 
            if not duos_dict.get("consentGroups"):
                duos_dict["consentGroups"] = [{"datasetId": None}]
            duos_dataset_id = duos_dict["consentGroups"][0].get("datasetId")
            duos_data_use_dict = {}
            dac_id = ""
            if duos_dataset_id:
                dataset_details = requests.get(
                    url=f"{url}/api/dataset/v2/{duos_dataset_id}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
                duos_data_use_dict = dataset_details.get("dataUse")
                dac_id = dataset_details.get("dacId") if dataset_details.get("dacId") else ""

            # Build final results dictionary
            if snapshot_consent_code:
                consent_code = snapshot_consent_code.upper().replace("_", "-")
            else:
                consent_code = ""
            final_results_dict["snapshot_id"] = snapshot_id
            final_results_dict["snapshot_phs_id"] = snapshot_phs_id
            final_results_dict["snapshot_duos_id"] = duos_id
            final_results_dict["match_duos_id"] = match_duos_id
            final_results_dict["target_dataset_id"] = target_dataset_id
            final_results_dict["target_study_id"] = match_study_id
            studyName = duos_dict.get("studyName")
            dbGaP_study_name = duos_dict.get("dbGaPStudyRegistrationName")
            if snapshot_phs_id and studyName and f" ({snapshot_phs_id})" not in studyName:
                final_results_dict["studyName"] = studyName + f" ({snapshot_phs_id})"
            else:
                final_results_dict["studyName"] = studyName
            final_results_dict["studyType"] = duos_dict.get("studyType")
            final_results_dict["studyDescription"] = duos_dict.get("studyDescription")
            final_results_dict["dataTypes"] = duos_dict.get("dataTypes")
            final_results_dict["phenotypeIndication"] = duos_dict.get("phenotypeIndication")
            final_results_dict["species"] = duos_dict.get("species")
            final_results_dict["piName"] = duos_dict.get("piName")
            final_results_dict["dataCustodianEmail"] = duos_dict.get("dataCustodianEmail")
            final_results_dict["publicVisibility"] = duos_dict.get("publicVisibility")
            final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if duos_dict.get("nihAnvilUse") and 'already' in duos_dict.get("nihAnvilUse").lower() else "I am NHGRI funded and I do not have a dbGaP PHS ID"
            final_results_dict["submittingToAnvil"] = duos_dict.get("submittingToAnvil")
            if snapshot_phs_id:
                final_results_dict["dbGaPPhsID"] = snapshot_phs_id
            else:
                final_results_dict["dbGaPPhsID"] = duos_dict.get("dbGaPPhsID")
            if snapshot_phs_id and dbGaP_study_name and f" ({snapshot_phs_id})" in dbGaP_study_name:
                final_results_dict["dbGaPStudyRegistrationName"] = dbGaP_study_name.replace(f" ({snapshot_phs_id})", "")
            else:
                final_results_dict["dbGaPStudyRegistrationName"] = duos_dict.get("dbGaPStudyRegistrationName")
            final_results_dict["embargoReleaseDate"] = duos_dict.get("embargoReleaseDate")
            final_results_dict["sequencingCenter"] = duos_dict.get("sequencingCenter")
            final_results_dict["piEmail"] = duos_dict.get("piEmail")
            final_results_dict["piInstitution"] = duos_dict.get("piInstitution")
            final_results_dict["nihGrantContractNumber"] = duos_dict.get("nihGrantContractNumber")
            final_results_dict["nihICsSupportingStudy"] = duos_dict.get("nihICsSupportingStudy")
            final_results_dict["nihProgramOfficerName"] = duos_dict.get("nihProgramOfficerName")
            final_results_dict["nihInstitutionCenterSubmission"] = duos_dict.get("nihInstitutionCenterSubmission")
            final_results_dict["nihInstitutionalCertificationFileName"] = duos_dict.get("nihInstitutionalCertificationFileName")
            final_results_dict["nihGenomicProgramAdministratorName"] = duos_dict.get("nihGenomicProgramAdministratorName")
            final_results_dict["multiCenterStudy"] = duos_dict.get("multiCenterStudy")
            final_results_dict["collaboratingSites"] = duos_dict.get("collaboratingSites")
            final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSR")
            final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation")
            final_results_dict["alternativeDataSharingPlan"] = duos_dict.get("alternativeDataSharingPlan")
            final_results_dict["alternativeDataSharingPlanReasons"] = duos_dict.get("alternativeDataSharingPlanReasons")
            final_results_dict["alternativeDataSharingPlanExplanation"] = duos_dict.get("alternativeDataSharingPlanExplanation")
            final_results_dict["alternativeDataSharingPlanFileName"] = duos_dict.get("alternativeDataSharingPlanFileName")
            final_results_dict["alternativeDataSharingPlanDataSubmitted"] = duos_dict.get("alternativeDataSharingPlanDataSubmitted")
            final_results_dict["alternativeDataSharingPlanDataReleased"] = duos_dict.get("alternativeDataSharingPlanDataReleased")
            final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = duos_dict.get("alternativeDataSharingPlanTargetDeliveryDate")
            final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = duos_dict.get("alternativeDataSharingPlanTargetPublicReleaseDate")
            final_results_dict["alternativeDataSharingPlanAccessManagement"] = duos_dict.get("alternativeDataSharingPlanAccessManagement")
            final_results_dict["consentGroups.consentGroupName"] = consent_group_name
            final_results_dict["consentGroups.accessManagement"] = access_management
            final_results_dict["dacId"] = dac_id
            final_results_dict["consentGroups.numberOfParticipants"] = duos_dict["consentGroups"][0].get("numberOfParticipants")
            final_results_dict["consentCode"] = consent_code
            final_results_dict["consentGroups.generalResearchUse"] = coalesce(duos_dict["consentGroups"][0].get("generalResearchUse"), duos_data_use_dict.get("generalUse"), False)
            final_results_dict["consentGroups.hmb"] = coalesce(duos_dict["consentGroups"][0].get("hmb"), duos_data_use_dict.get("hmbResearch"), False)
            final_results_dict["consentGroups.diseaseSpecificUse"] = coalesce(duos_dict["consentGroups"][0].get("diseaseSpecificUse"), duos_data_use_dict.get("diseaseRestrictions"), [])
            final_results_dict["consentGroups.gs"] = coalesce(duos_dict["consentGroups"][0].get("gs"), duos_data_use_dict.get("geographicalRestrictions"))
            final_results_dict["consentGroups.poa"] = coalesce(duos_dict["consentGroups"][0].get("poa"), duos_data_use_dict.get("populationOriginsAncestry"), False)
            final_results_dict["consentGroups.nmds"] = coalesce(duos_dict["consentGroups"][0].get("nmds"), False)
            final_results_dict["consentGroups.gso"] = coalesce(duos_dict["consentGroups"][0].get("gso"), duos_data_use_dict.get("geneticStudiesOnly"), False)
            final_results_dict["consentGroups.pub"] = coalesce(duos_dict["consentGroups"][0].get("pub"), duos_data_use_dict.get("publicationResults"), False)
            final_results_dict["consentGroups.col"] = coalesce(duos_dict["consentGroups"][0].get("col"), duos_data_use_dict.get("collaboratorRequired"), False)
            final_results_dict["consentGroups.irb"] = coalesce(duos_dict["consentGroups"][0].get("irb"), duos_data_use_dict.get("ethicsApprovalRequired"), False)
            final_results_dict["consentGroups.npu"] = coalesce(duos_dict["consentGroups"][0].get("npu"), False)
            final_results_dict["consentGroups.otherPrimary"] = coalesce(duos_dict["consentGroups"][0].get("otherPrimary"), duos_data_use_dict.get("other"))
            final_results_dict["consentGroups.otherSecondary"] = duos_dict["consentGroups"][0].get("otherSecondary")
            final_results_dict["consentGroups.mor"] = duos_dict["consentGroups"][0].get("mor")
            final_results_dict["consentGroups.morDate"] = duos_dict["consentGroups"][0].get("morDate")
            final_results_dict["consentGroups.dataLocation"] = "TDR Location"
            final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
            if duos_dict["consentGroups"][0].get("fileTypes") and duos_dict["consentGroups"][0]["fileTypes"].get("fileType"):
                final_results_dict["consentGroups.fileTypes.fileType"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("fileType")
            else:
                final_results_dict["consentGroups.fileTypes.fileType"] = None
            if duos_dict["consentGroups"][0].get("fileTypes") and duos_dict["consentGroups"][0]["fileTypes"].get("functionalEquivalence"):
                final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("functionalEquivalence")
            else:
                final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
            collab_site = duos_dict.get("collaboratingSites")
            if collab_site:
                final_results_dict["consortium"] = collab_site[0]
            else:
                final_results_dict["consortium"] = None
            dataset_details_records.append(final_results_dict)
            continue

        # Pull information from original workspace (if listed)
        workspace_phs_id = ""
        if source_workspace:
            # Establish credentials
            creds, project = google.auth.default()
            auth_req = google.auth.transport.requests.Request()
            creds.refresh(auth_req)

            # Pull workspace attributes
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    ws_attributes = requests.get(
                        url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                        headers={"Authorization": f"Bearer {creds.token}"}
                    ).json()
                    break
                except:
                    sleep(5)
                    attempt_counter += 1

            # Map to schema
            if ws_attributes.get("workspace"):
                terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
                terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
                if ws_attributes["workspace"]["attributes"].get("library:dataCategory"):
                    terra_dict["dataTypes"] = []
                    for item in ws_attributes["workspace"]["attributes"]["library:dataCategory"]["items"]:
                        inner_list = item.split(",")
                        for inner_item in inner_list:
                            inner_item = inner_item.replace("'", "").strip()
                            terra_dict["dataTypes"].append(inner_item)
                terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
                terra_dict["species"] = "Homo sapiens"
                terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
                terra_dict["dataCustodianEmail"] = [ws_attributes["workspace"]["attributes"].get("library:contactEmail")]
                if ws_attributes["workspace"]["attributes"].get("tag:tags"):
                    for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
                        if "Consortium:" in tag:
                            terra_dict["consortium"] = tag.split(":")[1].strip()
                        elif "dbGaP:" in tag:
                            terra_dict["dbGaPPhsID"] = format_phs_id(tag.split(":")[1].strip())
                            if not snapshot_phs_id:
                                workspace_phs_id = format_phs_id(tag.split(":")[1].strip()) 
                                print(f"Warning: PHS ID ({workspace_phs_id}) found on workspace but not snapshot! Please resolve.")
                terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"].get("library:dataUseRestriction")
                if ws_attributes["workspace"]["attributes"].get("library:datatype"):
                    terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]
                if ws_attributes["workspace"]["attributes"].get("library:numSubjects"):
                    terra_dict["consentGroups.numberOfParticipants"] = ws_attributes["workspace"]["attributes"]["library:numSubjects"]
        #         print("------------------------------------------------------")
        #         print("terra_dict")
        #         print(terra_dict)

        # Pull study information from DUOS (if matched to DUOS Study based on PHS ID)
        if not match_study_id and workspace_phs_id:
            match_study_id = study_lookup.get(workspace_phs_id)
        duos_study_dict = {}
        if match_existing and match_study_id:
            duos_study_dict = requests.get(
                    url=f"{url}/api/dataset/study/registration/{match_study_id}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
            collab_site = duos_study_dict.get("collaboratingSites")
            if collab_site:
                duos_study_dict["consortium"] = collab_site[0]
        
        # Pull information from dbGaP (if phs_id listed)
        dac_names = ""
        dbgap_phs_id = coalesce(snapshot_phs_id, workspace_phs_id)
        if dbgap_phs_id:
            # Pull and parse XML
            phs_short = dbgap_phs_id.replace("phs", "")
            dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_url)
                    xml_data = xmltodict.parse(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1
            study_uid = ""

            # Map to schema
            if xml_data["dbgapss"].get("Study"):
                if isinstance(xml_data["dbgapss"]["Study"], list):
                    study_data = xml_data["dbgapss"]["Study"][0]
                else:
                    study_data = xml_data["dbgapss"]["Study"] 
                study_uid = study_data.get("@uid")
                dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
                dbgap_xml_dict["dbGaPPhsID"] = dbgap_phs_id
                dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
                if study_data["Authority"]["Persons"].get("Person"):
                    for ap_entry in study_data["Authority"]["Persons"]["Person"]:
                        if ap_entry["Role"] == "PI":
                            dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                            dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                            dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                        elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                            dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                        elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                            dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                ic_list = []
                if isinstance(study_data["Authority"]["ICs"]["IC"], list):
                    for ic_entry in study_data["Authority"]["ICs"]["IC"]:
                        ic_list.append(ic_entry["@name"])
                else:
                    ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
                dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
                dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
                dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
                if study_data["Policy"].get("ConsentGroup"):
                    if isinstance(study_data["Policy"]["ConsentGroup"], list):
                        dac_name_set = set()
                        for idx, consent in enumerate(study_data["Policy"]["ConsentGroup"]):
                            tmp_dac = consent["@dac_name"]
                            dac_name_set.add(tmp_dac)
                        dac_name_list = list(dac_name_set)
                        dac_names = ", ".join(dac_name_list)
                    else:
                        dac_names = study_data["Policy"]["ConsentGroup"]["@dac_name"]     
        #             print("------------------------------------------------------")
        #             print("dbgap_xml_dict")
        #             print(dbgap_xml_dict)

            # Pull and parse Study API JSON
            if study_uid:
                dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
                attempt_counter = 0
                while attempt_counter <= 2:
                    try:
                        response = requests.get(url=dbgap_study_url)
                        study_api_data = json.loads(response.text)
                        break
                    except:
                        sleep(5)
                        attempt_counter += 1

                # Map to schema
                if study_api_data.get("error") == None:
                    dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
                    dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
                    dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
                    dbgap_study_api_dict["dbGaPPhsID"] = dbgap_phs_id
                    dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_data["data"].get("report_name")
                    for attr_entry in study_api_data["data"].get("attribution"):
                        if attr_entry.get("title") == "Principal Investigator":
                            dbgap_study_api_dict["piName"] = attr_entry.get("name")
                            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                            break
        #             print("------------------------------------------------------")
        #             print("dbgap_study_api_dict")
        #             print(dbgap_study_api_dict)

            # Pull and parse FHIR API JSON
            dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + dbgap_phs_id
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_fhir_url)
                    fhir_data = json.loads(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1

            # Map to schema
            if fhir_data.get("entry"):
                dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
                dbgap_fhir_dict["dbGaPPhsID"] = dbgap_phs_id
                dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
                # NIH ICs
                if "Organization/" in fhir_data["entry"][0]["resource"]["sponsor"].get("reference"):
                    dbgap_fhir_dict["nihICsSupportingStudy"] = [fhir_data["entry"][0]["resource"]["sponsor"].get("reference")[13:]]
                else:
                    ic_display = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
                    if ic_display == "National Human Genome Research Institute":
                        dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
                    else:
                        dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
                # studyType
                if fhir_data["entry"][0]["resource"].get("category"):
                    for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
                        if cat_entry.get("coding"):
                            for coding_entry in cat_entry.get("coding"):
                                if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                                    value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                    if dbgap_fhir_dict.get("studyType") and value:
                                        dbgap_fhir_dict["studyType"] += f", {value}"
                                    elif value:
                                        dbgap_fhir_dict["studyType"] = value
                # dataTypes
                dt_list = []
                if fhir_data["entry"][0]["resource"].get("extension"): 
                    for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                        if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                            for inner_ext_entry in ext_entry.get("extension"):
                                if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                                    for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                                        dt_list.append(coding_entry.get("code"))
                dbgap_fhir_dict["dataTypes"] = dt_list
                # phenotypeIndication
                if fhir_data["entry"][0]["resource"].get("focus"):
                    for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
                        if focus_entry.get("coding"):
                            for coding_entry in focus_entry.get("coding"):
                                value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                if dbgap_fhir_dict.get("phenotypeIndication") and value:
                                    dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                                elif value:
                                    dbgap_fhir_dict["phenotypeIndication"] = value
                # numberOfParticipants
                if fhir_data["entry"][0]["resource"].get("extension"):
                    for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                        if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                            for inner_ext_entry in ext_entry.get("extension"):
                                if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                                    dbgap_fhir_dict["consentGroups.numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")
        #         print("------------------------------------------------------")
        #         print("dbgap_fhir_dict")
        #         print(dbgap_fhir_dict)

        # Reconcile information and create final results
        consent_code = coalesce(snapshot_consent_code, terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
        if consent_code:
            consent_code = consent_code.upper().replace("_", "-")
        else:
            consent_code = ""
        consortium = coalesce(duos_study_dict.get("consortium"), terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
        dbGaPPhsID = coalesce(duos_study_dict.get("dbGaPPhsID"), dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
        dbGaPStudyRegistrationName = coalesce(duos_study_dict.get("dbGaPStudyRegistrationName"), dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
        if dbGaPPhsID and consent_code:
            study_consent = dbGaPPhsID + ":" + consent_code
            purl_doid = ds_consent_map.get(study_consent)
            if purl_doid:
                if not isinstance(purl_doid, list):
                    purl_doid = [purl_doid]
            else:
                purl_doid = []
        else:
            purl_doid = []
        final_results_dict["snapshot_id"] = snapshot_id
        final_results_dict["snapshot_phs_id"] = snapshot_phs_id
        final_results_dict["snapshot_duos_id"] = duos_id
        final_results_dict["match_duos_id"] = match_duos_id
        final_results_dict["target_dataset_id"] = target_dataset_id
        final_results_dict["target_study_id"] = match_study_id
        studyName = coalesce(duos_study_dict.get("studyName"), studyName)
        if dbGaPPhsID and studyName and f" ({dbGaPPhsID})" not in studyName:
            final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
        else:
            final_results_dict["studyName"] = studyName
        final_results_dict["studyType"] = coalesce(duos_study_dict.get("studyType"), dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"), terra_dict.get("studyType"))
        final_results_dict["studyDescription"] = format_description(coalesce(duos_study_dict.get("studyDescription"), dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
        if final_results_dict["studyDescription"]:
            if "Platform: AnVIL" not in final_results_dict["studyDescription"]:
                final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
        else:
            final_results_dict["studyDescription"] = "Platform: AnVIL"
        final_results_dict["dataTypes"] = coalesce(duos_study_dict.get("dataTypes"), terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
        final_results_dict["phenotypeIndication"] = coalesce(duos_study_dict.get("phenotypeIndication"), terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
        final_results_dict["species"] = "Human"
        final_results_dict["piName"] = coalesce(duos_study_dict.get("piName"), dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"), "None")
        final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
        final_results_dict["publicVisibility"] = True
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = True
        final_results_dict["dbGaPPhsID"] = dbGaPPhsID
        if dbGaPPhsID and dbGaPStudyRegistrationName and f" ({dbGaPPhsID})" in dbGaPStudyRegistrationName:
            final_results_dict["dbGaPStudyRegistrationName"] = dbGaPStudyRegistrationName.replace(f" ({dbGaPPhsID})", "")
        else:
            final_results_dict["dbGaPStudyRegistrationName"] = dbGaPStudyRegistrationName
        final_results_dict["embargoReleaseDate"] = coalesce(duos_study_dict.get("embargoReleaseDate"), dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
        final_results_dict["sequencingCenter"] = None
        final_results_dict["piEmail"] = coalesce(duos_study_dict.get("piEmail"), dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
        final_results_dict["piInstitution"] = coalesce(duos_study_dict.get("piInstitution"), dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
        final_results_dict["nihGrantContractNumber"] = None
        final_results_dict["nihICsSupportingStudy"] = coalesce(duos_study_dict.get("nihICsSupportingStudy"), dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
        final_results_dict["nihProgramOfficerName"] = coalesce(duos_study_dict.get("nihProgramOfficerName"), dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
        final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
        final_results_dict["nihInstitutionalCertificationFileName"] = None
        final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(duos_study_dict.get("nihGenomicProgramAdministratorName"), dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
        final_results_dict["multiCenterStudy"] = None
        final_results_dict["collaboratingSites"] = [consortium] if consortium else []
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
        final_results_dict["alternativeDataSharingPlan"] = False
        final_results_dict["alternativeDataSharingPlanReasons"] = []
        final_results_dict["alternativeDataSharingPlanExplanation"] = None
        final_results_dict["alternativeDataSharingPlanFileName"] = None
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
        final_results_dict["alternativeDataSharingPlanDataReleased"] = None
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
        final_results_dict["consentGroups.consentGroupName"] = consent_group_name
        if access_management == "controlled" and 'NHGRI' not in dac_names:
            final_results_dict["consentGroups.accessManagement"] = "external"
        else:
            final_results_dict["consentGroups.accessManagement"] = access_management
        final_results_dict["dacId"] = dac_names
        final_results_dict["consentGroups.numberOfParticipants"] = coalesce(terra_dict.get("consentGroups.numberOfParticipants"), dbgap_fhir_dict.get("consentGroups.numberOfParticipants"), dbgap_xml_dict.get("consentGroups.numberOfParticipants"), dbgap_study_api_dict.get("consentGroups.numberOfParticipants"), "0")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = True if access_management == "controlled" and "GRU" in consent_code else False
        final_results_dict["consentGroups.hmb"] = True if access_management == "controlled" and "HMB" in consent_code else False
        if purl_doid:
            final_results_dict["consentGroups.diseaseSpecificUse"] = purl_doid
        else:
            final_results_dict["consentGroups.diseaseSpecificUse"] = []
        final_results_dict["consentGroups.gs"] = consent_code if access_management == "controlled" and "GS-" in consent_code else None
        final_results_dict["consentGroups.poa"] = True if access_management == "controlled" and "POA" in consent_code else False
        final_results_dict["consentGroups.nmds"] = True if access_management == "controlled" and "NMDS" in consent_code else False
        final_results_dict["consentGroups.gso"] = True if access_management == "controlled" and "GSO" in consent_code else False
        final_results_dict["consentGroups.pub"] = True if access_management == "controlled" and "PUB" in consent_code else False 
        final_results_dict["consentGroups.col"] = True if access_management == "controlled" and "COL-" in consent_code else False
        final_results_dict["consentGroups.irb"] = True if access_management == "controlled" and "IRB" in consent_code else False
        final_results_dict["consentGroups.npu"] = True if access_management == "controlled" and "NPU" in consent_code else False
        final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and access_management == "controlled" and not (final_results_dict["consentGroups.generalResearchUse"] or final_results_dict["consentGroups.hmb"] or final_results_dict["consentGroups.diseaseSpecificUse"] or final_results_dict["consentGroups.gs"] or final_results_dict["consentGroups.poa"] or final_results_dict["consentGroups.nmds"] or final_results_dict["consentGroups.gso"] or final_results_dict["consentGroups.pub"] or final_results_dict["consentGroups.col"] or final_results_dict["consentGroups.irb"] or final_results_dict["consentGroups.npu"])) else None
        final_results_dict["consentGroups.otherSecondary"] = None
        final_results_dict["consentGroups.mor"] = None
        final_results_dict["consentGroups.morDate"] = None
        final_results_dict["consentGroups.dataLocation"] = "TDR Location"
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
        final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        final_results_dict["consortium"] = consortium
        dataset_details_records.append(final_results_dict)

    # Return results
    return dataset_details_records

#############################################
## Input Parameters
#############################################

# Specify the snapshots to pull data for:
snapshot_id_list = [
    '29ec75f0-53ac-405f-a973-b034126ae457',
    '9393d37f-8c9d-43fa-a42a-52536a24236d',
    '175d510a-b8e3-4e43-86da-0aec15ba1ce0',
    '8afa7677-ce77-4ff4-9968-04f8794f26bf',
    '69d0762d-8acd-4962-86eb-b924630858d0',
    '48417de5-c3b9-4a1b-807b-f7cb5ba05fea',
]

# Specify a mapping from phs-consent to DOID for DS consent codes (replace "_" with "-" in consent first)
# Keys are looked up as "<dbGaPPhsID>:<consent_code>", so every key must start with a well-formed
# "phs" accession or the entry can never match. A value may be a single PURL string or a list of PURLs.
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-MULTIPLE-DISEASES-IRB-COL-NPU-RD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-MBND-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GR-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-DSDI-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-RD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001901:DS-CVD-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002004:DS-AUT': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002032:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-AASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU': 'http://purl.obolibrary.org/obo/DOID_0060041',
    # The next two keys previously read "ph2002502:", which could never match a formatted PHS ID.
    'phs002502:DS-MLHLTH-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MH': 'http://purl.obolibrary.org/obo/DOID_150',
    # NOTE(review): the other DS-MBND entries map to DOID_150; confirm DOID_1289 is intended here.
    'phs002502:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_1289',
    'phs003200:DS-MSC-MDS': ['http://purl.obolibrary.org/obo/DOID_1909', 'http://purl.obolibrary.org/obo/DOID_4159']
}

# Token for use in DUOS (use gcloud auth print-access-token to get this)
duos_token = ""

# DUOS Environment
duos_env = "prod"

# Specify whether results should be written out to a file in the workspace bucket
write_to_bucket = True

# Set the below to "False" to turn off matching to existing dataset/study information in DUOS (to see what the original values would have been)
match_existing = False

#############################################
## Execution
#############################################
# Gather the metadata records for the listed snapshots, load them into a
# dataframe, and order the rows by study name and then consent group name.
dataset_details_records = fetch_dataset_details(
    snapshot_id_list,
    ds_consent_map,
    duos_token,
    duos_env,
    match_existing,
)
output = pd.DataFrame(dataset_details_records)
output_sorted = output.sort_values(
    by=["studyName", "consentGroups.consentGroupName"],
    ascending=[True, True],
    ignore_index=True,
)

#############################################
## Validation and Output
#############################################
# --- Unique-value validation -------------------------------------------------
# Work on a copy so the sorted output itself is left untouched
output_unique_val = output_sorted.copy()

# List-valued study fields are converted with try_join before counting distinct values
list_fields = ["dataTypes", "dataCustodianEmail", "nihICsSupportingStudy", "collaboratingSites", "alternativeDataSharingPlanReasons"]
for field in list_fields:
    output_unique_val[field] = [try_join(value) for value in output_unique_val[field]]

# Study-level columns are everything except consent-group columns and a few identifier columns
study_level_col_list = []
for col in output_unique_val.columns:
    if "consentGroups." in col or col in ["studyName", "snapshot_id", "consortium", "consentCode", "snapshot_duos_id", "match_duos_id"]:
        continue
    study_level_col_list.append(col)

# A study passes when no study-level column holds more than one distinct value
df_unique = output_unique_val.groupby("studyName")[study_level_col_list].nunique()
df_unique["unique_value_validation"] = [
    "Pass" if distinct_count <= 1 else "Fail"
    for distinct_count in df_unique.max(axis=1)
]

# --- Enum validation ---------------------------------------------------------
# Work on a second copy; each enum column is replaced by its validator's result
output_enum_val = output_sorted.copy()
output_enum_val["studyType"] = [val_study_type_enum(value) for value in output_enum_val["studyType"]]
output_enum_val["nihInstitutionCenterSubmission"] = [val_nih_inst_center_sub_enum(value) for value in output_enum_val["nihInstitutionCenterSubmission"]]
output_enum_val["nihICsSupportingStudy"] = [val_nih_ic_supp_study_enum(value) for value in output_enum_val["nihICsSupportingStudy"]]
output_enum_val["consentGroups.fileTypes.fileType"] = [val_file_type_enum(value) for value in output_enum_val["consentGroups.fileTypes.fileType"]]

# Study-level enums: the summed validator results per study must stay below 1 to pass
study_enum_cols = ["studyType", "nihInstitutionCenterSubmission", "nihICsSupportingStudy"]
df_study_enum = output_enum_val.groupby("studyName")[study_enum_cols].sum()
df_study_enum["study_enum_value_validation"] = [
    "Pass" if flagged_total < 1 else "Fail"
    for flagged_total in df_study_enum.max(axis=1)
]

# Consent-group-level enums: same rule, grouped by consent group name
consent_group_enum_cols = ["consentGroups.fileTypes.fileType"]
df_consent_group_enum = output_enum_val.groupby("consentGroups.consentGroupName")[consent_group_enum_cols].sum()
df_consent_group_enum["consent_group_enum_value_validation"] = [
    "Pass" if flagged_total < 1 else "Fail"
    for flagged_total in df_consent_group_enum.max(axis=1)
]

# --- Attach the three validation verdicts to the sorted output ---------------
output_sorted_validated = (
    output_sorted
    .join(df_unique["unique_value_validation"], on="studyName", how="left")
    .join(df_study_enum["study_enum_value_validation"], on="studyName", how="left")
    .join(df_consent_group_enum["consent_group_enum_value_validation"], on="consentGroups.consentGroupName", how="left")
)

# Write out output
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H:%M:%S")
output_file = f"dataset_metadata_{current_datetime_string}.tsv"
output_sorted_validated.to_csv(output_file, index=False, sep="\t")
!gsutil cp $output_file $ws_bucket/dataset_metadata/output/ 2> stdout

# Display outputs
print("----------------------------------------------------------------------------------------------------")
print("----------------------------------------------------------------------------------------------------")
print("Validated Metadata Output:")
display(output_sorted_validated.style.hide(axis="index"))


# Step 2: Load Reviewed Metadata into DUOS

In [None]:
#############################################
## Functions
#############################################

def format_list(input_list, min_items):
    """Coerce a metadata cell into a list, padding empty input with "Unknown".

    - A falsy value (None, "", [], 0) yields min_items copies of "Unknown",
      or an empty list when min_items is not positive.
    - A non-empty list is returned unchanged.
    - A non-empty string is parsed with ast.literal_eval and the parsed value
      is put through these same rules (so "[]" is padded like an empty list).
      A string that is not a valid Python literal raises from literal_eval.
    - Any other truthy value yields an empty list.
    """
    if not input_list:
        # Nothing usable supplied: fall back to placeholder entries
        return ["Unknown"] * min_items if min_items > 0 else []
    if isinstance(input_list, list):
        return input_list
    if isinstance(input_list, str):
        parsed_value = ast.literal_eval(input_list)
        return format_list(parsed_value, min_items)
    return []
    
def format_file_types(ft_list, fe):
    """Build the list of file type records for a consent group.

    ft_list is normalised with format_list; each resulting file type becomes
    {"fileType": <type>, "functionalEquivalence": <fe>}, where a falsy fe is
    replaced by "Unknown". A falsy ft_list yields an empty list.
    """
    if not ft_list:
        return []
    equivalence = fe if fe else "Unknown"
    return [
        {"fileType": file_type, "functionalEquivalence": equivalence}
        for file_type in format_list(ft_list, 0)
    ]
    
def _build_consent_group(input_entry, dataset_id, dataset_name, access_type):
    """Assemble the DUOS consent group payload for one input row.

    Which fields are sent depends on the row:
      - existing dataset (dataset_id is set): name, dataset id, participant
        count and location only, since this is an update;
      - new "open" dataset: access type, participant count and location;
      - new "external" dataset: the data use fields as well, but no DAC id;
      - any other new dataset: the data use fields plus the DAC id.

    Not sent for now: fileTypes content (enumeration; always an empty list),
    otherSecondary (excluded by request), mor and morDate (date format
    validation).
    """
    is_new_dataset = not dataset_id
    consent_group = {"consentGroupName": dataset_name}
    if is_new_dataset:
        if access_type not in ("open", "external"):
            consent_group["dataAccessCommitteeId"] = input_entry["dacId"]
        consent_group["accessManagement"] = access_type
    else:
        consent_group["datasetId"] = dataset_id
    consent_group["numberOfParticipants"] = input_entry["consentGroups.numberOfParticipants"]
    if is_new_dataset and access_type != "open":
        consent_group.update({
            "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
            "hmb": input_entry["consentGroups.hmb"],
            "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
            "gs": input_entry["consentGroups.gs"],
            "poa": input_entry["consentGroups.poa"],
            "nmds": input_entry["consentGroups.nmds"],
            "gso": input_entry["consentGroups.gso"],
            "pub": input_entry["consentGroups.pub"],
            "col": input_entry["consentGroups.col"],
            "irb": input_entry["consentGroups.irb"],
            "npu": input_entry["consentGroups.npu"],
            "otherPrimary": input_entry["consentGroups.otherPrimary"],
        })
    consent_group.update({
        "dataLocation": input_entry["consentGroups.dataLocation"],
        "url": input_entry["consentGroups.url"],
        "fileTypes": [],
    })
    return consent_group


def _build_study_registration(input_entry, data_submitter_id, consent_groups):
    """Assemble the DUOS study payload from one input row and its consent groups.

    Not sent for now: studyType (enumeration) and embargoReleaseDate (date
    format validation). piInstitution is hard-coded to 0 because DUOS expects
    the integer id of a registered institution, and nihGrantContractNumber is
    hard-coded to "Unknown" because the field is currently required.
    """
    return {
        "studyName": input_entry["studyName"],
        # The file stores newlines as the two characters "\n"; turn them back into real newlines
        "studyDescription": input_entry["studyDescription"].replace("\\n", "\n"),
        "dataTypes": format_list(input_entry["dataTypes"], 1),
        "phenotypeIndication": input_entry["phenotypeIndication"],
        "species": input_entry["species"],
        "piName": input_entry["piName"] if input_entry["piName"] else "NA",
        "dataSubmitterUserId": data_submitter_id,
        "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
        "publicVisibility": input_entry["publicVisibility"],
        "nihAnvilUse": input_entry["nihAnvilUse"],
        "submittingToAnvil": input_entry["submittingToAnvil"],
        "dbGaPPhsID": input_entry["dbGaPPhsID"],
        "dbGaPStudyRegistrationName": input_entry["studyName"],
        "sequencingCenter": input_entry["sequencingCenter"],
        "piEmail": input_entry["piEmail"],
        "piInstitution": 0,
        "nihGrantContractNumber": "Unknown",
        "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
        "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
        "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
        "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
        "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
        "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
        "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
        "alternativeDataSharingPlanExplanation": input_entry["alternativeDataSharingPlanExplanation"],
        # A plan flagged as present with no reasons listed defaults to "Other".
        # NOTE: kept as an equality test (not `is True`) because the cell value
        # comes from pandas and may not be the built-in True object.
        "alternativeDataSharingPlanReasons": (
            ["Other"]
            if input_entry["alternativeDataSharingPlan"] == True
            and input_entry["alternativeDataSharingPlanReasons"] == "[]"
            else format_list(input_entry["alternativeDataSharingPlanReasons"], 0)
        ),
        "consentGroups": consent_groups,
    }


def upload_to_duos(input_file, token, env, study_upload_list, preview_only, include_anvil_in_preview):
    """Register or update DUOS studies from a reviewed metadata TSV, or preview the upload.

    Stages:
      1. Read the tab-delimited input file with pandas.
      2. Group its rows into one DUOS study payload per study name, each
         carrying one consent group per row.
      3. For studies that already have a DUOS study id, add any dataset DUOS
         lists for that study that the input file does not mention, so the
         payload still includes it.
      4. Either display a preview table (preview_only) or POST new studies
         and PUT updates to existing ones.

    Args:
        input_file: Path or URL of the metadata TSV.
        token: Bearer token sent with every DUOS request.
        env: "prod" targets the production DUOS host; any other value targets dev.
        study_upload_list: Study names to process; empty means every study in the file.
        preview_only: When true, only display what would be uploaded; no study
            is created or updated.
        include_anvil_in_preview: In preview mode with an empty
            study_upload_list, also list AnVIL datasets already in DUOS that
            are not part of the upload.

    Returns:
        A list of [item, status, message] entries, one per logged stage.
    """
    # Determine the target URL from the env variable
    if env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"
    auth_headers = {"Authorization": f"Bearer {token}"}

    def is_selected(study_name):
        # An empty study_upload_list means every study is in scope
        return not study_upload_list or study_name in study_upload_list

    # Pull down specified file from the cloud
    results_log = []
    print(f"Downloading input file {input_file}...")
    try:
        input_df = pd.read_csv(input_file, delimiter="\t", encoding="unicode_escape")
        # Blank cells become "" so later truthiness checks treat them as missing
        input_df = input_df.astype(object).where(pd.notnull(input_df), None)
        input_df.fillna("", inplace=True)
        input_dict = input_df.to_dict(orient="records")
        results_log.append(["Input File Download", "Succeeded", ""])
    except Exception as e:
        msg = f"Error downloading input file ({input_file}): {str(e)}"
        results_log.append(["Input File Download", "Failed", msg])
        print(msg)
        return results_log

    # Parse and build DUOS schema for inputted file
    print("Parsing input file and formatting into DUOS schema...")
    upload_dict = {}
    study_lookup = {}
    try:
        # Determine data submitter id
        response = requests.get(url=f"{url}/api/user/me", headers=auth_headers).json()
        data_submitter_id = response["userId"]
        # Build dictionary for upload
        existing_dataset_cnt = 0
        new_dataset_cnt = 0
        for input_entry in input_dict:
            snapshot_id = input_entry["snapshot_id"]
            # IDs are normalised to integer strings (e.g. 12.0 -> "12"); blanks stay blank
            dataset_id = str(int(input_entry["target_dataset_id"])) if input_entry["target_dataset_id"] else input_entry["target_dataset_id"]
            dataset_name = input_entry["consentGroups.consentGroupName"]
            study_id = str(int(input_entry["target_study_id"])) if input_entry["target_study_id"] else input_entry["target_study_id"]
            study_name = input_entry["studyName"]
            if study_id:
                study_lookup[study_name] = study_id
            tar_ds_id = dataset_id or "ID_TBD"
            tar_st_id = study_id or "ID_TBD"
            access_type = input_entry["consentGroups.accessManagement"]
            # Parse this row's own file types (previously row 0 was parsed for every row).
            # The parsed value is not sent yet because fileTypes is excluded from the payload for now.
            file_types = []
            if input_entry.get("consentGroups.fileTypes"):
                file_types = json.loads(input_entry["consentGroups.fileTypes"])
            print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name} ({tar_st_id}). Target consent group is: {dataset_name} ({tar_ds_id})")

            if dataset_id:
                existing_dataset_cnt += 1
            else:
                new_dataset_cnt += 1
            consent_group_dict = _build_consent_group(input_entry, dataset_id, dataset_name, access_type)

            if study_name not in upload_dict:
                # First row seen for this study: create the study payload around this consent group
                upload_dict[study_name] = _build_study_registration(input_entry, data_submitter_id, [consent_group_dict])
            else:
                # Study already started: replace any consent group of the same name, otherwise append
                study_dict = upload_dict[study_name].copy()
                consent_group_list = [
                    consent_group
                    for consent_group in study_dict["consentGroups"]
                    if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]
                ]
                consent_group_list.append(consent_group_dict)
                study_dict["consentGroups"] = consent_group_list
                upload_dict[study_name] = study_dict
        msg = f"Input file formatting complete. Existing Datasets: {existing_dataset_cnt} New Datasets: {new_dataset_cnt}"
        print(msg)
        results_log.append(["Input File Parsing and Formatting", "Succeeded", msg])
    except Exception as e:
        msg = f"Error parsing and formatting input file: {str(e)}"
        results_log.append(["Input File Parsing and Formatting", "Failed", msg])
        print(msg)
        # NOTE: there is deliberately no early return here; the later stages run
        # on whatever studies were assembled before the error.

    # Loop through studies to upload and augment with any missing existing datasets
    print("Augmenting upload set with missing existing datasets...")
    dataset_upload_aug_list = []
    for study in upload_dict:
        if not is_selected(study):
            continue
        study_id = study_lookup.get(study)
        if not study_id:
            continue
        try:
            study_details = requests.get(
                url=f"{url}/api/dataset/study/registration/{study_id}",
                headers=auth_headers
            ).json()
            study_datasets_in_duos = {
                dataset.get("datasetId")
                for dataset in study_details["consentGroups"]
                if dataset.get("datasetId")
            }
            study_datasets_in_input = {
                dataset.get("datasetId")
                for dataset in upload_dict[study]["consentGroups"]
                if dataset.get("datasetId")
            }
            # Input IDs are strings, so DUOS IDs are compared in string form
            study_datasets_diff = {
                dataset_in_duos
                for dataset_in_duos in study_datasets_in_duos
                if str(dataset_in_duos) not in study_datasets_in_input
            }
            # Add missing datasets to the upload dict
            temp_cg = upload_dict[study]["consentGroups"].copy()
            for missing_dataset_id in study_datasets_diff:
                dataset_upload_aug_list.append(missing_dataset_id)
                dataset_details = requests.get(
                    url=f"{url}/api/dataset/v2/{missing_dataset_id}",
                    headers=auth_headers
                ).json()
                consent_group_name = dataset_details["name"]
                data_loc = ""
                data_loc_url = ""
                num_participants = 0
                for prop_entry in dataset_details["properties"]:
                    if prop_entry["propertyName"] == "Data Location":
                        data_loc = prop_entry["propertyValue"]
                    elif prop_entry["propertyName"] == "# of participants":
                        num_participants = prop_entry["propertyValue"]
                    elif prop_entry["propertyName"] == "URL":
                        data_loc_url = prop_entry["propertyValue"]
                temp_cg.append({
                    "consentGroupName": consent_group_name,
                    "datasetId": missing_dataset_id,
                    "numberOfParticipants": num_participants,
                    "dataLocation": data_loc,
                    "url": data_loc_url,
                    "fileTypes": []
                })
            upload_dict[study]["consentGroups"] = temp_cg
        except Exception as e:
            # Best effort: the study keeps its un-augmented consent groups
            print(f"WARNING: Issue retrieving study details for study_id {study_id}. May cause issues with upload downstream. Error: {str(e)}")

    # Preview of upload input dictionary
    if preview_only:
        # Build a preview of the upload dictionary data
        print("Building upload set preview...")
        output_preview = []
        snapshot_url_prefix = "https://data.terra.bio/snapshots/"
        for study_name, study_dict in upload_dict.items():
            if not is_selected(study_name):
                continue
            study_id = study_lookup.get(study_name) or f"ID_TBD ({study_name})"
            study_phs = study_dict["dbGaPPhsID"]
            for consent_group in study_dict["consentGroups"]:
                dataset_id = consent_group.get("datasetId")
                dataset_name = consent_group.get("consentGroupName")
                snapshot_url = consent_group.get("url")
                if snapshot_url and snapshot_url_prefix in snapshot_url:
                    snapshot_id = snapshot_url.replace(snapshot_url_prefix, "")
                    record_src = "upload_aug" if dataset_id in dataset_upload_aug_list else "upload"
                else:
                    snapshot_id = ""
                    record_src = "upload_aug"
                if dataset_id:
                    # Existing dataset: show what DUOS currently holds
                    dataset_details = requests.get(
                        url=f"{url}/api/dataset/v2/{dataset_id}",
                        headers=auth_headers
                    ).json()
                    dataset_identifier = dataset_details.get("datasetIdentifier")
                    dac_id = dataset_details.get("dacId") or ""
                    # A dataset without a dataUse record previews as all-default instead of crashing
                    duos_data_use_dict = dataset_details.get("dataUse") or {}
                    du_gru = duos_data_use_dict.get("generalUse") or False
                    du_hmb = duos_data_use_dict.get("hmbResearch") or False
                    du_disease = duos_data_use_dict.get("diseaseRestrictions") or []
                    du_poa = duos_data_use_dict.get("populationOriginsAncestry") or False
                    du_ethics = duos_data_use_dict.get("ethicsApprovalRequired") or False
                    du_collab = duos_data_use_dict.get("collaboratorRequired") or False
                    du_geog = duos_data_use_dict.get("geographicalRestrictions") or ""
                    du_genetic = duos_data_use_dict.get("geneticStudiesOnly") or False
                    du_pub = duos_data_use_dict.get("publicationResults") or False
                    du_nmds = duos_data_use_dict.get("methodsResearch") or False
                    du_npu = duos_data_use_dict.get("nonProfitUse") or False
                    du_other = duos_data_use_dict.get("other") or ""
                    access_management = ""
                    for prop_entry in dataset_details["properties"]:
                        if prop_entry["propertyName"] == "Access Management":
                            access_management = prop_entry["propertyValue"]
                            break
                else:
                    # New dataset: show what the upload payload would send
                    dataset_id = f"ID_TBD ({dataset_name})"
                    dataset_identifier = "ID_TBD"
                    dac_id = consent_group.get("dataAccessCommitteeId") or ""
                    du_gru = consent_group.get("generalResearchUse") or False
                    du_hmb = consent_group.get("hmb") or False
                    du_disease = consent_group.get("diseaseSpecificUse") or []
                    du_poa = consent_group.get("poa") or False
                    du_ethics = consent_group.get("irb") or False
                    du_collab = consent_group.get("col") or False
                    du_geog = consent_group.get("gs") or ""
                    du_genetic = consent_group.get("gso") or False
                    du_pub = consent_group.get("pub") or False
                    du_nmds = consent_group.get("nmds") or False
                    du_npu = consent_group.get("npu") or False
                    du_other = consent_group.get("otherPrimary") or ""
                    access_management = consent_group.get("accessManagement") or ""
                output_preview.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, dac_id, access_management, du_gru, du_hmb, du_disease, du_poa, du_ethics, du_collab, du_geog, du_genetic, du_pub, du_nmds, du_npu, du_other, snapshot_id, record_src])

        # Add in AnVIL datasets not in the upload dictionary
        if not study_upload_list and include_anvil_in_preview:
            anvil_datasets_in_duos = get_anvil_datasets_from_duos(token, env)
            # Column 3 is the dataset ID; track the IDs already previewed to skip duplicates
            previewed_dataset_ids = {str(output_dataset[3]) for output_dataset in output_preview}
            for dataset in anvil_datasets_in_duos:
                if str(dataset[3]) not in previewed_dataset_ids:
                    rec_to_add = dataset.copy()
                    rec_to_add.append("prod_add")
                    output_preview.append(rec_to_add)
                    previewed_dataset_ids.add(str(dataset[3]))

        # Display output preview
        df_results = pd.DataFrame(output_preview, columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "DAC ID", "Access", "GRU", "HMB", "DS", "POA", "IRB", "COL", "GS", "GSO", "PUB", "NMDS", "NPU", "OTHER", "Snapshot ID", "Record Source"])
        print("\nOutput Preview:")
        display(df_results)

    else:
        print("Uploading studies to DUOS...")
        for study in upload_dict:
            if not is_selected(study):
                continue
            study_id = study_lookup.get(study)
            if not study_id:
                # For studies that don't exist in DUOS, create a new study
                print("Study does NOT currently exist in DUOS. Registering new study...")
                log_item = f"New Study Registration - {study}"
                http_method = "POST"
                target_url = f"{url}/api/dataset/v3"
            else:
                # For studies that already exist in DUOS, update the existing study
                print("Study DOES currently exist in DUOS. Updating study...")
                log_item = f"Study Registration Update - {study}"
                http_method = "PUT"
                target_url = f"{url}/api/dataset/study/{study_id}"
            try:
                # The payload goes as multipart form parts; the two file parts are sent empty
                study_response = requests.request(
                    http_method,
                    url=target_url,
                    headers=auth_headers,
                    files = {
                        "dataset": json.dumps(upload_dict[study]),
                        "alternativeDataSharingPlan": "",
                        "consentGroups[0].nihInstitutionalCertificationFile": ""
                    }
                ).json()
                if study_response.get("studyId"):
                    study_id = study_response["studyId"]
                    status = "Succeeded"
                    msg = f"Study registration succeeded! Study Id: {study_id}"
                else:
                    err_msg = study_response["message"]
                    status = "Failed"
                    msg = f"Study registration failed: {err_msg}"
            except Exception as e:
                status = "Failed"
                msg = f"Study registration failed: {str(e)}"
            results_log.append([log_item, status, msg])
            print(msg)

    # Return results
    return results_log


#############################################
## Input Parameters
#############################################

# Cloud path to the reviewed metadata TSV to process (passed to upload_to_duos as input_file)
input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/input/firecloud_target_tcga_metadata_20250227.tsv"

# User token (use gcloud auth print-access-token to get this).
# Sent as the Bearer token on every DUOS request; do not save the notebook with a real token filled in.
duos_token = ""

# Environment: "prod" targets production DUOS; any other value targets dev
duos_env = "dev"

# Study Upload List (to limit which studies are uploaded; leave empty for all).
# Entries must match the studyName values in the input file exactly.
study_upload_list = [
#     "Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642)",
#     "Center for Common Disease Genomics [CCDG] - Neuropsychiatric: Epilepsy: Epi25 Consortium (phs001489)",
#     "Center for Common Disease Genomics [CCDG] Neuropsychiatric: Autism Spectrum Disorder (ASD) - Whole Exomes (phs002502)",
]

# Specifies whether the upload should run (False) or if only a preview of the upload should be displayed (True)
preview_only = True

# For preview_only = True cases, specifies whether AnVIL datasets not included in the upload should also be displayed.
# Only takes effect when study_upload_list is empty.
include_anvil_in_preview = False

#############################################
## Execution
#############################################

# upload_to_duos returns a list of [item, status, message] log entries, shown below as a table
upload_results = upload_to_duos(input_file_gcs_path, duos_token, duos_env, study_upload_list, preview_only, include_anvil_in_preview)
df_results = pd.DataFrame(upload_results, columns = ["Item", "Status", "Message"])
print("\nUpload Results:")
display(df_results)


# Step 3: Sync DUOS Datasets with TDR Snapshots
Based on the provided pairs of Snapshots and DUOS IDs:
- If no DUOS ID is specified, the existing DUOS ID will be removed from the snapshot. The DUOS group will NOT be automatically removed from the snapshot's auth domain group, because that auth domain group may be shared by multiple snapshots.
- If a DUOS ID is specified and the snapshot doesn't currently have one, the DUOS ID will be linked to the snapshot, the DUOS group will be added to the snapshot auth domain group, and the DUOS dataset registration will be updated to point to the snapshot.
- If a DUOS ID is specified and the snapshot currently has a different one, the new DUOS ID will be linked to the snapshot, the new DUOS group will be added to the snapshot auth domain group, the old DUOS group will be removed from the snapshot auth domain group, and the DUOS dataset registration will be updated to point to the snapshot.

In [None]:
#############################################
## Functions
#############################################

def _call_with_retries(operation, max_attempts=3, delay_seconds=5):
    """Call ``operation`` until it succeeds or the attempts are used up.

    Args:
        operation: Zero-argument callable; any exception it raises counts as a failed attempt.
        max_attempts: Total number of attempts to make.
        delay_seconds: Pause between failed attempts (there is no pause after the last one).

    Returns:
        ``(result, None)`` on success, or ``(None, last_exception)`` when every attempt failed.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            return operation(), None
        except Exception as e:
            last_error = e
            if attempt < max_attempts:
                sleep(delay_seconds)
    return None, last_error


def _get_snapshot_duos_id(snapshots_api, snapshot_id):
    """Return the DUOS ID currently recorded on the snapshot, or a falsy value when there is none."""
    response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["DUOS"]).to_dict()
    duos_firecloud_group = response["duos_firecloud_group"]
    if duos_firecloud_group:
        return duos_firecloud_group.get("duos_id")
    return ""


def _fetch_duos_user_group(duos_api, duos_id, results_log):
    """Return the Firecloud group email for ``duos_id`` ("" on failure) and log the outcome."""
    result_label = f"DUOS User Group Fetching ({duos_id})"
    group_email, error = _call_with_retries(
        lambda: duos_api.retrieve_duos_firecloud_group(duos_id=duos_id).to_dict()["firecloud_group_email"]
    )
    if error is not None:
        msg = f"Error fetching DUOS user group for DUOS ID {duos_id}: {str(error)}"
        results_log.append([result_label, "Failed", msg])
        return ""
    results_log.append([result_label, "Success", ""])
    return group_email


def _link_duos_dataset(snapshots_api, snapshot_id, duos_id):
    """Link ``duos_id`` to the snapshot; raise if the response does not confirm the link."""
    response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
    if not response.get("linked"):
        # Raising turns an unconfirmed response into a normal failed attempt for the retry helper
        raise RuntimeError(response.get("message") or "Response did not confirm the link.")
    return response


def _unlink_duos_dataset(snapshots_api, snapshot_id):
    """Unlink the DUOS ID from the snapshot; raise if the response carries an error message."""
    response = snapshots_api.unlink_duos_dataset_from_snapshot(id=snapshot_id).to_dict()
    if response.get("message"):
        raise RuntimeError(response.get("message"))
    return response


def sync_duos_ids_and_snapshots(duos_token, duos_env, snapshot_duos_list):
    """Link, re-link, or unlink DUOS IDs on TDR snapshots and log the outcome of every step.

    For each ``[snapshot_id, duos_id]`` pair:
      * Empty ``duos_id``: the DUOS ID currently on the snapshot (if any) is unlinked. The DUOS
        user group is left in the snapshot's DAC group(s).
      * Non-empty ``duos_id``: the DUOS ID is linked to the snapshot, its user group is added to
        every DAC group listed in the snapshot's ``auth_domain`` policy, and the DUOS dataset
        registration is patched to point at the snapshot URL. If the snapshot previously carried
        a different DUOS ID, that ID's user group is removed from the DAC group(s).

    Args:
        duos_token: Bearer token sent on the DUOS API requests.
        duos_env: "prod" targets production DUOS; any other value targets dev.
        snapshot_duos_list: Iterable of ``[snapshot_id, duos_id]`` pairs; use an empty string as
            ``duos_id`` to remove the existing DUOS ID from the snapshot.

    Returns:
        List of ``[item, status, message]`` rows, one per logged step. API failures within the
        individual steps are recorded here rather than raised.
    """
    results_log = []

    # Determine the target URL from the env variable
    if duos_env == "prod":
        duos_url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        duos_url = "https://consent.dsde-dev.broadinstitute.org"

    # Loop through input snapshots and link DUOS IDs to them
    print("Syncing DUOS IDs to Snapshots...")
    for ss_duos_entry in snapshot_duos_list:
        # Fresh credentials per snapshot: creds.token is used for the Firecloud group calls below,
        # while the TDR client obtains its own token inside refresh_tdr_api_client()
        creds, _ = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        api_client = refresh_tdr_api_client()
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        duos_api = data_repo_client.DuosApi(api_client=api_client)
        snapshot_id = ss_duos_entry[0]
        new_duos_id = ss_duos_entry[1]
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        duos_action = "LINK" if new_duos_id else "UNLINK"

        # Get current DUOS ID on the snapshot
        print(f"\t\t- Retrieving original DUOS ID from snapshot {snapshot_id} (if any).")
        current_duos_id, error = _call_with_retries(
            lambda: _get_snapshot_duos_id(snapshots_api, snapshot_id)
        )
        if error is not None:
            # Without the current state nothing else can be done safely for this snapshot
            msg = f"Error retrieving original DUOS ID from Snapshot: {str(error)}"
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id} - {new_duos_id})", "Failed", msg])
            continue

        # Get DUOS user group associated with new and current duos id
        new_duos_group = ""
        current_duos_group = ""
        if new_duos_id:
            print(f"\t\t- Fetching DUOS user group from new DUOS ID {new_duos_id}.")
            new_duos_group = _fetch_duos_user_group(duos_api, new_duos_id, results_log)
        if current_duos_id:
            if new_duos_id == current_duos_id:
                print(f"\t\t- New DUOS ID matches original DUOS ID on snapshot, so no additinoal DUOS user group to fetch.")
                current_duos_group = new_duos_group
            else:
                print(f"\t\t- Fetching DUOS user group from original DUOS ID {current_duos_id}.")
                current_duos_group = _fetch_duos_user_group(duos_api, current_duos_id, results_log)

        # Processing DUOS_ID-Snapshot sync
        if duos_action == "LINK":
            # Link the DUOS ID to the snapshot
            print(f"\t\t- Linking DUOS ID {new_duos_id} to snapshot.")
            link_label = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {new_duos_id})"
            _, error = _call_with_retries(
                lambda: _link_duos_dataset(snapshots_api, snapshot_id, new_duos_id)
            )
            if error is None:
                results_log.append([link_label, "Success", ""])
            else:
                msg = f"Error linking DUOS ID to Snapshot: {str(error)}"
                results_log.append([link_label, "Failed", msg])

            # Add the DUOS user group to any DAC groups on the snapshot
            if not new_duos_group:
                # The group lookup failed above; a request with an empty member would be meaningless
                msg = "DUOS user group could not be determined, so nothing was added to the snapshot DAC user group(s)."
                print(f"\t\t- {msg}")
                results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {new_duos_id})", "Failed", msg])
            else:
                print(f"\t\t- Adding DUOS user group {new_duos_group} to snapshot DAC user group(s).")
                dac_groups = []
                try:
                    response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id).to_dict()
                    if response.get("auth_domain"):
                        dac_groups = response["auth_domain"]
                    if dac_groups:
                        print(f"\t\t\t- DAC user group(s) found on snapshot: {dac_groups}.")
                        for dac_group in dac_groups:
                            response = requests.put(
                                url=f"https://api.firecloud.org/api/groups/{dac_group}/member/{new_duos_group}",
                                headers={"Authorization": f"Bearer {creds.token}"}
                            )
                            # Only a 204 response is treated as a successful membership change
                            if response.status_code != 204:
                                results_log.append([f"DUOS Group to DAC Group Addition ({new_duos_group} - {dac_group})", "Failed", "Error adding DUOS group to DAC group."])
                            else:
                                results_log.append([f"DUOS Group to DAC Group Addition ({new_duos_group} - {dac_group})", "Success", ""])
                    else:
                        msg = f"No DAC user group(s) found on snapshot."
                        print(f"\t\t\t- {msg}")
                        results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {new_duos_id})", "Warning", msg])
                except Exception as e:
                    msg = f"Error adding DUOS Group to DAC Group: {str(e)}"
                    results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {new_duos_id})", "Failed", msg])

            # Retrieve DUOS registration
            print(f"\t\t- Retrieving DUOS dataset registration for DUOS ID {new_duos_id}.")
            # Reset both values so a failed lookup can never reuse the previous snapshot's registration
            duos_dataset_id = ""
            duos_dataset_name = ""
            try:
                dataset_details = requests.get(
                    url=f"{duos_url}/api/tdr/{new_duos_id}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
                duos_dataset_id = dataset_details["datasetId"]
                duos_dataset_name = dataset_details["name"]
                results_log.append([f"DUOS Dataset Retrieval ({snapshot_id} - {new_duos_id})", "Success", f"Dataset_id = {duos_dataset_id}"])
            except Exception as e:
                msg = f"Error retrieving DUOS dataset registration: {str(e)}"
                results_log.append([f"DUOS Dataset Retrieval ({snapshot_id} - {new_duos_id})", "Failed", msg])

            # Update snapshot on the DUOS registration
            if duos_dataset_id and duos_dataset_name:
                payload = {
                    "name": duos_dataset_name,
                    "properties": [{
                        "propertyName": "URL",
                        "propertyValue": f"https://data.terra.bio/snapshots/{snapshot_id}",
                        "schemaProperty": "url",
                        "propertyType": "String"
                    }, {
                        "propertyName": "Data Location",
                        "propertyValue": "TDR Location",
                        "schemaProperty": "dataLocation",
                        "propertyType": "String"
                    }]
                }
                try:
                    dataset_patch_response = requests.patch(
                        url=f"{duos_url}/api/dataset/{duos_dataset_id}",
                        headers={"Authorization": f"Bearer {duos_token}"},
                        json=payload
                    )
                    if dataset_patch_response.status_code == 200:
                        results_log.append([f"DUOS Dataset Patch ({snapshot_id} - {new_duos_id})", "Success", ""])
                    elif dataset_patch_response.status_code == 304:
                        results_log.append([f"DUOS Dataset Patch ({snapshot_id} - {new_duos_id})", "Success", "Dataset not modified. No patch required."])
                    else:
                        err = dataset_patch_response.text
                        msg = f"Error patching DUOS Dataset: {err}"
                        results_log.append([f"DUOS Dataset Patch ({snapshot_id} - {new_duos_id})", "Failed", msg])
                except Exception as e:
                    msg = f"Error patching DUOS Dataset: {str(e)}"
                    results_log.append([f"DUOS Dataset Patch ({snapshot_id} - {new_duos_id})", "Failed", msg])
            else:
                results_log.append([f"DUOS Dataset Patch ({snapshot_id} - {new_duos_id})", "Failed", "Missing DUOS dataset ID or name"])

        elif duos_action == "UNLINK" and current_duos_id:
            # Unlink the current DUOS ID from the snapshot (if any)
            print(f"\t\t- No DUOS ID specified. Removing current DUOS ID {current_duos_id} from the snapshot.")
            unlink_label = f"Remove existing DUOS ID from Snapshot ({snapshot_id})"
            _, error = _call_with_retries(
                lambda: _unlink_duos_dataset(snapshots_api, snapshot_id)
            )
            if error is None:
                results_log.append([unlink_label, "Success", ""])
            else:
                msg = f"Error removing DUOS ID from Snapshot: {str(error)}"
                results_log.append([unlink_label, "Failed", msg])
        else:
            print(f"\t\t- No DUOS ID specified and no current DUOS ID to remove from the snapshot.")
            results_log.append([f"Remove existing DUOS ID from Snapshot ({snapshot_id})", "Success", ""])

        # If a new duos ID is replacing an existing duos ID, remove the current DUOS user group from any DAC groups on the snapshot
        if new_duos_group and current_duos_group and new_duos_group != current_duos_group:
            print(f"\t\t- Removing DUOS user group {current_duos_group} from snapshot DAC user group(s).")
            dac_groups = []
            try:
                response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id).to_dict()
                if response.get("auth_domain"):
                    dac_groups = response["auth_domain"]
                if dac_groups:
                    print(f"\t\t\t- DAC user group(s) found on snapshot: {dac_groups}.")
                    for dac_group in dac_groups:
                        response = requests.delete(
                            url=f"https://api.firecloud.org/api/groups/{dac_group}/member/{current_duos_group}",
                            headers={"Authorization": f"Bearer {creds.token}"}
                        )
                        if response.status_code != 204:
                            results_log.append([f"DUOS Group Removal from DAC Group ({current_duos_group} - {dac_group})", "Failed", "Error removing DUOS group from DAC group."])
                        else:
                            results_log.append([f"DUOS Group Removal from DAC Group ({current_duos_group} - {dac_group})", "Success", ""])
                else:
                    msg = f"No DAC user group(s) found on snapshot."
                    print(f"\t\t\t- {msg}")
                    results_log.append([f"DUOS Group Removal from DAC Group ({current_duos_group} - N/A)", "Warning", msg])
            except Exception as e:
                msg = f"Error removing DUOS Group from DAC Group: {str(e)}"
                results_log.append([f"DUOS Group Removal from DAC Group ({current_duos_group})", "Failed", msg])

    return results_log

#############################################
## Input Parameters
#############################################

# Token for use in DUOS (use gcloud auth print-access-token to get this)
# Passed to sync_duos_ids_and_snapshots, which sends it as the bearer token on DUOS requests
duos_token = ""

# DUOS Environment
# "prod" selects the production DUOS URL; any other value selects the dev URL
duos_env = "prod"

# Snapshot list
# Each entry is a [snapshot_id, duos_id] pair; entries are processed in the order listed
snapshot_duos_list = [
    #['snapshot_id', 'duos_id (or empty string to remove existing duos_id from snapshot)']
    ['34526f3b-945e-4f81-a25c-150f342b46c0', 'DUOS-000474'],
    ['22f3c573-ea7a-4ba3-ad0a-1680dc44c4bb', 'DUOS-000485'],
]

#############################################
## Execution
#############################################

# Run the sync and show one table row per logged step (Item / Status / Message)
results = sync_duos_ids_and_snapshots(duos_token, duos_env, snapshot_duos_list)
df_results = pd.DataFrame(results, columns = ["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)

# Script Development

## Fetch parameters from snapshot/dataset

In [None]:
# Parameters
snapshot_id = "099d2585-1379-4333-b3b1-ffc0d26d95c5"

# Retrieve snapshot details
api_client = refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
# Only the first source dataset of the snapshot is inspected
dataset_id = snapshot_details["source"][0]["dataset"]["id"]
phs_id = snapshot_details["source"][0]["dataset"]["phs_id"]

# Retrieve dataset details
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
# Tolerate a dataset without a properties object
dataset_properties = dataset_details.get("properties") or {}
# Reset on every run so a missing property cannot leave an undefined name, or a stale value
# from a previously inspected snapshot, behind
auth_domain = None
source_workspace = None
if dataset_properties.get("auth_domains"):
    auth_domain = dataset_properties["auth_domains"][0]
if dataset_properties.get("source_workspaces"):
    source_workspace = dataset_properties["source_workspaces"][0]

# Print output
print(phs_id)
print(source_workspace)

## Pulling Workspace Attributes

In [None]:
# Parameters
ws_project = "anvil-datastorage"
ws_name = "AnVIL_DepMap_HMB"

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Pull workspace attributes
ws_response = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
    headers={"Authorization": f"Bearer {creds.token}"}
)
# Fail with the HTTP error itself rather than a KeyError on the error payload further down
ws_response.raise_for_status()
ws_attributes = ws_response.json()

# Map to schema
attributes = ws_attributes["workspace"]["attributes"]
terra_dict = {}
terra_dict["studyName"] = attributes.get("library:projectName")
terra_dict["studyType"] = attributes.get("library:studyDesign")
#terra_dict["studyDescription"] = attributes.get("description")
# List-valued attributes are wrapped as {"items": [...]}; a missing attribute maps to None
terra_dict["dataTypes"] = (attributes.get("library:dataCategory") or {}).get("items")
terra_dict["phenotypeIndication"] = attributes.get("library:indication")
terra_dict["species"] = "Homo sapiens"
terra_dict["piName"] = attributes.get("library:datasetOwner")
terra_dict["dataCustodianEmail"] = attributes.get("library:contactEmail")
for tag in (attributes.get("tag:tags") or {}).get("items") or []:
    # Split on the first colon only, so a value that itself contains a colon is kept whole
    if "Consortium:" in tag:
        terra_dict["consortium"] = tag.split(":", 1)[1].strip()
    elif "dbGaP:" in tag:
        terra_dict["dbGaPPhsID"] = tag.split(":", 1)[1].strip()
terra_dict["consentGroups.consentCode"] = attributes.get("library:dataUseRestriction")
terra_dict["consentGroups.fileTypes.fileType"] = (attributes.get("library:datatype") or {}).get("items")

# View schema
print(terra_dict)


In [None]:
# Inspect the raw workspace response (a notebook cell displays the value of its last expression)
ws_attributes

In [None]:
# Inspect the raw workspace response (a notebook cell displays the value of its last expression)
ws_attributes

## dbGaP XML Parse

In [None]:
# Parameters
#phs_id = "phs003047"
phs_id = "phs003444"

# Pull and parse XML
phs_short = phs_id.replace("phs", "")
dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
response = requests.get(url=dbgap_url)
xml_data = xmltodict.parse(response.text)

# Map to schema
# NOTE: xmltodict yields a dict for an element that occurs once and a list when it repeats,
# so every repeatable element below is normalized before it is iterated.
dac_names = ""
dbgap_xml_dict = {}
studies = xml_data["dbgapss"]["Study"]
# When several Study elements come back, only the first one is used
study_data = studies[0] if isinstance(studies, list) else studies
study_info = study_data["StudyInfo"]
dbgap_xml_dict["studyName"] = study_info.get("StudyNameEntrez")
dbgap_xml_dict["studyDescription"] = study_info.get("Description")
dbgap_xml_dict["dbGaPPhsID"] = phs_id
dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_info.get("StudyNameEntrez")
persons = study_data["Authority"]["Persons"]["Person"]
if not isinstance(persons, list):
    # A single Person would otherwise be iterated key by key
    persons = [persons]
for ap_entry in persons:
    if ap_entry["Role"] == "PI":
        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
ics = study_data["Authority"]["ICs"]["IC"]
if not isinstance(ics, list):
    ics = [ics]
ic_list = [ic_entry["@name"] for ic_entry in ics]
dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
consent_groups = study_data["Policy"]["ConsentGroup"]
if isinstance(consent_groups, list):
    # De-duplicate the DAC names; sorting makes the joined string reproducible between runs
    dac_names = ", ".join(sorted({consent["@dac_name"] for consent in consent_groups}))
else:
    dac_names = consent_groups["@dac_name"]

# View schema
print(dbgap_xml_dict)


In [None]:
# Show the DAC name(s) collected from the study's consent groups
dac_names

In [None]:
# Check which Python type the parsed ConsentGroup element came back as
type(study_data["Policy"]["ConsentGroup"])

## dbGaP Study API

In [None]:
# Parameters
study_uid = 483191234

# Pull and parse JSON
dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
response = requests.get(url=dbgap_study_url)
study_api_data = json.loads(response.text)

# Map to schema
# The dict stays empty when the response carries an "error" entry
dbgap_study_api_dict = {}
if study_api_data.get("error") is None:
    study_config = study_api_data["data"]
    dbgap_study_api_dict["studyName"] = study_config.get("report_name")
    dbgap_study_api_dict["studyDescription"] = study_config.get("description")
    dbgap_study_api_dict["phenotypeIndication"] = study_config.get("primary_disease")
    dbgap_study_api_dict["studyType"] = study_config.get("study_design")
    # A study without an attribution list simply contributes no PI fields
    for attr_entry in study_config.get("attribution") or []:
        if attr_entry.get("title") == "Principal Investigator":
            # Only the first Principal Investigator entry is recorded
            dbgap_study_api_dict["piName"] = attr_entry.get("name")
            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
            break

# View schema
print(dbgap_study_api_dict)

In [None]:
# Inspect the raw study_config response parsed in the previous cell
study_api_data

## dbGaP FHIR API

In [None]:
# Parameters
#phs_id = "phs003047"
phs_id = "phs000693"

# Pull and parse JSON
dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
response = requests.get(url=dbgap_fhir_url)
fhir_data = json.loads(response.text)

# Map to schema
# Only the first entry of the response is used. Optional collections are read with
# "or []" so that a study lacking one of them does not abort the whole mapping.
resource = fhir_data["entry"][0]["resource"]
dbgap_fhir_dict = {}
dbgap_fhir_dict["studyName"] = resource.get("title")
dbgap_fhir_dict["studyDescription"] = resource.get("description")
dbgap_fhir_dict["dbGaPPhsID"] = phs_id
dbgap_fhir_dict["dbGaPStudyRegistrationName"] = resource.get("title")
dbgap_fhir_dict["nihICsSupportingStudy"] = (resource.get("sponsor") or {}).get("display")
# studyType
study_types = []
for cat_entry in resource.get("category") or []:
    for coding_entry in cat_entry.get("coding") or []:
        if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
            # Prefer the human-readable display text and fall back to the code
            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
            if value:
                study_types.append(value)
if study_types:
    # The key is only present when at least one study design was found
    dbgap_fhir_dict["studyType"] = ", ".join(study_types)
# dataTypes
dt_list = []
for ext_entry in resource.get("extension") or []:
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding") or []:
                    dt_list.append(coding_entry.get("code"))
dbgap_fhir_dict["dataTypes"] = dt_list
# phenotypeIndication
indications = []
for focus_entry in resource.get("focus") or []:
    for coding_entry in focus_entry.get("coding") or []:
        value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
        if value:
            indications.append(value)
if indications:
    dbgap_fhir_dict["phenotypeIndication"] = ", ".join(indications)
# numberOfParticipants
for ext_entry in resource.get("extension") or []:
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                dbgap_fhir_dict["numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")

# View schema
print(dbgap_fhir_dict)

In [None]:
# Inspect the raw FHIR response parsed in the previous cell
fhir_data

# Utilities

## Create Study Lookup

In [None]:
#############################################
## Input Parameters
#############################################

# User token (use gcloud auth print-access-token to get this)
duos_token = ""

# Environment
duos_env = "prod"


#############################################
## Execution
#############################################

if duos_env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"

# Iterate through studies sequentially
# Study IDs are probed one at a time starting from 1; the scan ends once this many
# consecutive IDs fail to yield a usable registration.
max_consecutive_missing = 50
results = []
study_id = 1
missing_count = 0
print("Pulling study registrations...")
while missing_count < max_consecutive_missing:
    try:
        print(f"Attempting to pull registration for study_id = {study_id}...")
        study_registration = requests.get(
            url=f"{url}/api/dataset/study/registration/{study_id}",
            headers={"Authorization": f"Bearer {duos_token}"}
        ).json()
        study_name = study_registration["studyName"]
        consent_groups = study_registration["consentGroups"]
        cg_length = len(consent_groups) if consent_groups else 0
        results.append([study_id, study_name, cg_length])
        missing_count = 0
    except Exception:
        # Any failure (request error, non-JSON body, missing keys) counts as a missing study.
        # Catching Exception rather than everything keeps the scan interruptible from the notebook.
        missing_count += 1
    study_id += 1

# Display study lookup
print("\nDUOS Studies: ")
results_df = pd.DataFrame(results, columns=["study_id", "study_name", "consent_group_count"])
display(results_df)
        

In [None]:
# Inspect the most recently parsed study registration response from the lookup cell
study_registration

## Delete Studies from DUOS

In [None]:
#############################################
## Input Parameters
#############################################

# User token (use gcloud auth print-access-token to get this)
duos_token = ""

# Environment
duos_env = "dev"

# Studies to delete
study_id_list = [
]

#############################################
## Execution
#############################################

if duos_env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"

# Delete studies
for study_id in study_id_list:
    print(f"Deleting study ID {study_id}")
    response = requests.delete(
        url=f"{url}/api/dataset/study/{study_id}",
        headers={"Authorization": f"Bearer {duos_token}"} 
    )
    if response.status_code == 200:
        print("Study deleted successfully.")
    else:
        # Error bodies are not guaranteed to be JSON with a "message" field; fall back to the
        # raw body (or the status code) so one odd response does not abort the remaining deletions
        try:
            msg = response.json()["message"]
        except (ValueError, KeyError, TypeError):
            msg = response.text or f"HTTP {response.status_code}"
        print(f"Error deleting study: {msg}")
    

## Delete Datasets from DUOS

In [None]:
#############################################
## Input Parameters
#############################################

# User token (use gcloud auth print-access-token to get this)
duos_token = ""

# Environment
duos_env = "dev"

# Datasets to delete
dataset_id_list = [ 
]

#############################################
## Execution
#############################################

if duos_env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"

# Delete datasets
for dataset_id in dataset_id_list:
    print(f"Deleting dataset ID {dataset_id}")
    response = requests.delete(
        url=f"{url}/api/dataset/index/{dataset_id}",
        headers={"Authorization": f"Bearer {duos_token}"} 
    )
    if response.status_code == 200:
        print("Dataset deleted successfully.")
    else:
        # Error bodies are not guaranteed to be JSON with a "message" field; fall back to the
        # raw body (or the status code) so one odd response does not abort the remaining deletions
        try:
            msg = response.json()["message"]
        except (ValueError, KeyError, TypeError):
            msg = response.text or f"HTTP {response.status_code}"
        print(f"Error deleting dataset: {msg}")
        

## Export Datasets from DUOS

In [None]:
def coalesce(*arg):
    """Return the first argument that carries a usable value, or None.

    Arguments are examined left to right:

    * ``False`` or an empty list is returned immediately, so either can be
      passed last as an explicit default.
    * A non-empty list is returned with ``None`` entries and placeholder
      entries removed. The result may be an empty list; later arguments are
      not consulted in that case.
    * Any other truthy value is returned unless its uppercased string form is
      a placeholder ("", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED").
    * Remaining falsy values (``None``, ``""``, ``0``) are skipped.

    List elements are compared via ``str(...)``, the same as scalar values,
    so lists holding non-string entries (numbers, booleans, dicts) are
    filtered instead of raising AttributeError.
    """
    placeholders = {"", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED"}
    for input_item in arg:
        if input_item is False or input_item == []:
            return input_item
        if not input_item:
            continue
        if isinstance(input_item, list):
            return [
                ele for ele in input_item
                if ele is not None and str(ele).upper() not in placeholders
            ]
        if str(input_item).upper() not in placeholders:
            return input_item
    return None

#############################################
## Input Parameters
#############################################

# User token (use gcloud auth print-access-token to get this)
duos_token = ""

# Environment ("prod" targets production; any other value targets dev)
duos_env = "prod"


#############################################
## Execution
#############################################

if duos_env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"
auth_headers = {"Authorization": f"Bearer {duos_token}"}

# Registration fields copied to the output unchanged, listed in output column
# order. "nihAnvilUse" is normalized separately and sits between the two groups.
study_fields_before_nih_anvil_use = (
    "studyName",
    "studyType",
    "studyDescription",
    "dataTypes",
    "phenotypeIndication",
    "species",
    "piName",
    "dataCustodianEmail",
    "publicVisibility",
)
study_fields_after_nih_anvil_use = (
    "submittingToAnvil",
    "dbGaPPhsID",
    "dbGaPStudyRegistrationName",
    "embargoReleaseDate",
    "sequencingCenter",
    "piEmail",
    "piInstitution",
    "nihGrantContractNumber",
    "nihICsSupportingStudy",
    "nihProgramOfficerName",
    "nihInstitutionCenterSubmission",
    "nihInstitutionalCertificationFileName",
    "nihGenomicProgramAdministratorName",
    "multiCenterStudy",
    "collaboratingSites",
    "controlledAccessRequiredForGenomicSummaryResultsGSR",
    "controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation",
    "alternativeDataSharingPlan",
    "alternativeDataSharingPlanReasons",
    "alternativeDataSharingPlanExplanation",
    "alternativeDataSharingPlanFileName",
    "alternativeDataSharingPlanDataSubmitted",
    "alternativeDataSharingPlanDataReleased",
    "alternativeDataSharingPlanTargetDeliveryDate",
    "alternativeDataSharingPlanTargetPublicReleaseDate",
    "alternativeDataSharingPlanAccessManagement",
)
# Consent-group fields copied unchanged, before and after the data-use columns.
consent_group_fields_leading = (
    "consentGroupName",
    "accessManagement",
    "numberOfParticipants",
)
consent_group_fields_trailing = (
    "otherSecondary",
    "mor",
    "morDate",
    "dataLocation",
    "url",
)

datasets = requests.get(
    url=f"{url}/api/dataset/v3",
    headers=auth_headers
).json()

# Loop through datasets and capture information from dataset schema
dataset_details_records = []
for dataset in datasets:
    duos_identifier = dataset["identifier"]

    # Fetch the registration record for this dataset
    duos_dict = requests.get(
        url=f"{url}/api/dataset/registration/{duos_identifier}",
        headers=auth_headers
    ).json()

    # Only the first consent group is exported; substitute a stub when the
    # registration has none so the lookups below still work.
    if not duos_dict.get("consentGroups"):
        duos_dict["consentGroups"] = [{"datasetId": None}]
    consent_group = duos_dict["consentGroups"][0]

    # Pull additional dataset details from DUOS if needed (to get data use info)
    duos_dataset_id = consent_group.get("datasetId")
    duos_data_use_dict = {}
    dac_id = ""
    if duos_dataset_id:
        dataset_details = requests.get(
            url=f"{url}/api/dataset/v2/{duos_dataset_id}",
            headers=auth_headers
        ).json()
        # "dataUse" may be absent or null; fall back to an empty dict so the
        # .get() fallbacks below do not raise AttributeError on None.
        duos_data_use_dict = dataset_details.get("dataUse") or {}
        dac_id = dataset_details.get("dacId") or ""

    # Format output (insertion order defines the DataFrame column order)
    final_results_dict = {"dacId": dac_id}
    for field in study_fields_before_nih_anvil_use:
        final_results_dict[field] = duos_dict.get(field)

    # Normalize nihAnvilUse to one of two fixed answers, keyed on whether the
    # registered value mentions "already".
    nih_anvil_use = duos_dict.get("nihAnvilUse")
    if nih_anvil_use and "already" in nih_anvil_use.lower():
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already"
    else:
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I do not have a dbGaP PHS ID"

    for field in study_fields_after_nih_anvil_use:
        final_results_dict[field] = duos_dict.get(field)
    for field in consent_group_fields_leading:
        final_results_dict[f"consentGroups.{field}"] = consent_group.get(field)

    # Data-use columns: prefer the registration's consent group value, then the
    # dataset's dataUse value, then the explicit default (where one is given).
    final_results_dict["consentGroups.generalResearchUse"] = coalesce(consent_group.get("generalResearchUse"), duos_data_use_dict.get("generalUse"), False)
    final_results_dict["consentGroups.hmb"] = coalesce(consent_group.get("hmb"), duos_data_use_dict.get("hmbResearch"), False)
    final_results_dict["consentGroups.diseaseSpecificUse"] = coalesce(consent_group.get("diseaseSpecificUse"), duos_data_use_dict.get("diseaseRestrictions"), [])
    final_results_dict["consentGroups.gs"] = coalesce(consent_group.get("gs"), duos_data_use_dict.get("geographicalRestrictions"))
    final_results_dict["consentGroups.poa"] = coalesce(consent_group.get("poa"), duos_data_use_dict.get("populationOriginsAncestry"), False)
    final_results_dict["consentGroups.nmds"] = coalesce(consent_group.get("nmds"), False)
    final_results_dict["consentGroups.gso"] = coalesce(consent_group.get("gso"), duos_data_use_dict.get("geneticStudiesOnly"), False)
    final_results_dict["consentGroups.pub"] = coalesce(consent_group.get("pub"), duos_data_use_dict.get("publicationResults"), False)
    final_results_dict["consentGroups.col"] = coalesce(consent_group.get("col"), duos_data_use_dict.get("collaboratorRequired"), False)
    final_results_dict["consentGroups.irb"] = coalesce(consent_group.get("irb"), duos_data_use_dict.get("ethicsApprovalRequired"), False)
    final_results_dict["consentGroups.npu"] = coalesce(consent_group.get("npu"), False)
    final_results_dict["consentGroups.otherPrimary"] = coalesce(consent_group.get("otherPrimary"), duos_data_use_dict.get("other"))

    for field in consent_group_fields_trailing:
        final_results_dict[f"consentGroups.{field}"] = consent_group.get(field)
    # fileTypes is stored as its Python string representation
    final_results_dict["consentGroups.fileTypes"] = str(consent_group.get("fileTypes"))
    dataset_details_records.append(final_results_dict)

# Display results
print("\nDUOS Datasets: ")
results_df = pd.DataFrame(dataset_details_records)
display(results_df)
    

# Troubleshooting

## TSV File Parsing

In [None]:
# Cloud location of the tab-separated metadata file to inspect
input_file = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/input/firecloud_target_tcga_metadata_20250227.tsv"

# Load the file into a list of per-row dicts, recording the outcome in results_log
results_log = []
print(f"Downloading input file {input_file}...")
try:
    input_df = pd.read_csv(input_file, sep="\t", encoding="unicode_escape")
    # Cast to object and blank out missing cells so every empty value becomes ""
    input_df = input_df.astype(object).where(input_df.notnull(), None)
    input_df.fillna("", inplace=True)
    input_dict = input_df.to_dict(orient="records")
except Exception as err:
    msg = f"Error downloading input file ({input_file}): {err}"
    results_log.append(["Input File Download", "Failed", msg])
    print(msg)
else:
    results_log.append(["Input File Download", "Succeeded", ""])

In [None]:
# Parse the first row's "consentGroups.fileTypes" cell as JSON
first_record = input_dict[0]
file_types_dict = json.loads(first_record["consentGroups.fileTypes"])

In [None]:
# Bare expression: shows the parsed value as this notebook cell's output
file_types_dict