# Imports and Common Functions

In [None]:
# !pip install --upgrade data_repo_client
# !pip install --upgrade xmltodict

In [1]:
# Imports
import requests
import json
import google.auth
import xmltodict
import data_repo_client
import pandas as pd
import re
from time import sleep
import ast

# Function to refresh TDR API client
def refresh_tdr_api_client():
    """Build a TDR API client that carries a freshly refreshed access token.

    Google application-default credentials are loaded and refreshed, and the
    resulting token is set on a ``data_repo_client`` configuration whose host
    is https://data.terra.bio.

    Returns:
        data_repo_client.ApiClient: client with client-side validation
        switched off.
    """
    # Import the transport submodule explicitly; the file-level
    # `import google.auth` is not guaranteed to load it.
    from google.auth.transport.requests import Request

    creds, _project = google.auth.default()
    creds.refresh(Request())

    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token

    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Function to pull existing AnVIL data from DUOS
def get_anvil_datasets_from_duos(duos_token, duos_env):
    """Collect the AnVIL datasets (and their studies) registered in DUOS.

    Every dataset returned by ``/api/dataset/v3`` is looked up individually
    via ``/api/dataset/v2/{dataset_id}``. A dataset is kept only when it has a
    study whose description contains the text "Platform: AnVIL".

    Args:
        duos_token: Bearer token sent in the Authorization header.
        duos_env: "prod" targets the production DUOS host; any other value
            targets the dev host.

    Returns:
        list[list]: one row per kept dataset, in this column order:
        study_id, study_name, study_phs, dataset_id, dataset_identifier,
        dataset_name, generalUse, hmbResearch, diseaseRestrictions,
        populationOriginsAncestry, ethicsApprovalRequired,
        collaboratorRequired, geographicalRestrictions, geneticStudiesOnly,
        publicationResults, methodsResearch, nonProfitUse, other, snapshot_id.
    """
    # Determine the target URL from the env variable
    if duos_env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"
    headers = {"Authorization": f"Bearer {duos_token}"}
    snapshot_url_prefix = "https://data.terra.bio/snapshots/"

    # Pull a list of existing AnVIL studies and datasets from DUOS
    results = []
    datasets = requests.get(url=f"{url}/api/dataset/v3", headers=headers).json()
    datasets_to_process = len(datasets)
    for datasets_processed, dataset_entry in enumerate(datasets, start=1):
        print(f"Processing dataset {datasets_processed} of {datasets_to_process}...")
        dataset_id = dataset_entry["dataset_id"]
        dataset_details = requests.get(
            url=f"{url}/api/dataset/v2/{dataset_id}",
            headers=headers
        ).json()

        # Only datasets attached to an AnVIL-platform study are of interest
        study = dataset_details.get("study")
        if not study:
            continue
        study_id = study["studyId"]
        description = study.get("description")
        if not description or "Platform: AnVIL" not in description:
            continue

        study_name = study["name"]
        study_phs = ""
        # Tolerate a study with no "properties" entry instead of raising
        for prop_entry in study.get("properties") or []:
            if prop_entry["key"] == "dbGaPPhsID":
                study_phs = prop_entry["value"]
                break
        dataset_name = dataset_details["name"]
        dataset_identifier = dataset_details["datasetIdentifier"]

        # A dataset without a "dataUse" object falls back to all defaults
        # rather than raising AttributeError on None
        data_use = dataset_details.get("dataUse") or {}
        du_gru = data_use.get("generalUse") or False
        du_hmb = data_use.get("hmbResearch") or False
        du_disease = data_use.get("diseaseRestrictions") or []
        du_poa = data_use.get("populationOriginsAncestry") or False
        du_ethics = data_use.get("ethicsApprovalRequired") or False
        du_collab = data_use.get("collaboratorRequired") or False
        du_geog = data_use.get("geographicalRestrictions") or ""
        du_genetic = data_use.get("geneticStudiesOnly") or False
        du_pub = data_use.get("publicationResults") or False
        du_nmds = data_use.get("methodsResearch") or False
        du_npu = data_use.get("nonProfitUse") or False
        du_other = data_use.get("other") or ""

        # The snapshot ID is whatever remains of a TDR snapshot URL once the
        # URL prefix is removed; it stays "" when no such URL property exists
        snapshot_id = ""
        for prop_entry in dataset_details.get("properties") or []:
            if prop_entry.get("propertyName") == "URL":
                snapshot_url = prop_entry.get("propertyValue")
                if snapshot_url and snapshot_url_prefix in snapshot_url:
                    snapshot_id = snapshot_url.replace(snapshot_url_prefix, "")

        results.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, du_gru, du_hmb, du_disease, du_poa, du_ethics, du_collab, du_geog, du_genetic, du_pub, du_nmds, du_npu, du_other, snapshot_id])

    # Return results
    return results

# Display options: render DataFrames without row/column/cell truncation
for _option_name, _option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(_option_name, _option_value)

# Step 0: Review Existing AnVIL DUOS Entries

In [2]:
#############################################
## Input Parameters
#############################################

# Token for use in DUOS (use gcloud auth print-access-token to get this)
duos_token = ""

# DUOS Environment
duos_env = "prod"

#############################################
## Execution
#############################################

# Fetch results
results = get_anvil_datasets_from_duos(duos_token, duos_env)

# Display results, ordered by study and then dataset
df_results = pd.DataFrame(results, columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "GRU", "HMB", "DS", "POA", "IRB", "COL", "GS", "GSO", "PUB", "NMDS", "NPU", "OTHER", "Snapshot ID"])
df_results_sorted = df_results.sort_values(by=["Study ID", "Dataset ID"], ascending=[True, True], ignore_index=True)
print("\nResults:")
# Show the sorted frame; previously the sort result was computed but never used
display(df_results_sorted)

Processing dataset 1 of 328...
Processing dataset 2 of 328...
Processing dataset 3 of 328...
Processing dataset 4 of 328...
Processing dataset 5 of 328...
Processing dataset 6 of 328...
Processing dataset 7 of 328...
Processing dataset 8 of 328...
Processing dataset 9 of 328...
Processing dataset 10 of 328...
Processing dataset 11 of 328...
Processing dataset 12 of 328...
Processing dataset 13 of 328...
Processing dataset 14 of 328...
Processing dataset 15 of 328...
Processing dataset 16 of 328...
Processing dataset 17 of 328...
Processing dataset 18 of 328...
Processing dataset 19 of 328...
Processing dataset 20 of 328...
Processing dataset 21 of 328...
Processing dataset 22 of 328...
Processing dataset 23 of 328...
Processing dataset 24 of 328...
Processing dataset 25 of 328...
Processing dataset 26 of 328...
Processing dataset 27 of 328...
Processing dataset 28 of 328...
Processing dataset 29 of 328...
Processing dataset 30 of 328...
Processing dataset 31 of 328...
Processing datase

Processing dataset 253 of 328...
Processing dataset 254 of 328...
Processing dataset 255 of 328...
Processing dataset 256 of 328...
Processing dataset 257 of 328...
Processing dataset 258 of 328...
Processing dataset 259 of 328...
Processing dataset 260 of 328...
Processing dataset 261 of 328...
Processing dataset 262 of 328...
Processing dataset 263 of 328...
Processing dataset 264 of 328...
Processing dataset 265 of 328...
Processing dataset 266 of 328...
Processing dataset 267 of 328...
Processing dataset 268 of 328...
Processing dataset 269 of 328...
Processing dataset 270 of 328...
Processing dataset 271 of 328...
Processing dataset 272 of 328...
Processing dataset 273 of 328...
Processing dataset 274 of 328...
Processing dataset 275 of 328...
Processing dataset 276 of 328...
Processing dataset 277 of 328...
Processing dataset 278 of 328...
Processing dataset 279 of 328...
Processing dataset 280 of 328...
Processing dataset 281 of 328...
Processing dataset 282 of 328...
Processing

Unnamed: 0,Study ID,Study Name,Study PHS,Dataset ID,Dataset Identifier,Dataset Name,GRU,HMB,DS,POA,IRB,COL,GS,GSO,PUB,NMDS,NPU,OTHER,Snapshot ID
0,43,A Genomic Atlas of Systemic Interindividual Epigenetic Variation in Humans (GTEx) (phs001746),phs001746,211,DUOS-000158,ANVIL_GTEx_BCM_GRU_CoRSIVs (GRU),True,False,[],False,False,False,,False,False,False,False,,c753046a-cf9b-4813-be68-cb3b9dd9866e
1,46,Broad Institute Center for Mendelian Genomics (phs001272),phs001272,230,DUOS-000177,ANVIL_CMG_Broad_Orphan_Manton_WES_20221117_ANV5_202304241513,True,False,[],False,False,False,,False,False,False,False,,
2,48,CCDG Neuropsychiatric: Autism Center of Excellence (ACE II) (phs002042),phs002042,233,DUOS-000180,ANVIL_CCDG_NYGC_NP_Autism_ACE2_DS_MDS_WGS (DS-ASD),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,False,False,,False,False,False,False,,0b31081f-1bce-490a-bd0a-b1aa0fd0daf6
3,49,CCDG Neuropsychiatric: Multimodal Developmental Neurogenetics of Females (phs002043),phs002043,234,DUOS-000181,ANVIL_CCDG_NYGC_NP_Autism_PELPHREY_ACE_DS_WGS (DS-AASD),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,False,False,,False,False,False,False,,553ba443-b8cc-4d8e-9743-e384116a1236
4,49,CCDG Neuropsychiatric: Multimodal Developmental Neurogenetics of Females (phs002043),phs002043,235,DUOS-000182,ANVIL_CCDG_NYGC_NP_Autism_PELPHREY_ACE_GRU_WGS (GRU),True,False,[],False,False,False,,False,False,False,False,,3fc2937c-dc08-400f-9458-3779de623bd0
5,50,CCDG- Neuropsychiatric: Autism- Study of Autism Genetics Exploration (SAGE) (phs001740),phs001740,236,DUOS-000183,ANVIL_CCDG_NYGC_NP_Autism_SAGE_WGS (DS-ASD-RD-IRB),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,True,False,,False,False,False,False,,bfab39d1-1a38-4884-a139-be2809378e7b
6,51,CCDG- Neuropsychiatric: Autism- The Autism Simplex Collection (TASC) (phs001741),phs001741,237,DUOS-000184,ANVIL_ccdg_nygc_np_autism_tasc_wgs (DS-ASD-IRB),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,True,False,,False,False,False,False,,99a1ace0-aa83-4d9d-9e9c-e9b6b0111ba2
7,52,CCDG-Neuropsychiatric: A Study of the Genetic Causes of Complex Pediatric Disorders (phs002004),phs002004,238,DUOS-000185,ANVIL_CCDG_NYGC_NP_Autism_CAG_DS_WGS (DS-AUT),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,False,False,,False,False,False,False,,76ec3691-30f3-43cd-af8b-e73c80da90b9
8,53,CCDG-Neuropsychiatric: Autism- Autism Genetic Resource Exchange (AGRE) (phs001766),phs001766,239,DUOS-000186,ANVIL_CCDG_NYGC_NP_Autism_AGRE_WGS (DS-ASD-IRB),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,True,False,,False,False,False,False,,e66e025f-e07c-4f0d-93ed-3ac609b570d5
9,54,CCDG-Neuropsychiatric: Victorian Collaborative AuTism Study (CATS) (phs002044),phs002044,240,DUOS-000187,ANVIL_CCDG_NYGC_NP_Autism_HFA_DS_WGS (DS-ASD-IRB),False,False,[http://purl.obolibrary.org/obo/DOID_0060041],False,True,False,,False,False,False,False,,62a4b183-9157-4320-96e6-32f79c561399


# Step 1: Collect Metadata for Review

In [10]:
#############################################
## Functions
#############################################

def coalesce(*arg):
    """Return the first argument that holds a usable value.

    Arguments are examined in order:
      * ``False`` and ``[]`` count as explicit values and are returned as-is.
      * Other falsy values (``None``, ``""``, ``0``) are skipped.
      * A non-empty list is returned with its ``None`` entries and
        placeholder entries removed; the result may be an empty list.
      * Any other truthy value is returned unless its upper-cased string form
        is a placeholder, in which case it is skipped.

    Placeholders are "", "NA", "N/A", "NONE", "TBD", "UNKNOWN" and
    "UNSPECIFIED", compared case-insensitively.

    Returns:
        The first usable value, or ``None`` when no argument qualifies.
    """
    placeholders = {"", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED"}
    for input_item in arg:
        if input_item is False or input_item == []:
            return input_item
        if not input_item:
            continue
        if isinstance(input_item, list):
            # str() keeps non-string elements (e.g. ints) from raising on
            # .upper(), mirroring how scalar values are handled below
            return [
                ele for ele in input_item
                if ele is not None and str(ele).upper() not in placeholders
            ]
        if str(input_item).upper() not in placeholders:
            return input_item
    return None

def format_description(input_string):
    """Normalize whitespace and relative dbGaP study links in a description.

    Args:
        input_string: description text; a falsy value is treated as "".

    Returns:
        str: the text with each "\\n\\n\\t" sequence and each remaining tab
        replaced by a single space, and with relative ``study.cgi?study_id=``
        or ``./study.cgi?study_id=`` links rewritten to the absolute
        ncbi.nlm.nih.gov URL.
    """
    output_string = input_string if input_string else ""
    output_string = output_string.replace("\n\n\t", " ")
    output_string = output_string.replace("\t", " ")
    # Raw string with escaped dots so only a literal "./" prefix is consumed.
    # The lookbehind skips links already preceded by "/", which would
    # otherwise be prefixed a second time and corrupted.
    output_string = re.sub(
        r"(?<!/)(?:\./)?study\.cgi\?study_id=",
        "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=",
        output_string,
    )
    return output_string

def format_phs_id(input_str):
    """Extract a dbGaP PHS identifier and normalize it to ``phsNNNNNN``.

    The first case-insensitive occurrence of "phs" followed by digits is
    located; leading zeros are dropped and the number is zero-padded back to
    at least six digits.

    Args:
        input_str: text that may contain a PHS ID (e.g. "PHS000007.v1.p1").

    Returns:
        str: the normalized ID, or "" when the input is not a string or
        contains no PHS ID.
    """
    # Explicit checks replace a bare `except:` that swallowed every exception
    if not isinstance(input_str, str):
        return ""
    match = re.search(r"phs0*([0-9]+)", input_str, re.IGNORECASE)
    if match is None:
        return ""
    return "phs" + match.group(1).zfill(6)

def try_join(l):
    """Render a list as a comma-separated string; pass anything else through.

    Each list element is converted with ``str`` and the pieces are joined
    with ", ". A non-list input, or a list whose conversion raises
    ``TypeError``, is returned unchanged.
    """
    if not isinstance(l, list):
        return l
    try:
        return ', '.join(str(entry) for entry in l)
    except TypeError:
        return l
    
def val_study_type_enum(l):
    """Return 1 when a non-empty study type is not an allowed value, else 0.

    Empty or missing values are not flagged.
    """
    allowed_study_types = (
        "Observational", "Interventional", "Descriptive", "Analytical",
        "Prospective", "Retrospective", "Case report", "Case series",
        "Cross-sectional", "Cohort study",
    )
    if not l:
        return 0
    return 0 if l in allowed_study_types else 1

def val_nih_inst_center_sub_enum(l):
    """Return 1 when a non-empty NIH institute/center code is not allowed, else 0.

    Empty or missing values are not flagged.
    """
    allowed_centers = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS",
        "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC",
        "NCATS", "NCCIH",
    )
    if not l:
        return 0
    return 0 if l in allowed_centers else 1

def val_nih_ic_supp_study_enum(l):
    """Return 1 when a list holds any disallowed NIH institute/center code, else 0.

    Only non-empty lists are checked; any other input yields 0.
    """
    allowed_centers = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS",
        "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC",
        "NCATS", "NCCIH",
    )
    if not l or not isinstance(l, list):
        return 0
    return 1 if any(code not in allowed_centers for code in l) else 0

def val_file_type_enum(l):
    """Return 1 when a list holds any disallowed file type, else 0.

    Only non-empty lists are checked; any other input yields 0.
    """
    allowed_file_types = ("Arrays", "Genome", "Exome", "Survey", "Phenotype")
    if not l or not isinstance(l, list):
        return 0
    return 1 if any(entry not in allowed_file_types for entry in l) else 0

def fetch_dataset_details(snapshot_id, ds_consent_map, duos_token, duos_env):
    
    # Initialize variables
    dataset_details_records = []

    # Determine the DUOS URL from the duos_env variable
    if duos_env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"

    # Build DUOS lookups
    print(f"Building DUOS dataset and study lookups...")
    study_lookup = {}
    dataset_lookup = []
    datasets = requests.get(
        url=f"{url}/api/dataset/v3",
        headers={"Authorization": f"Bearer {duos_token}"}
    ).json()
    study_ids_processed = set()
    for dataset_entry in datasets:
        dataset_id = dataset_entry.get("dataset_id")
        dataset_name = dataset_entry.get("dataset_name")
        identifier = dataset_entry.get("identifier")
        study_id = dataset_entry.get("study_id")
        try:
            base_consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', dataset_name).group(1)
        except:
            base_consent_group_name = dataset_name
        if study_id:
            # Build dataset lookup
            dataset_lookup.append({
                "dataset_id": dataset_id,
                "consent_group_name": dataset_name, 
                "base_consent_group_name": base_consent_group_name,
                "identifier": identifier,
                "study_id": study_id
            })
            # Build study lookup
            if study_id not in study_ids_processed:
                study_ids_processed.add(study_id)
                study_details = requests.get(
                    url=f"{url}/api/dataset/registration/{identifier}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
                study_desc = study_details.get("studyDescription")
                if study_desc and "Platform: AnVIL" in study_desc:
                    study_phs = study_details.get("dbGaPPhsID")
                    if study_phs:
                        id_in_lookup = study_lookup.get(study_phs)
                        if id_in_lookup and id_in_lookup != study_id:
                            print(f"Warning: PHS ID {study_phs} tied to multiple studies in DUOS: {id_in_lookup}, {study_id}. Please review.")
                        else:
                            study_lookup[study_phs] = study_id

    # Loop through and process snapshots
    for snapshot_id in snapshot_id_list:
        
        # Initialize variables
        terra_dict = {}
        dbgap_xml_dict = {}
        dbgap_study_api_dict = {}
        dbgap_fhir_dict = {}
        final_results_dict = {}
        snapshot_phs_id = ""

        # Retrieve snapshot details
        print(f"Processing snapshot_id: {snapshot_id}...")
        final_results_dict = {}
        api_client = refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        snapshot_details = {}
        while attempt_counter <= 2:
            try:
                snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
                break
            except:
                sleep(5)
                attempt_counter += 1  
        snapshot_name = snapshot_details["name"]
        dataset_id = snapshot_details["source"][0]["dataset"]["id"]
        snapshot_phs_id = format_phs_id(snapshot_details["source"][0]["dataset"]["phs_id"])
        if snapshot_details["source"][0]["dataset"]["secure_monitoring_enabled"] == True:
            access_management = "controlled"
        else:
            access_management = "open"
        if snapshot_details["source"][0]["dataset_properties"].get("source_workspaces"):  
            source_workspace = snapshot_details["source"][0]["dataset_properties"]["source_workspaces"][0]
        else:
            source_workspace = None
        if snapshot_details["source"][0]["dataset_properties"].get("consent_name"):
            snapshot_consent_code = snapshot_details["source"][0]["dataset_properties"]["consent_name"]
        else:
            if access_management == "open":
                snapshot_consent_code = "NRES"
            else:
                snapshot_consent_code = None
        if snapshot_details["duos_firecloud_group"] != None:
            duos_id = snapshot_details["duos_firecloud_group"]["duos_id"]
        else:
            duos_id = None
        try:
            base_consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', snapshot_name).group(1)
        except:
            base_consent_group_name = snapshot_name
        if access_management == "open":
            consent_group_name = base_consent_group_name + " (NRES)"
        elif snapshot_consent_code:
            consent_group_name = base_consent_group_name + f" ({snapshot_consent_code})"
        else:
            consent_group_name = base_consent_group_name
        
        print("\tSnapshot PHS_ID: " + str(snapshot_phs_id))
        print("\tSnapshot Consent Code: " + str(snapshot_consent_code))
        print("\tSource Workspace: " + str(source_workspace))
        print("\tDUOS ID: " + str(duos_id))
        print("\tConsent Group Name: " + str(consent_group_name))
        
        # Attempt to match to a DUOS ID based on consent group name
        match_duos_id = ""
        for dataset in dataset_lookup:
            if dataset["base_consent_group_name"] == base_consent_group_name or dataset["consent_group_name"] == consent_group_name:
                match_duos_id = dataset["identifier"]
                break
        match_study_id = ""
        if snapshot_phs_id:
            match_study_id = study_lookup.get(snapshot_phs_id)

        # If a snapshot or match DUOS ID is present, use this to build the final result dictionary
        if duos_id or match_duos_id:

            # Pull existing DUOS study registration
            duos_id_to_use = coalesce(duos_id, match_duos_id)
            duos_dict = {}
            duos_dict = requests.get(
                url=f"{url}/api/dataset/registration/{duos_id_to_use}",
                headers={"Authorization": f"Bearer {duos_token}"}
            ).json()
            #print(duos_dict)

            # Pull dataset details from DUOS (to get data use info) 
            if not duos_dict.get("consentGroups"):
                duos_dict["consentGroups"] = [{"datasetId": None}]
            duos_dataset_id = duos_dict["consentGroups"][0].get("datasetId")
            duos_data_use_dict = {}
            if duos_dataset_id:
                dataset_details = requests.get(
                    url=f"{url}/api/dataset/v2/{duos_dataset_id}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
                duos_data_use_dict = dataset_details.get("dataUse")

            # Build final results dictionary
            if snapshot_consent_code:
                consent_code = snapshot_consent_code.upper().replace("_", "-")
            else:
                consent_code = ""
            final_results_dict["snapshot_id"] = snapshot_id
            final_results_dict["snapshot_phs_id"] = snapshot_phs_id
            final_results_dict["snapshot_duos_id"] = duos_id
            final_results_dict["match_duos_id"] = match_duos_id
            final_results_dict["match_study_id"] = match_study_id
            studyName = duos_dict.get("studyName")
            dbGaP_study_name = duos_dict.get("dbGaPStudyRegistrationName")
            if snapshot_phs_id and studyName and f" ({snapshot_phs_id})" not in studyName:
                final_results_dict["studyName"] = studyName + f" ({snapshot_phs_id})"
            else:
                final_results_dict["studyName"] = studyName
            final_results_dict["studyType"] = duos_dict.get("studyType")
            final_results_dict["studyDescription"] = duos_dict.get("studyDescription")
            final_results_dict["dataTypes"] = duos_dict.get("dataTypes")
            final_results_dict["phenotypeIndication"] = duos_dict.get("phenotypeIndication")
            final_results_dict["species"] = duos_dict.get("species")
            final_results_dict["piName"] = duos_dict.get("piName")
            final_results_dict["dataCustodianEmail"] = duos_dict.get("dataCustodianEmail")
            final_results_dict["publicVisibility"] = duos_dict.get("publicVisibility")
            final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if duos_dict.get("nihAnvilUse") and 'already' in duos_dict.get("nihAnvilUse").lower() else "I am NHGRI funded and I do not have a dbGaP PHS ID"
            final_results_dict["submittingToAnvil"] = duos_dict.get("submittingToAnvil")
            if snapshot_phs_id:
                final_results_dict["dbGaPPhsID"] = snapshot_phs_id
            else:
                final_results_dict["dbGaPPhsID"] = duos_dict.get("dbGaPPhsID")
            if snapshot_phs_id and dbGaP_study_name and f" ({snapshot_phs_id})" in dbGaP_study_name:
                final_results_dict["dbGaPStudyRegistrationName"] = dbGaP_study_name.replace(f" ({snapshot_phs_id})", "")
            else:
                final_results_dict["dbGaPStudyRegistrationName"] = duos_dict.get("dbGaPStudyRegistrationName")
            final_results_dict["embargoReleaseDate"] = duos_dict.get("embargoReleaseDate")
            final_results_dict["sequencingCenter"] = duos_dict.get("sequencingCenter")
            final_results_dict["piEmail"] = duos_dict.get("piEmail")
            final_results_dict["piInstitution"] = duos_dict.get("piInstitution")
            final_results_dict["nihGrantContractNumber"] = duos_dict.get("nihGrantContractNumber")
            final_results_dict["nihICsSupportingStudy"] = duos_dict.get("nihICsSupportingStudy")
            final_results_dict["nihProgramOfficerName"] = duos_dict.get("nihProgramOfficerName")
            final_results_dict["nihInstitutionCenterSubmission"] = duos_dict.get("nihInstitutionCenterSubmission")
            final_results_dict["nihInstitutionalCertificationFileName"] = duos_dict.get("nihInstitutionalCertificationFileName")
            final_results_dict["nihGenomicProgramAdministratorName"] = duos_dict.get("nihGenomicProgramAdministratorName")
            final_results_dict["multiCenterStudy"] = duos_dict.get("multiCenterStudy")
            final_results_dict["collaboratingSites"] = duos_dict.get("collaboratingSites")
            final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSR")
            final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation")
            final_results_dict["alternativeDataSharingPlan"] = duos_dict.get("alternativeDataSharingPlan")
            final_results_dict["alternativeDataSharingPlanReasons"] = duos_dict.get("alternativeDataSharingPlanReasons")
            final_results_dict["alternativeDataSharingPlanExplanation"] = duos_dict.get("alternativeDataSharingPlanExplanation")
            final_results_dict["alternativeDataSharingPlanFileName"] = duos_dict.get("alternativeDataSharingPlanFileName")
            final_results_dict["alternativeDataSharingPlanDataSubmitted"] = duos_dict.get("alternativeDataSharingPlanDataSubmitted")
            final_results_dict["alternativeDataSharingPlanDataReleased"] = duos_dict.get("alternativeDataSharingPlanDataReleased")
            final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = duos_dict.get("alternativeDataSharingPlanTargetDeliveryDate")
            final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = duos_dict.get("alternativeDataSharingPlanTargetPublicReleaseDate")
            final_results_dict["alternativeDataSharingPlanAccessManagement"] = duos_dict.get("alternativeDataSharingPlanAccessManagement")
            final_results_dict["consentGroups.consentGroupName"] = consent_group_name
            final_results_dict["consentGroups.accessManagement"] = access_management
            final_results_dict["consentGroups.numberOfParticipants"] = duos_dict["consentGroups"][0].get("numberOfParticipants")
            final_results_dict["consentCode"] = consent_code
            final_results_dict["consentGroups.generalResearchUse"] = coalesce(duos_dict["consentGroups"][0].get("generalResearchUse"), duos_data_use_dict.get("generalUse"), False)
            final_results_dict["consentGroups.hmb"] = coalesce(duos_dict["consentGroups"][0].get("hmb"), duos_data_use_dict.get("hmbResearch"), False)
            final_results_dict["consentGroups.diseaseSpecificUse"] = coalesce(duos_dict["consentGroups"][0].get("diseaseSpecificUse"), duos_data_use_dict.get("diseaseRestrictions"), [])
            final_results_dict["consentGroups.gs"] = coalesce(duos_dict["consentGroups"][0].get("gs"), duos_data_use_dict.get("geographicalRestrictions"))
            final_results_dict["consentGroups.poa"] = coalesce(duos_dict["consentGroups"][0].get("poa"), duos_data_use_dict.get("populationOriginsAncestry"), False)
            final_results_dict["consentGroups.nmds"] = coalesce(duos_dict["consentGroups"][0].get("nmds"), False)
            final_results_dict["consentGroups.gso"] = coalesce(duos_dict["consentGroups"][0].get("gso"), duos_data_use_dict.get("geneticStudiesOnly"), False)
            final_results_dict["consentGroups.pub"] = coalesce(duos_dict["consentGroups"][0].get("pub"), duos_data_use_dict.get("publicationResults"), False)
            final_results_dict["consentGroups.col"] = coalesce(duos_dict["consentGroups"][0].get("col"), duos_data_use_dict.get("collaboratorRequired"), False)
            final_results_dict["consentGroups.irb"] = coalesce(duos_dict["consentGroups"][0].get("irb"), duos_data_use_dict.get("ethicsApprovalRequired"), False)
            final_results_dict["consentGroups.npu"] = coalesce(duos_dict["consentGroups"][0].get("npu"), False)
            final_results_dict["consentGroups.otherPrimary"] = coalesce(duos_dict["consentGroups"][0].get("otherPrimary"), duos_data_use_dict.get("other"))
            final_results_dict["consentGroups.otherSecondary"] = duos_dict["consentGroups"][0].get("otherSecondary")
            final_results_dict["consentGroups.mor"] = duos_dict["consentGroups"][0].get("mor")
            final_results_dict["consentGroups.morDate"] = duos_dict["consentGroups"][0].get("morDate")
            final_results_dict["consentGroups.dataLocation"] = "TDR Location"
            final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
            if duos_dict["consentGroups"][0].get("fileTypes") and duos_dict["consentGroups"][0]["fileTypes"].get("fileType"):
                final_results_dict["consentGroups.fileTypes.fileType"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("fileType")
            else:
                final_results_dict["consentGroups.fileTypes.fileType"] = None
            if duos_dict["consentGroups"][0].get("fileTypes") and duos_dict["consentGroups"][0]["fileTypes"].get("functionalEquivalence"):
                final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("functionalEquivalence")
            else:
                final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
            collab_site = duos_dict.get("collaboratingSites")
            if collab_site:
                final_results_dict["consortium"] = collab_site[0]
            else:
                final_results_dict["consortium"] = None
            dataset_details_records.append(final_results_dict)
            continue

        # Pull information from original workspace (if listed)
        workspace_phs_id = ""
        if source_workspace:
            # Establish credentials
            creds, project = google.auth.default()
            auth_req = google.auth.transport.requests.Request()
            creds.refresh(auth_req)

            # Pull workspace attributes
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    ws_attributes = requests.get(
                        url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                        headers={"Authorization": f"Bearer {creds.token}"}
                    ).json()
                    break
                except:
                    sleep(5)
                    attempt_counter += 1

            # Map to schema
            if ws_attributes.get("workspace"):
                terra_dict["studyName"] = coalesce(ws_attributes["workspace"]["attributes"].get("library:projectName"), source_workspace) 
                terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
                terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
                if ws_attributes["workspace"]["attributes"].get("library:dataCategory"):
                    terra_dict["dataTypes"] = []
                    for item in ws_attributes["workspace"]["attributes"]["library:dataCategory"]["items"]:
                        inner_list = item.split(",")
                        for inner_item in inner_list:
                            inner_item = inner_item.replace("'", "").strip()
                            terra_dict["dataTypes"].append(inner_item)
                terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
                terra_dict["species"] = "Homo sapiens"
                terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
                terra_dict["dataCustodianEmail"] = [ws_attributes["workspace"]["attributes"].get("library:contactEmail")]
                if ws_attributes["workspace"]["attributes"].get("tag:tags"):
                    for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
                        if "Consortium:" in tag:
                            terra_dict["consortium"] = tag.split(":")[1].strip()
                        elif "dbGaP:" in tag:
                            terra_dict["dbGaPPhsID"] = format_phs_id(tag.split(":")[1].strip())
                            if not snapshot_phs_id:
                                workspace_phs_id = format_phs_id(tag.split(":")[1].strip()) 
                                print(f"Warning: PHS ID ({workspace_phs_id}) found on workspace but not snapshot! Please resolve.")
                terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"].get("library:dataUseRestriction")
                if ws_attributes["workspace"]["attributes"].get("library:datatype"):
                    terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]
                if ws_attributes["workspace"]["attributes"].get("library:numSubjects"):
                    terra_dict["consentGroups.numberOfParticipants"] = ws_attributes["workspace"]["attributes"]["library:numSubjects"]
        #         print("------------------------------------------------------")
        #         print("terra_dict")
        #         print(terra_dict)

        # Pull study information from DUOS (if matched to DUOS Study based on PHS ID)
        if not match_study_id and workspace_phs_id:
            match_study_id = study_lookup.get(workspace_phs_id)
        duos_study_dict = {}
        if match_study_id:
            duos_study_dict = requests.get(
                    url=f"{url}/api/dataset/study/registration/{match_study_id}",
                    headers={"Authorization": f"Bearer {duos_token}"}
                ).json()
            collab_site = duos_study_dict.get("collaboratingSites")
            if collab_site:
                duos_study_dict["consortium"] = collab_site[0]
        
        # Pull information from dbGaP (if phs_id listed)
        dbgap_phs_id = coalesce(snapshot_phs_id, workspace_phs_id)
        if dbgap_phs_id:
            # Pull and parse XML
            phs_short = dbgap_phs_id.replace("phs", "")
            dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_url)
                    xml_data = xmltodict.parse(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1
            study_uid = ""

            # Map to schema
            if xml_data["dbgapss"].get("Study"):
                if isinstance(xml_data["dbgapss"]["Study"], list):
                    study_data = xml_data["dbgapss"]["Study"][0]
                else:
                    study_data = xml_data["dbgapss"]["Study"] 
                study_uid = study_data.get("@uid")
                dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
                dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
                dbgap_xml_dict["dbGaPPhsID"] = dbgap_phs_id
                dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
                if study_data["Authority"]["Persons"].get("Person"):
                    for ap_entry in study_data["Authority"]["Persons"]["Person"]:
                        if ap_entry["Role"] == "PI":
                            dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                            dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                            dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                        elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                            dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                        elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                            dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                ic_list = []
                if isinstance(study_data["Authority"]["ICs"]["IC"], list):
                    for ic_entry in study_data["Authority"]["ICs"]["IC"]:
                        ic_list.append(ic_entry["@name"])
                else:
                    ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
                dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
                dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
                dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
        #             print("------------------------------------------------------")
        #             print("dbgap_xml_dict")
        #             print(dbgap_xml_dict)

            # Pull and parse Study API JSON
            if study_uid:
                dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
                attempt_counter = 0
                while attempt_counter <= 2:
                    try:
                        response = requests.get(url=dbgap_study_url)
                        study_api_data = json.loads(response.text)
                        break
                    except:
                        sleep(5)
                        attempt_counter += 1

                # Map to schema
                if study_api_data.get("error") == None:
                    dbgap_study_api_dict["studyName"] = study_api_data["data"].get("report_name")
                    dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
                    dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
                    dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
                    dbgap_study_api_dict["dbGaPPhsID"] = dbgap_phs_id
                    dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_data["data"].get("report_name")
                    for attr_entry in study_api_data["data"].get("attribution"):
                        if attr_entry.get("title") == "Principal Investigator":
                            dbgap_study_api_dict["piName"] = attr_entry.get("name")
                            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                            break
        #             print("------------------------------------------------------")
        #             print("dbgap_study_api_dict")
        #             print(dbgap_study_api_dict)

            # Pull and parse FHIR API JSON
            dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + dbgap_phs_id
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_fhir_url)
                    fhir_data = json.loads(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1

            # Map to schema
            if fhir_data.get("entry"):
                dbgap_fhir_dict["studyName"] = fhir_data["entry"][0]["resource"].get("title")
                dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
                dbgap_fhir_dict["dbGaPPhsID"] = dbgap_phs_id
                dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
                # NIH ICs
                if "Organization/" in fhir_data["entry"][0]["resource"]["sponsor"].get("reference"):
                    dbgap_fhir_dict["nihICsSupportingStudy"] = [fhir_data["entry"][0]["resource"]["sponsor"].get("reference")[13:]]
                else:
                    ic_display = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
                    if ic_display == "National Human Genome Research Institute":
                        dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
                    else:
                        dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
                # studyType
                if fhir_data["entry"][0]["resource"].get("category"):
                    for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
                        if cat_entry.get("coding"):
                            for coding_entry in cat_entry.get("coding"):
                                if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                                    value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                    if dbgap_fhir_dict.get("studyType") and value:
                                        dbgap_fhir_dict["studyType"] += f", {value}"
                                    elif value:
                                        dbgap_fhir_dict["studyType"] = value
                # dataTypes
                dt_list = []
                if fhir_data["entry"][0]["resource"].get("extension"): 
                    for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                        if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                            for inner_ext_entry in ext_entry.get("extension"):
                                if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                                    for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                                        dt_list.append(coding_entry.get("code"))
                dbgap_fhir_dict["dataTypes"] = dt_list
                # phenotypeIndication
                if fhir_data["entry"][0]["resource"].get("focus"):
                    for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
                        if focus_entry.get("coding"):
                            for coding_entry in focus_entry.get("coding"):
                                value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                if dbgap_fhir_dict.get("phenotypeIndication") and value:
                                    dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                                elif value:
                                    dbgap_fhir_dict["phenotypeIndication"] = value
                # numberOfParticipants
                if fhir_data["entry"][0]["resource"].get("extension"):
                    for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                        if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                            for inner_ext_entry in ext_entry.get("extension"):
                                if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                                    dbgap_fhir_dict["consentGroups.numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")
        #         print("------------------------------------------------------")
        #         print("dbgap_fhir_dict")
        #         print(dbgap_fhir_dict)

        # Reconcile information and create final results
        consent_code = coalesce(snapshot_consent_code, terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
        if consent_code:
            consent_code = consent_code.upper().replace("_", "-")
        else:
            consent_code = ""
        consortium = coalesce(duos_study_dict.get("consortium"), terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
        dbGaPPhsID = coalesce(duos_study_dict.get("dbGaPPhsID"), dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
        studyName = coalesce(duos_study_dict.get("studyName"), dbgap_fhir_dict.get("studyName"), dbgap_xml_dict.get("studyName"), dbgap_study_api_dict.get("studyName"), terra_dict.get("studyName"))
        dbGaPStudyRegistrationName = coalesce(duos_study_dict.get("dbGaPStudyRegistrationName"), dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
        if dbGaPPhsID and consent_code:
            study_consent = dbGaPPhsID + ":" + consent_code
            purl_doid = ds_consent_map.get(study_consent)
            if purl_doid:
                if not isinstance(purl_doid, list):
                    purl_doid = [purl_doid]
            else:
                purl_doid = []
        else:
            purl_doid = []
        final_results_dict["snapshot_id"] = snapshot_id
        final_results_dict["snapshot_phs_id"] = snapshot_phs_id
        final_results_dict["snapshot_duos_id"] = duos_id
        final_results_dict["match_duos_id"] = match_duos_id
        final_results_dict["match_study_id"] = match_study_id
        if dbGaPPhsID and studyName and f" ({dbGaPPhsID})" not in studyName:
            final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
        else:
            final_results_dict["studyName"] = studyName
        final_results_dict["studyType"] = coalesce(duos_study_dict.get("studyType"), dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"), terra_dict.get("studyType"))
        final_results_dict["studyDescription"] = format_description(coalesce(duos_study_dict.get("studyDescription"), dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
        if final_results_dict["studyDescription"]:
            if "Platform: AnVIL" not in final_results_dict["studyDescription"]:
                final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
        else:
            final_results_dict["studyDescription"] = "Platform: AnVIL"
        final_results_dict["dataTypes"] = coalesce(duos_study_dict.get("dataTypes"), terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
        final_results_dict["phenotypeIndication"] = coalesce(duos_study_dict.get("phenotypeIndication"), terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
        final_results_dict["species"] = "Human"
        final_results_dict["piName"] = coalesce(duos_study_dict.get("piName"), dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"), "None")
        final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
        final_results_dict["publicVisibility"] = True
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = True
        final_results_dict["dbGaPPhsID"] = dbGaPPhsID
        if dbGaPPhsID and dbGaPStudyRegistrationName and f" ({dbGaPPhsID})" in dbGaPStudyRegistrationName:
            final_results_dict["dbGaPStudyRegistrationName"] = dbGaPStudyRegistrationName.replace(f" ({dbGaPPhsID})", "")
        else:
            final_results_dict["dbGaPStudyRegistrationName"] = dbGaPStudyRegistrationName
        final_results_dict["embargoReleaseDate"] = coalesce(duos_study_dict.get("embargoReleaseDate"), dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
        final_results_dict["sequencingCenter"] = None
        final_results_dict["piEmail"] = coalesce(duos_study_dict.get("piEmail"), dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
        final_results_dict["piInstitution"] = coalesce(duos_study_dict.get("piInstitution"), dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
        final_results_dict["nihGrantContractNumber"] = None
        final_results_dict["nihICsSupportingStudy"] = coalesce(duos_study_dict.get("nihICsSupportingStudy"), dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
        final_results_dict["nihProgramOfficerName"] = coalesce(duos_study_dict.get("nihProgramOfficerName"), dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
        final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
        final_results_dict["nihInstitutionalCertificationFileName"] = None
        final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(duos_study_dict.get("nihGenomicProgramAdministratorName"), dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
        final_results_dict["multiCenterStudy"] = None
        final_results_dict["collaboratingSites"] = [consortium] if consortium else []
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
        final_results_dict["alternativeDataSharingPlan"] = False
        final_results_dict["alternativeDataSharingPlanReasons"] = []
        final_results_dict["alternativeDataSharingPlanExplanation"] = None
        final_results_dict["alternativeDataSharingPlanFileName"] = None
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
        final_results_dict["alternativeDataSharingPlanDataReleased"] = None
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
        final_results_dict["consentGroups.consentGroupName"] = consent_group_name
        final_results_dict["consentGroups.accessManagement"] = access_management
        final_results_dict["consentGroups.numberOfParticipants"] = coalesce(terra_dict.get("consentGroups.numberOfParticipants"), dbgap_fhir_dict.get("consentGroups.numberOfParticipants"), dbgap_xml_dict.get("consentGroups.numberOfParticipants"), dbgap_study_api_dict.get("consentGroups.numberOfParticipants"), "0")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = True if access_management == "controlled" and "GRU" in consent_code else False
        final_results_dict["consentGroups.hmb"] = True if access_management == "controlled" and "HMB" in consent_code else False
        if purl_doid:
            final_results_dict["consentGroups.diseaseSpecificUse"] = purl_doid
        else:
            final_results_dict["consentGroups.diseaseSpecificUse"] = []
        final_results_dict["consentGroups.gs"] = consent_code if access_management == "controlled" and "GS-" in consent_code else None
        final_results_dict["consentGroups.poa"] = True if access_management == "controlled" and "POA" in consent_code else False
        final_results_dict["consentGroups.nmds"] = True if access_management == "controlled" and "NMDS" in consent_code else False
        final_results_dict["consentGroups.gso"] = True if access_management == "controlled" and "GSO" in consent_code else False
        final_results_dict["consentGroups.pub"] = True if access_management == "controlled" and "PUB" in consent_code else False 
        final_results_dict["consentGroups.col"] = True if access_management == "controlled" and "COL-" in consent_code else False
        final_results_dict["consentGroups.irb"] = True if access_management == "controlled" and "IRB" in consent_code else False
        final_results_dict["consentGroups.npu"] = True if access_management == "controlled" and "NPU" in consent_code else False
        final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and access_management == "controlled" and not (final_results_dict["consentGroups.generalResearchUse"] or final_results_dict["consentGroups.hmb"] or final_results_dict["consentGroups.diseaseSpecificUse"] or final_results_dict["consentGroups.gs"] or final_results_dict["consentGroups.poa"] or final_results_dict["consentGroups.nmds"] or final_results_dict["consentGroups.gso"] or final_results_dict["consentGroups.pub"] or final_results_dict["consentGroups.col"] or final_results_dict["consentGroups.irb"] or final_results_dict["consentGroups.npu"])) else None
        final_results_dict["consentGroups.otherSecondary"] = None
        final_results_dict["consentGroups.mor"] = None
        final_results_dict["consentGroups.morDate"] = None
        final_results_dict["consentGroups.dataLocation"] = "TDR Location"
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
        final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        final_results_dict["consortium"] = consortium
        dataset_details_records.append(final_results_dict)

    # Return results
    return dataset_details_records


#############################################
## Input Parameters
#############################################

# Specify the snapshots to pull data for:
snapshot_id_list = [
    'c53121e7-5464-4b08-9a79-9c3a435d5b39',
    '2429f987-345e-43f0-aae1-a4752a925066',
    'dd292226-35c4-4416-9fa7-5ee42a9b0441',
    'a37d9def-52ca-488e-9468-8e2e211fb3d5',
    '216ba0eb-9446-4538-af47-dd15c117b56c',
    '51620360-86eb-437f-b3dc-b9abd9eef4cc',
    'd771bc68-4ac4-4ed6-abcc-8269a16c7121',
    '3b5c564e-a310-4cda-b8e6-6b68f41a6f86',
    '32252585-907e-4e7d-ab50-8bc7e5eefcba',
    '8765e5c4-2b1b-4f5b-aa20-e877fb41295d',
    '37c0a1a2-ae3e-4573-a4b4-16c2e3228e09',
    '7609860a-bee5-41c5-b5a2-032ce367f44c',
    '944c0080-d5e3-4e9a-b418-9b59e8a8dd00',
    '5ff335b6-8bb6-4205-a8ee-e28d75f4ee4c',
    '374120e5-dff0-43dc-ae22-10d4e59e505f',
    '59cb4733-11d6-4a01-adaa-590510b6b1b8',
    'eb67973a-5186-44e9-8777-00e6471a23c7',
    'ccfec783-8b2f-4ef5-9dfc-c71a2de5b966',
    '3013571a-9ecf-4758-9e73-f6877d505d1d',
    '48390abd-821d-4af1-b563-a99b38e260da',
    '936af9f9-0421-4dc4-8646-2d7463200a06',
    'fbdb8c08-0d94-4db2-9dd1-1f48ed0e72b2',
    '2cc5b3db-286a-4160-a746-e4ea7bc6d4c4',
    '99a0c351-1533-4a50-bb13-4b75923080cc',
    '119d4480-b12f-4939-994c-40b249cb3ce4',
    '5a8c4d8a-d0f2-4717-81bc-28263e742cd7',
    '31506ce4-f1bf-44c3-9b19-f23d065dc136',
    'e1e25d3e-cad2-43ef-96c4-e741fdc8c9a6',
    '1624cf29-2dad-4f12-8146-4e8d91ec6c81',
    'c7f980b3-1dd6-4edf-9412-7c72b89ec3c2',
    '40eeef36-5305-4ca5-8f2b-cfe163de02d8',
    'b6baee06-c290-4848-9a4e-73fd765895e3',
    '7d21a199-9b42-488f-b371-eb8c04e913e3',
    'ecd6e2db-58f2-4b6b-961b-977041c29399',
    'b4f7c49a-e0a9-48e2-9981-457e75bda3fd',
    '88276cb9-cfe6-40af-bd85-52e19dcffb8c',
    '5e050276-a987-40c6-8dc2-46124ddc1a64',
    '410561ee-02b0-4dfe-b7ec-58d5b1cca4d1',
    '681d65dd-247d-4a38-a1a3-9c62551985a2',
    'c3fbbbe9-50ce-4723-a555-72f1bbef984d',
    'aa5caf41-21e6-4e84-a046-cdb9b1bd9e62',
    'b25bf30f-7ab1-4eb7-a740-dbb637f1f0e8',
    'acf0ed61-3fa3-4fc8-bee1-edacc169d36c',
    'a20e2c68-19f1-4217-8752-c822d51c6ede',
    'c110840b-4fb7-4ac6-963d-ccac0e443191',
    '6f15ad4d-2c42-4512-9c5b-e29d3dbf60e4',
    '93097bca-33d7-469c-b12e-0a2157c9eb84',
    '4f7c04bb-3ea7-4780-ad57-e7b19fe3d851',
    '995972f7-5477-4414-84cd-4de3565f0c97',
    'f7de2c35-eb9e-4727-afe2-f5756239e64b',
    '414103f1-3cec-4e80-bfd8-046e224c5988',
    '30613a35-0844-4d31-bbe3-94902f45cf96',
    '1db0f331-6207-40b9-89b8-9cbfb4fbdad2',
    'e5b2ba63-9978-4671-b939-dc115ca3c665',
    '6789dbc7-abf0-4291-a014-ccbf323e308c',
    '02e352b1-ab59-4cda-a2ba-cf9a960552b2',
    'c25614e5-c350-4aaa-8180-32151e259d87',
    'a72f9ce4-cfcc-4701-ab9d-67df41936da8',
    'e1e59839-9307-4c5f-906b-036eecc6197e',
    'ce3b35d7-9805-401c-a36e-d118d8fa4a1e',
    '236ff37d-335c-4561-80f6-4ffa8bd88b2f',
    '024cbb4f-c989-4ba4-b33d-a53790e3d6d7',
    '5a42e770-ed61-4a5b-b43c-cbf3a4744733',
    '7f1126cd-3dcb-4b7d-8d32-dfad3c43e9e2',
    '99f9f7aa-535c-44cb-b6f9-df8f479eb80a',
    'b5c0b07a-5653-4912-944f-80069cbf8360',
    '982af1af-7098-4274-ac33-9bb7b78d5880',
    '601f2346-7a1a-4b09-865f-764fb92e60a9',
    '4487c14a-49e0-4185-b3d8-4a40f5fa9721',
    '77590745-7d3d-4499-bb46-9a433d76a1cb',
    '8bbe3019-638a-4497-96b0-fc9fbd48ce6b',
    'a56397ee-005b-411d-9129-eb978f035de7',
    'e10e798c-2262-4a40-9942-c9dcfcc97aca',
    '1674e6c7-f3de-4cea-985d-a8c5b520f8e7',
    '6eda136b-5141-4cc7-a752-a757e06eb179',
    'ba2414b4-2854-4801-9719-6ee91971aba3',
    '8fa1f61c-a63f-432c-a623-dc939316d482',
    '3f5ef236-6b19-4e9f-9fb5-30dc8d9f6be6',
    'ba05015c-d9a5-40ea-9b9f-4ccb608a1233',
    '4017f0b6-5228-442c-82e6-eb449a6d9804',
    '0f37bf7e-230a-4482-8342-ed5e7333026d',
    'bd725fc0-94f0-48c4-952c-d4752e950b47',
    '9658265c-56f0-4fa4-bf6e-c64ea225d7a2',
    'f0d50b6b-f225-4989-8aa1-a9e9a441070d',
    'aa6b58c2-6eb3-4b4d-9e73-89cbb323ee26',
    'b0595994-56fc-45ee-bd2b-f497231dd79b',
    'feb70203-2abc-41ab-bfaa-3bf7cfa12606',
    '3fdc6bde-1142-41bf-a16b-76b68051837b',
    '08e33fe7-4757-4737-b0e0-227207f98ff5',
    'd42322cb-d761-4dad-8814-a96555164400',
    'cf30be0a-1a09-4a01-9a8a-35474a92aa71',
    '14e2cc61-7a85-400b-bdfd-729c7950592c',
    'b6f69c83-d069-417b-a6e3-b3fb5fdd21a6',
    'aff867bd-ac09-4dbc-9031-a145fcc36a5c',
    '613a8d12-7ac3-4d9d-996a-0f1bd2318b69',
    'ae5f32ce-35fe-498a-944c-b6b647570a93',
    'e1e6f206-0466-41ae-8603-edc210f2d448',
    'bf8681db-72bb-45c2-a9ca-64d418464ae2',
    'bffe9c59-8da4-407c-ab88-f7a394d24476',
    'a3665f39-21c3-4016-8bbe-e5dd96df6af0',
    '0c90a04d-719f-46cc-bfaa-8d209ff49dcd',
    'a3b32c40-6c50-49c8-8961-24abeffff0d5',
    'd72bad95-9750-4802-8d25-05f5bd943519',
    'f39bb497-1ac2-4017-b8c4-3ca223d57b61',
    '13569ac7-87e5-40d7-8acf-0574a18d1b09',
    'ccb2ab9f-a03c-40ef-a6c9-13fcac52e619',
    '824afdf1-50d9-462f-9f09-db5a1f646bd8',
    'b9314197-1618-4dd7-8441-38dfb1490389',
    '761e172c-f530-4154-b5b6-a1c52b0530e6',
    'e1c34b81-2435-4c12-87d7-3f995cfd4a0a',
    'cab35bdd-4b15-4836-8470-b922d5761602',
    'e62109c9-3f4c-4da4-8382-ae4d5a82bb1c',
    'c0c0d1fe-9d5d-4b84-911c-e74f34b2edb6',
    'c9d30b32-ae82-475e-a8bc-d88e0c489aee',
    '132e9681-5ee5-4b17-99b9-e444aa3eb658',
    '5925e258-4864-4c0b-9b17-03f25a02ef4c',
    '8aab2cf0-95fb-4b67-8da2-1c0c20be63c8',
    '376ff43a-6a26-4d95-b4c3-72c63d387349',
    '7075a479-b7a4-44fd-a140-8431e438b193',
    'feb8ba77-9077-448a-b8fd-0e1549ee70e4',
    '6bcb6897-c907-46aa-95d0-bfa417e27e45',
    '8d07bc3b-e043-49d4-b802-9f174fb98e77',
    'a9df5cdd-c041-4e74-9ee6-85d744b8fc6b',
    'a0b58558-82ff-4de2-835f-5f0af4ca29ec',
    '7d08223e-7dcf-4eac-9c67-9e4a619ff783',
    '3aec0ff3-cd28-4aa9-a47a-d9ab047285c9',
    '0c40923c-e4eb-45b8-9183-137e24eebc35',
    'fe3b687b-0462-4a02-941b-92ac61a65bb4',
    '3a15ecc8-d81d-4b11-9163-d94d63657000',
    '84ecc954-e50d-418e-aa92-94a933dc4bb0',
    'a7d82508-b20b-4190-96e6-210cd63bfbbf',
    '562943e3-55be-4df4-8c8b-8eade170f5f3',
    '660e4508-8f0d-4325-bace-40a19b2d53ea',
    'f4db72e3-3320-4bcd-8dc4-b3d864992fcc',
    '48f8e9c7-c4b6-40cc-88cb-67e999fd4947',
    '32077f3d-92ca-4d47-a6d5-2e06f1c3036f',
    '77863e88-84f0-46d8-ab38-ed8fca7d3a4e',
    '7c2974a1-7dfe-477d-bc2e-20c5a6a08643',
    '84ad1513-ac8f-45ca-98f8-e239a2eb09f8',
    '80a0e698-3d7e-43ce-a80a-2f7571d027ff',
    'a209c033-3364-4867-8fab-36ad15c77185',
    '27e7471a-2187-4209-b912-05cb7913cb82',
    'b0d76691-39f9-4fea-99e5-b6af5ab8b51a',
    '63b4bfba-706d-422c-8764-6cf6872ea616',
    '5157b8d6-89f5-42dc-8504-ebe5d4cda170',
    'd85c9132-dc6d-4035-914d-58147f210411',
    '63e21508-bc22-4bb1-8eac-4712d234c282',
    '1fda8281-dc94-4a01-b492-0cf385b51f26',
    '61620208-9698-4605-9f9a-3d6e734f2fb4',
    'ab1a4ab5-b26c-433e-8c79-418c80cddff0',
    '20ed15ad-6c34-4858-b73e-8fcf5119c70c',
    '7c10751a-3cb4-4a42-9c7f-44ed3a524157',
    'eaf0c529-923d-4977-88dc-c6c7022c560a',
    'ad6fafdf-91f1-4865-9a20-c8c749bef3f2',
    '45f66040-a698-492b-b59f-3047cc23394b',
    'c5e72ae7-4cb0-4455-9c5f-36d1dd2f3fba',
    '8b23004c-3207-4eb9-a0e9-cbbb3f8a807d',
    'eee54af2-6806-4a40-b76d-f4e222dccd8a',
    '7433f821-ef51-4bcd-b2a8-e0b7fc96eaa1',
    '4d0f0ef9-b6c5-4fcf-9fe8-60730ceb3cea',
    '10e9ac52-d181-4ceb-a1a3-550dd2b5bb8b',
    '50516407-200c-4eca-9412-3606417f8f23',
    'e0c6dfc2-6f4b-4ee8-96cf-aaaf7344d077',
    '0b685347-bb00-4713-8d71-e140c2be626a',
    'f376d6e2-1f38-484f-901f-afa80a261f07',
    '03f91de4-db9a-4d90-81c9-010f197e382a',
    'ad3dcca6-30bc-4f4a-ac73-af7552040611',
    '189c67e2-afd6-4087-bdd7-f2960b03f021',
    '50992782-4372-43bc-8b49-8b58e0a65e47',
    'a0ff171a-5825-40df-853b-3a969f3d7700',
    'aacb6053-ea9c-44e8-bdb9-0b86b68161fa',
    '4f9520a4-19b4-4860-83f6-f1dee915abc2',
    '39d5e826-b3e6-4fd2-8c74-56a3b68f92fa',
    'd7c8a22b-8b66-4825-b3bb-ff58fc00d294',
    '81680a18-cfe4-49a5-9545-2ad4d78e217d',
    'b5b8a461-912e-4a82-80b0-48a08fc4cc21',
    'a89808ff-de6b-496c-8204-2cd120f8a40a',
    '3096ba31-3ff4-4e2c-8bfb-c2940c3900af',
    '9786fd6d-3b6e-42f7-a889-cd9d51fe0670',
    '0ab0e90b-55d6-487e-b3e0-0e9694faeaf8',
    'fb356cb8-55b9-4e14-94c4-7bff0cc8a08d',
    '3380d1a3-7202-41bb-9d08-0958a712225f',
    'ec365580-d587-4f7e-92ef-6e3dc77eb2ed',
    '9716da3d-4523-492d-8d1a-b91009b66713',
    'c4c8a9a9-97bd-41a4-9070-9fff6e9712e8',
    '79df9b36-7fb1-4eb6-8a19-2c4859a50b41',
    '622d78ed-c06e-48ed-ba23-acb9d9e9fb71',
    '6a29d4f2-66ec-4b1e-bd72-d6af0d53247b',
    '76517186-09b3-4389-b7e6-faacf466d5a2',
    'f0c0e5de-05a9-45b0-85cc-64b71d5d983a',
    'cb5807f7-2c8c-49cc-9b42-7a4b93d6cd0c',
    '909e2c8a-8a58-4dc2-862c-7f90164bf4eb',
    '00311bcc-e88b-4172-b1bc-4c2bf1a2eca6',
    '5fc28947-e4eb-4e06-87f4-179712351a0d',
    '9fafd3ab-cc80-495d-bbef-8a360df839b6',
    'da67c6ce-2b68-4302-a556-f895dc2669c3',
    'd008b876-11a2-4788-a032-5e9a3aed0635',
    '197f4b5b-3bd9-4452-8f5f-cecf79cceb46',
    '60bd03ac-7bc6-4c91-b5ee-d8b81d378531',
    '6b77df14-4a2c-4325-9d75-9741080c4c90',
    'b07711f6-f0a8-46ff-a574-53ca8b1c407f',
    'd5155531-bb05-4e63-88dd-a00cfba70144',
    '9412de7c-e015-4944-bff4-a6ab7dfc26b5',
    '4f01acb5-f851-4af2-976a-fbb6f2e22883',
    'e9546f48-85b9-45d0-b734-5f9b78510b09',
    '188f3bd7-e9af-4f17-b4ab-5d77c297dcb1',
    '5107a92c-8fea-4589-a367-1a66b16e440e',
    'ff8029a8-9ed8-42a9-8cc9-1970faec129d',
    'e0c37053-3e7b-492e-ab61-4dc0689482d4',
    '4d6f1e45-d7f3-4522-a3ae-6fb3056bb237',
    'bab31567-c06d-4fc5-a95f-7146c890ea42',
    '10bda5a6-720e-44e4-9fea-f3dc7fcb430e',
    'db1d3440-5cf3-4e67-b232-2aa8b7237eff',
    '9c55b651-7379-4221-afe8-f1fe0d8c11b1',
    '6fdea8c7-69d9-466e-9fa2-aca30722ff68',
    '6a92c922-83b3-4acb-9b9d-36eeea3a7f0b',
    '45254a52-1157-43d6-a218-e6965bf2f6d2',
    'e5677706-5065-46d4-a519-d358c0b267c1',
    '8a75c087-e043-4364-adb3-9cb6d58fc3b4',
    'e04d1bfd-39ee-4e2d-bbc6-04c6398a6410',
    '457179a0-4bec-42c7-a4ee-a73116d16c31',
    'a13d48e4-be63-415d-9a88-6df2d8ce0c9a',
    'c930f337-5a82-48e4-b403-9615f0e4f951',
    'bbe983b9-6c26-460a-951b-28f048d148a7',
    '8a64c8f5-aa30-4cfd-aa62-d7cb4ba2b41b',
    '965c7172-46ff-4f12-8a8e-a17bf5bd4780',
    '5d6cc84c-f03a-485a-8f90-1b44c1fa55c9',
    '9656570b-c0c8-41af-a759-6ac3786e498c',
    'f3436925-4cd6-4409-bd6a-9f0714ff6f86',
    '47336244-8514-4842-b3fc-f9500dd12cd8',
    '0cbcaf9e-0d62-48c6-a0d8-69b9d78b50ed',
    '9272d799-00d2-48f9-b3fc-48c73b85a4f9',
    '62af84d8-f0f3-4b4a-b3a0-71262132b9e6',
    '2a1375fc-a976-4327-829f-d0d0f6155cc5',
    'bceb34d5-a22c-460c-94c1-14e3c14ad467',
    '190e0383-54b2-491e-8f5a-81167f6e1770',
    '6bcd4128-b24d-4448-8717-eb7519364147',
    '00002787-81de-47e4-a7a9-7da09bc95592',
    '9868d80a-ef8b-4dc7-bfeb-0c9dc488739b',
    'c98486d7-107c-49b7-a5d1-d36da16b3f66',
    '470dc943-2c98-4466-a6ab-0c134a4189fb',
    '6c50e533-41ab-4c09-9845-fe006b40ac3d',
    '57eb0f97-fc7b-49c3-821e-11c763ee6a94',
    '2302effc-1f0d-4618-a360-543e1892a549',
    'c13b2c4e-e5da-4384-a479-803dbbf3acc5',
    '533ba93b-506e-4547-9174-037a6b17835d',
    '768fd1b9-8785-44d3-bff3-353657ef1174',
    '310d443d-dd36-4884-843d-6d93596034a6',
    'd9f8c794-3d10-4150-8c82-31f6542777fb',
    '943f6d78-c341-4c95-8cf1-497293fa5d02',
    '918bd1b7-11e8-400d-8906-fabf3b30b7f9',
    '82703bd5-1ee1-4c9a-920c-824437b91dbf',
    'e36630d4-e842-4fcd-9ef0-b763df1ae0c1',
    '8f341ba5-a191-4899-b62e-68c50434b43a',
    'b35c33d8-19ce-4454-8a49-c5d403b3852f',
    '2e0c00b1-c743-48af-93a4-0599efb714f2',
    'fc24235e-8c3e-448b-88c1-ad607f1dea52',
    '67d9333e-874a-45ad-b550-d1489263a23c',
    '37e96cd2-487b-4969-a4f7-ea0fed69505a',
    'dddf6096-97fa-4387-9b2e-9ccc1f447eeb',
    'df6894ed-1854-49ae-9097-c3e5527b9174',
    '41aa42bf-c3ba-4ff4-8ec4-f4519edeeea4',
    '224b66ae-5714-40d3-ba64-14caca2ef232',
    '56078c29-a393-4c60-9e04-3674e02fe729',
    'cf26bb77-6a92-4e0d-9da6-b1f92087137a',
    'a35fc432-b9ba-4633-bef7-4e317ff34df5',
    'ab71d294-4ba9-44d4-8051-913b3d5ccff3',
    '19cf6f7c-07ce-410a-9332-ebb4e3237a70',
    '197bda16-0ad5-4085-bba8-11bdc038efc6',
    '12791f33-5f01-4cf5-bf99-3f9fde75077a',
    '100ce437-e8b4-42ec-a5e7-49b5318f3adf',
    '15728ae8-aa66-49cf-bffb-300c50f9c88e',
    '32b50544-21d8-4a8c-af88-7a4f134001f0',
    '98527f0d-e774-48f7-bc0c-409a07f2f540',
    'bcc52739-926b-4a37-ac9e-23d60400770f',
    '162ab47c-4d0d-478f-ab46-976f73b77359',
    'f885a740-5559-45ec-a05f-5f43fc6d2cd7',
    'c4990f27-a5d9-4fe0-9ab6-d579d358699d',
    '9efd748c-ad09-4765-b645-1b6ef6b5d402',
    '4f51b794-aaf3-4553-9d5c-509dc1e9e8f4',
]
# snapshot_id_list = [
#     '88276cb9-cfe6-40af-bd85-52e19dcffb8c'
# ]

# Mapping from "<dbGaP phs ID>:<DS consent code>" to Disease Ontology (DOID) term URL(s) for
# disease-specific (DS) consent codes (replace "_" with "-" in consent first).
# Each value is a single DOID URL string, except where a consent code covers more than one
# disease, in which case the value is a list of DOID URLs (see 'phs003200:DS-MSC-MDS').
# NOTE: every key must start with a well-formed "phs" + 6-digit study ID; a misspelled prefix
# (e.g. "ph2002502") can never equal a key built from the real phs ID and is silently ignored.
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-MULTIPLE-DISEASES-IRB-COL-NPU-RD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-MBND-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GR-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-DSDI-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-RD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001901:DS-CVD-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002004:DS-AUT': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002032:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-AASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU': 'http://purl.obolibrary.org/obo/DOID_0060041',
    # The next two keys were previously misspelled with a "ph2002502" prefix.
    'phs002502:DS-MLHLTH-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MH': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_1289',
    'phs003200:DS-MSC-MDS': ['http://purl.obolibrary.org/obo/DOID_1909', 'http://purl.obolibrary.org/obo/DOID_4159'],
}

# DUOS bearer token; left blank here and filled in per run
# (obtain one with: gcloud auth print-access-token)
duos_token = ""

# Which DUOS environment to query (this run targets "prod")
duos_env = "prod"

#############################################
## Execution
#############################################
# Collect one metadata record per snapshot, load the records into a dataframe,
# then order the rows by study name and, within a study, by consent group name.
dataset_details_records = fetch_dataset_details(
    snapshot_id_list,
    ds_consent_map,
    duos_token,
    duos_env,
)
output = pd.DataFrame(data=dataset_details_records)
output_sorted = output.sort_values(
    ["studyName", "consentGroups.consentGroupName"],
    ascending=True,
    ignore_index=True,
)

#############################################
## Validation and Output
#############################################
# Work on a copy so the unique-value checks never modify the sorted output
output_unique_val = output_sorted.copy()

# List-valued study fields are passed through try_join (defined elsewhere) so that
# they can be counted as single values by nunique() below
list_fields = [
    "dataTypes",
    "dataCustodianEmail",
    "nihICsSupportingStudy",
    "collaboratingSites",
    "alternativeDataSharingPlanReasons",
]
for list_field in list_fields:
    output_unique_val[list_field] = list(map(try_join, output_unique_val[list_field]))

# Study-level columns = everything except consent-group columns and per-snapshot identifiers
study_level_col_list = [
    column_name
    for column_name in output_unique_val.columns
    if not (
        "consentGroups." in column_name
        or column_name in ("studyName", "snapshot_id", "consortium", "consentCode", "snapshot_duos_id", "match_duos_id")
    )
]

# Count distinct values per study for each study-level column; a study passes only
# when no column holds more than one distinct value across its rows
df_unique = output_unique_val.groupby("studyName")[study_level_col_list].nunique()
df_unique["unique_value_validation"] = [
    "Pass" if distinct_count <= 1 else "Fail"
    for distinct_count in df_unique.max(axis=1)
]

# Work on a copy so the enum checks never modify the sorted output
output_enum_val = output_sorted.copy()

# Replace each enum column with the result of its validator (validators are defined elsewhere).
# NOTE(review): the Pass/Fail logic below treats a summed result below 1 as valid, so each
# validator presumably returns 0 for an accepted value and a positive number otherwise - confirm.
for enum_column, enum_validator in (
    ("studyType", val_study_type_enum),
    ("nihInstitutionCenterSubmission", val_nih_inst_center_sub_enum),
    ("nihICsSupportingStudy", val_nih_ic_supp_study_enum),
    ("consentGroups.fileTypes.fileType", val_file_type_enum),
):
    output_enum_val[enum_column] = list(map(enum_validator, output_enum_val[enum_column]))

# Study-level enums: sum validator results per study; any column summing to 1 or more fails the study
study_enum_cols = ["studyType", "nihInstitutionCenterSubmission", "nihICsSupportingStudy"]
df_study_enum = output_enum_val.groupby("studyName")[study_enum_cols].sum()
df_study_enum["study_enum_value_validation"] = [
    "Pass" if worst_total < 1 else "Fail"
    for worst_total in df_study_enum.max(axis=1)
]

# Consent-group-level enums: same scheme, grouped by consent group name
consent_group_enum_cols = ["consentGroups.fileTypes.fileType"]
df_consent_group_enum = output_enum_val.groupby("consentGroups.consentGroupName")[consent_group_enum_cols].sum()
df_consent_group_enum["consent_group_enum_value_validation"] = [
    "Pass" if worst_total < 1 else "Fail"
    for worst_total in df_consent_group_enum.max(axis=1)
]

# Attach the three Pass/Fail columns to the sorted output: the two study-level results
# are matched on studyName, the consent-group result on consentGroups.consentGroupName
output_sorted_validated = (
    output_sorted
    .join(df_unique["unique_value_validation"], on="studyName", how="left")
    .join(df_study_enum["study_enum_value_validation"], on="studyName", how="left")
    .join(
        df_consent_group_enum["consent_group_enum_value_validation"],
        on="consentGroups.consentGroupName",
        how="left",
    )
)

# Show the validated metadata first, then each validation summary table
print("----------------------------------------------------------------------------------------------------")
print("----------------------------------------------------------------------------------------------------")
print("Validated Metadata Output:")
display(output_sorted_validated.style.hide(axis="index"))

# Each summary frame is indexed by its group key; reset_index turns that key back into a
# regular column (in place) so it still shows once the index is hidden
for section_heading, summary_frame in (
    ("Unique Study Value Validation Results:", df_unique),
    ("Study Enum Value Validation Results:", df_study_enum),
    ("Consent Group Enum Value Validation Results:", df_consent_group_enum),
):
    print("\n")
    print(section_heading)
    summary_frame.reset_index(inplace=True)
    display(summary_frame.style.hide(axis="index"))


Building DUOS dataset and study lookups...
Processing snapshot_id: c53121e7-5464-4b08-9a79-9c3a435d5b39...
	Snapshot PHS_ID: phs002041
	Snapshot Consent Code: NA
	Source Workspace: AnVIL_NIMH_Broad_WGSPD1_McCarroll_COGS_DS_10XLRGenomes
	DUOS ID: None
	Consent Group Name: ANVIL_NIMH_Broad_WGSPD1_McCarroll_COGS_DS_10XLRGenomes (NA)
Processing snapshot_id: 2429f987-345e-43f0-aae1-a4752a925066...
	Snapshot PHS_ID: 
	Snapshot Consent Code: Consortia Access Only
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_bernstein_wes
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_broad_ai_ibd_daly_bernstein_wes (Consortia Access Only)
Processing snapshot_id: dd292226-35c4-4416-9fa7-5ee42a9b0441...
	Snapshot PHS_ID: phs001489
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_broad_np_epilepsy_ausalf_hmb_irb_gsrs_gsa_md (TBD)
Processing snapshot_id: a37d9def-52ca-488e-9468-8e2e211fb3d5...
	Snapshot PHS_ID:

	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_EAST_WES
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_broad_cvd_af_east_wes (TBD)
Processing snapshot_id: 1624cf29-2dad-4f12-8146-4e8d91ec6c81...
	Snapshot PHS_ID: 
	Snapshot Consent Code: NA
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_ENGAGE_DS_WES
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_CVD_AF_ENGAGE_DS_WES (NA)
Processing snapshot_id: c7f980b3-1dd6-4edf-9412-7c72b89ec3c2...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES (TBD)
Processing snapshot_id: 40eeef36-5305-4ca5-8f2b-cfe163de02d8...
	Snapshot PHS_ID: phs002726
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_Figtree_BioHeart_WES
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_broad_cvd_af_figtree_bioheart_wes (TBD)
Processing snapshot_id: b6baee06-c290-4848-9a4e-73fd76589

	Snapshot PHS_ID: 
	Snapshot Consent Code: Consortia Access Only
	Source Workspace: AnVIL_CCDG_Broad_NP_Autism_State-Sanders_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_broad_np_autism_state_sanders_wgs (Consortia Access Only)
Processing snapshot_id: ce3b35d7-9805-401c-a36e-d118d8fa4a1e...
	Snapshot PHS_ID: phs000920
	Snapshot Consent Code: DS-LD-RD
	Source Workspace: AnVIL_CCDG_NYGC_AI_Asthma_Gala2_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_NYGC_AI_Asthma_Gala2_WGS (DS-LD-RD)
Processing snapshot_id: 236ff37d-335c-4561-80f6-4ffa8bd88b2f...
	Snapshot PHS_ID: phs000496
	Snapshot Consent Code: Consortia Access Only
	Source Workspace: AnVIL_CCDG_NYGC_NP_Alz_WHICAP_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_NYGC_NP_Alz_WHICAP_WGS (Consortia Access Only)
Processing snapshot_id: 024cbb4f-c989-4ba4-b33d-a53790e3d6d7...
	Snapshot PHS_ID: 
	Snapshot Consent Code: DS-ASD-IRB-PUB-COL-NPO-GSO
	Source Workspace: AnVIL_CCDG_NYGC_NP_Autism_AFS_DS_WGS
	DUOS ID: None
	Consent Grou

	Snapshot PHS_ID: phs000220
	Snapshot Consent Code: GRU
	Source Workspace: AnVIL_PAGE_MEC_GRU_WGS
	DUOS ID: DUOS-000246
	Consent Group Name: ANVIL_PAGE_MEC_GRU_WGS (GRU)
Processing snapshot_id: cf30be0a-1a09-4a01-9a8a-35474a92aa71...
	Snapshot PHS_ID: 
	Snapshot Consent Code: HMB
	Source Workspace: AnVIL_PAGE_SoL_HMB_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_PAGE_SoL_HMB_WGS (HMB)
Processing snapshot_id: 14e2cc61-7a85-400b-bdfd-729c7950592c...
	Snapshot PHS_ID: phs001033
	Snapshot Consent Code: GRU
	Source Workspace: AnVIL_PAGE_Stanford_Global_Reference_Panel_GRU_WGS
	DUOS ID: DUOS-000244
	Consent Group Name: ANVIL_PAGE_Stanford_Global_Reference_Panel_GRU_WGS (GRU)
Processing snapshot_id: b6f69c83-d069-417b-a6e3-b3fb5fdd21a6...
	Snapshot PHS_ID: phs000227
	Snapshot Consent Code: HMB-IRB
	Source Workspace: AnVIL_PAGE_WHI_HMB-IRB_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_PAGE_WHI_HMB_IRB_WGS (HMB-IRB)
Processing snapshot_id: aff867bd-ac09-4dbc-9031-a145fcc36a5c...
	Snapshot PHS_I

	Snapshot PHS_ID: phs002726
	Snapshot Consent Code: NA
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_Figtree_BioHeart_Arrays
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_CVD_AF_Figtree_BioHeart_Arrays (NA)
Processing snapshot_id: a9df5cdd-c041-4e74-9ee6-85d744b8fc6b...
	Snapshot PHS_ID: 
	Snapshot Consent Code: NA
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_Figtree_BioHeart_HMB_WES
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_CVD_AF_Figtree_BioHeart_HMB_WES (NA)
Processing snapshot_id: a0b58558-82ff-4de2-835f-5f0af4ca29ec...
	Snapshot PHS_ID: phs002236
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_GAPP_DS-MDS_Arrays
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays (TBD)
Processing snapshot_id: 7d08223e-7dcf-4eac-9c67-9e4a619ff783...
	Snapshot PHS_ID: phs001933
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_

	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_CHEUBB_HMB-IRB-MDS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_NP_Epilepsy_CHEUBB_HMB_IRB_MDS_GSA_MD (TBD)
Processing snapshot_id: ad6fafdf-91f1-4865-9a20-c8c749bef3f2...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB-NPU-MDS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB_NPU_MDS_GSA_MD (TBD)
Processing snapshot_id: 45f66040-a698-492b-b59f-3047cc23394b...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_CZEMTH_GRU_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_NP_Epilepsy_CZEMTH_GRU_GSA_MD (TBD)
Processing snapshot_id: c5e72ae7-4cb0-4455-9c5f-36d1dd2f3fba...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB-MDS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_

	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_GSA_MD (TBD)
Processing snapshot_id: 9716da3d-4523-492d-8d1a-b91009b66713...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_LTUUHK_HMB_NPU_MDS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_NP_Epilepsy_LTUUHK_HMB_NPU_MDS_GSA_MD (TBD)
Processing snapshot_id: c4c8a9a9-97bd-41a4-9070-9fff6e9712e8...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_GSA_MD (TBD)
Processing snapshot_id: 79df9b36-7fb1-4eb6-8a19-2c4859a50b41...
	Snapshot PHS_ID: 
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_TURBZU_GRU_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Broa

	Snapshot PHS_ID: phs001489
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ZAFAGN_DS-EPI-COMO-MDS_GSA-MD
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_broad_np_epilepsy_zafagn_ds_epi_como_mds_gsa_md (TBD)
Processing snapshot_id: 9c55b651-7379-4221-afe8-f1fe0d8c11b1...
	Snapshot PHS_ID: 
	Snapshot Consent Code: None
	Source Workspace: AnVIL_CCDG_Freeze2_VCFs
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_Freeze2_VCFs
Processing snapshot_id: 6fdea8c7-69d9-466e-9fa2-aca30722ff68...
	Snapshot PHS_ID: phs000298
	Snapshot Consent Code: None
	Source Workspace: AnVIL_CCDG_NHGRI_Broad_ASD_Daly_phs000298_WES_vcf
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_NHGRI_Broad_ASD_Daly_phs000298_WES_vcf
Processing snapshot_id: 6a92c922-83b3-4acb-9b9d-36eeea3a7f0b...
	Snapshot PHS_ID: phs000496
	Snapshot Consent Code: Consortia Access Only
	Source Workspace: AnVIL_CCDG_NYGC_NP_Alz_EFIGA_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_CCDG_NYGC_NP_Alz_EFIGA_WGS (Consortia Acce

	Snapshot PHS_ID: phs001272
	Snapshot Consent Code: Consortia Access Only
	Source Workspace: AnVIL_CMG_Broad_Muscle_Kang_WES
	DUOS ID: None
	Consent Group Name: ANVIL_CMG_Broad_Muscle_Kang_WES (Consortia Access Only)
Processing snapshot_id: c13b2c4e-e5da-4384-a479-803dbbf3acc5...
	Snapshot PHS_ID: phs001272
	Snapshot Consent Code: Consortia Access Only
	Source Workspace: AnVIL_CMG_Broad_Muscle_Kang_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_CMG_Broad_Muscle_Kang_WGS (Consortia Access Only)
Processing snapshot_id: 533ba93b-506e-4547-9174-037a6b17835d...
	Snapshot PHS_ID: phs001272
	Snapshot Consent Code: HMB-MDS
	Source Workspace: AnVIL_CMG_Broad_Muscle_Myoseq_WES
	DUOS ID: None
	Consent Group Name: ANVIL_CMG_Broad_Muscle_Myoseq_WES (HMB-MDS)
Processing snapshot_id: 768fd1b9-8785-44d3-bff3-353657ef1174...
	Snapshot PHS_ID: 
	Snapshot Consent Code: HMB-MDS
	Source Workspace: AnVIL_CMG_Broad_Muscle_Myoseq_WGS
	DUOS ID: None
	Consent Group Name: ANVIL_CMG_Broad_Muscle_Myoseq_WGS (HMB-MD

Processing snapshot_id: f885a740-5559-45ec-a05f-5f43fc6d2cd7...
	Snapshot PHS_ID: phs001616
	Snapshot Consent Code: TBD
	Source Workspace: AnVIL_eMERGE_PGRNseq
	DUOS ID: None
	Consent Group Name: ANVIL_eMERGE_PGRNseq (TBD)
Processing snapshot_id: c4990f27-a5d9-4fe0-9ab6-d579d358699d...
	Snapshot PHS_ID: phs000298
	Snapshot Consent Code: TBD
	Source Workspace: anvil_ccdg_asc_ndd_daly_talkowski_cdcseed_asd_gsa-md
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_asc_ndd_daly_talkowski_cdcseed_asd_gsa_md (TBD)
Processing snapshot_id: 9efd748c-ad09-4765-b645-1b6ef6b5d402...
	Snapshot PHS_ID: phs002502
	Snapshot Consent Code: NA
	Source Workspace: anvil_ccdg_asc_ndd_daly_talkowski_cdcseed_asd_hmb_wes
	DUOS ID: None
	Consent Group Name: ANVIL_ccdg_asc_ndd_daly_talkowski_cdcseed_asd_hmb_wes (NA)
Processing snapshot_id: 4f51b794-aaf3-4553-9d5c-509dc1e9e8f4...
	Snapshot PHS_ID: phs000298
	Snapshot Consent Code: TBD
	Source Workspace: anvil_ccdg_asc_ndd_daly_talkowski_schloesser_asd_gsa-md
	DUOS ID

snapshot_id,snapshot_phs_id,snapshot_duos_id,match_duos_id,match_study_id,studyName,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,dataCustodianEmail,publicVisibility,nihAnvilUse,submittingToAnvil,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,sequencingCenter,piEmail,piInstitution,nihGrantContractNumber,nihICsSupportingStudy,nihProgramOfficerName,nihInstitutionCenterSubmission,nihInstitutionalCertificationFileName,nihGenomicProgramAdministratorName,multiCenterStudy,collaboratingSites,controlledAccessRequiredForGenomicSummaryResultsGSR,controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation,alternativeDataSharingPlan,alternativeDataSharingPlanReasons,alternativeDataSharingPlanExplanation,alternativeDataSharingPlanFileName,alternativeDataSharingPlanDataSubmitted,alternativeDataSharingPlanDataReleased,alternativeDataSharingPlanTargetDeliveryDate,alternativeDataSharingPlanTargetPublicReleaseDate,alternativeDataSharingPlanAccessManagement,consentGroups.consentGroupName,consentGroups.accessManagement,consentGroups.numberOfParticipants,consentCode,consentGroups.generalResearchUse,consentGroups.hmb,consentGroups.diseaseSpecificUse,consentGroups.gs,consentGroups.poa,consentGroups.nmds,consentGroups.gso,consentGroups.pub,consentGroups.col,consentGroups.irb,consentGroups.npu,consentGroups.otherPrimary,consentGroups.otherSecondary,consentGroups.mor,consentGroups.morDate,consentGroups.dataLocation,consentGroups.url,consentGroups.fileTypes.fileType,consentGroups.fileTypes.functionalEquivalence,consortium,unique_value_validation,study_enum_value_validation,consent_group_enum_value_validation
57eb0f97-fc7b-49c3-821e-11c763ee6a94,,,,,AnVIL CMG,Parent-Offspring Trios,Rare muscular disease samples submitted to the Broad Center for Mendelian Genomics by the Kids Neuroscience Centre of the Sydney Children's Hospitals Network. Cohort consists of WGS samples aligned to hg38. Platform: AnVIL,['Raw Sequencing data'],Mendelian disorders,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CMG'],,,False,[],,,,,,,,ANVIL_CMG_Broad_Muscle_KNC_WGS (DS-NIC-EMP-LENF),controlled,6.0,DS-NIC-EMP-LENF,False,False,[],,False,False,False,False,False,False,False,DS-NIC-EMP-LENF,,,,TDR Location,https://data.terra.bio/snapshots/57eb0f97-fc7b-49c3-821e-11c763ee6a94,['Whole Genome'],,CMG,Pass,Fail,Fail
48390abd-821d-4af1-b563-a99b38e260da,,,,,AnVIL_CCDG_Baylor_CVD_HemStroke_GOCHA_DS_WGS,,Platform: AnVIL,['Genotyping Array data'],hemorrhagic stroke,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_ccdg_baylor_cvd_hemstroke_gocha_ds_wgs (TBD),controlled,0.0,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/48390abd-821d-4af1-b563-a99b38e260da,['Whole Genome'],,CCDG,Pass,Pass,Fail
32252585-907e-4e7d-ab50-8bc7e5eefcba,,,,,AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_WES,Case-Control,AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_WES Platform: AnVIL,['Raw Sequencing data'],epilepsy,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_ccdg_broad_np_epilepsy_twncgm_hmb_npu_adults_wes (HMB-NPU-ADULTS),controlled,1017.0,HMB-NPU-ADULTS,False,True,[],,False,False,False,False,False,False,True,,,,,TDR Location,https://data.terra.bio/snapshots/32252585-907e-4e7d-ab50-8bc7e5eefcba,['Exome'],,CCDG,Pass,Fail,Pass
51620360-86eb-437f-b3dc-b9abd9eef4cc,,,,,AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_GSA-MD,Case-Control,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,Benjamin Neale,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_GSA_MD (TBD),controlled,46.0,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/51620360-86eb-437f-b3dc-b9abd9eef4cc,['Genotyping Array'],,CCDG,Pass,Fail,Fail
d771bc68-4ac4-4ed6-abcc-8269a16c7121,,,,,AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES,Case Set,AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES Platform: AnVIL,['Raw Sequencing data'],epilepsy,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES (GRU-GSRS),controlled,47.0,GRU-GSRS,True,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/d771bc68-4ac4-4ed6-abcc-8269a16c7121,['Exome'],,CCDG,Pass,Fail,Pass
c930f337-5a82-48e4-b403-9615f0e4f951,,,,,AnVIL_CMG_Broad_Brain_Engle_WGS,Parent-Offspring Trios,AnVIL_CMG_Broad_Brain_Engle_WGS Platform: AnVIL,['Raw Sequencing data'],Mendelian disorders,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CMG'],,,False,[],,,,,,,,ANVIL_cmg_broad_brain_engle_wgs (TBD),controlled,95.0,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/c930f337-5a82-48e4-b403-9615f0e4f951,['Whole Genome'],,CMG,Pass,Fail,Fail
00002787-81de-47e4-a7a9-7da09bc95592,,,,,AnVIL_CMG_Broad_Kidney_Southampton_WGS,,AnVIL_CMG_Broad_Kidney_Southampton_WGS Platform: AnVIL,['Raw Sequencing data'],,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,[],,,False,[],,,,,,,,ANVIL_CMG_Broad_Kidney_Southampton_WGS (NA),controlled,3.0,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/00002787-81de-47e4-a7a9-7da09bc95592,['Whole Genome'],,,Pass,Pass,Fail
768fd1b9-8785-44d3-bff3-353657ef1174,,,,,AnVIL_CMG_Broad_Muscle_Myoseq_WGS,Parent-Offspring Trios,AnVIL_CMG_Broad_Muscle_Myoseq_WGS Platform: AnVIL,['Raw Sequencing data'],Mendelian disorders,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CMG'],,,False,[],,,,,,,,ANVIL_CMG_Broad_Muscle_Myoseq_WGS (HMB-MDS),controlled,13.0,HMB-MDS,False,True,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/768fd1b9-8785-44d3-bff3-353657ef1174,['Whole Genome'],,CMG,Pass,Fail,Fail
310d443d-dd36-4884-843d-6d93596034a6,,,,,AnVIL_CMG_Broad_Muscle_OGrady_WES,Parent-Offspring Trios,AnVIL_CMG_Broad_Muscle_OGrady_WES Platform: AnVIL,['Raw Sequencing data'],Mendelian disorders,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CMG'],,,False,[],,,,,,,,ANVIL_CMG_Broad_Muscle_OGrady_WES (GRU),controlled,97.0,GRU,True,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/310d443d-dd36-4884-843d-6d93596034a6,['Exome'],,CMG,Pass,Fail,Pass
82703bd5-1ee1-4c9a-920c-824437b91dbf,,,,,AnVIL_CMG_Broad_Orphan_Chung_WGS,,AnVIL_CMG_Broad_Orphan_Chung_WGS Platform: AnVIL,['Raw Sequencing data'],,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,[],,,False,[],,,,,,,,ANVIL_CMG_Broad_Orphan_Chung_WGS (NA),controlled,120.0,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/82703bd5-1ee1-4c9a-920c-824437b91dbf,['Whole Genome'],,,Pass,Pass,Fail




Unique Study Value Validation Results:


studyName,snapshot_phs_id,match_study_id,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,dataCustodianEmail,publicVisibility,nihAnvilUse,submittingToAnvil,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,sequencingCenter,piEmail,piInstitution,nihGrantContractNumber,nihICsSupportingStudy,nihProgramOfficerName,nihInstitutionCenterSubmission,nihInstitutionalCertificationFileName,nihGenomicProgramAdministratorName,multiCenterStudy,collaboratingSites,controlledAccessRequiredForGenomicSummaryResultsGSR,controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation,alternativeDataSharingPlan,alternativeDataSharingPlanReasons,alternativeDataSharingPlanExplanation,alternativeDataSharingPlanFileName,alternativeDataSharingPlanDataSubmitted,alternativeDataSharingPlanDataReleased,alternativeDataSharingPlanTargetDeliveryDate,alternativeDataSharingPlanTargetPublicReleaseDate,alternativeDataSharingPlanAccessManagement,unique_value_validation
AnVIL CMG,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Baylor_CVD_HemStroke_GOCHA_DS_WGS,1,1,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_WES,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_GSA-MD,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CMG_Broad_Brain_Engle_WGS,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CMG_Broad_Kidney_Southampton_WGS,1,1,0,1,1,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CMG_Broad_Muscle_Myoseq_WGS,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CMG_Broad_Muscle_OGrady_WES,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CMG_Broad_Orphan_Chung_WGS,1,1,0,1,1,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass




Study Enum Value Validation Results:


studyName,studyType,nihInstitutionCenterSubmission,nihICsSupportingStudy,study_enum_value_validation
AnVIL CMG,1,0,0,Fail
AnVIL_CCDG_Baylor_CVD_HemStroke_GOCHA_DS_WGS,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_WES,1,0,0,Fail
AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_GSA-MD,1,0,0,Fail
AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES,1,0,0,Fail
AnVIL_CMG_Broad_Brain_Engle_WGS,1,0,0,Fail
AnVIL_CMG_Broad_Kidney_Southampton_WGS,0,0,0,Pass
AnVIL_CMG_Broad_Muscle_Myoseq_WGS,1,0,0,Fail
AnVIL_CMG_Broad_Muscle_OGrady_WES,1,0,0,Fail
AnVIL_CMG_Broad_Orphan_Chung_WGS,0,0,0,Pass




Consent Group Enum Value Validation Results:


consentGroups.consentGroupName,consentGroups.fileTypes.fileType,consent_group_enum_value_validation
ANVIL_ALSCompute_Collection_GRU (GRU),0,Pass
ANVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS (Consortia Access Only),0,Pass
ANVIL_CCDG_Baylor_CVD_AFib_Groningen_WGS (GRU),0,Pass
ANVIL_CCDG_Baylor_CVD_AFib_VAFAR_HMB_IRB_WGS (HMB-IRB),0,Pass
ANVIL_CCDG_Baylor_CVD_EOCAD_BioMe_WGS (GRU),0,Pass
ANVIL_CCDG_Baylor_CVD_EOCAD_SoL_WGS (HMB-NPU),1,Fail
ANVIL_CCDG_Baylor_CVD_HHRC_Brownsville_GRU_WGS (GRU),0,Pass
ANVIL_CCDG_Baylor_CVD_HemStroke_BNI_HMB_WGS (HMB),1,Fail
ANVIL_CCDG_Baylor_CVD_HemStroke_Duke_DS_WGS (DS),1,Fail
ANVIL_CCDG_Baylor_CVD_HemStroke_ERICH_WGS (HMB-IRB),1,Fail


In [None]:
# Lookup from "<phs ID>:<DS consent code>" to the Disease Ontology (DOID) term
# URL used for that disease-specific consent. A value is either a single URL
# string or a list of URLs when more than one term applies.
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-MULTIPLE-DISEASES-IRB-COL-NPU-RD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-MBND-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GR-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-DSDI-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-RD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001901:DS-CVD-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002004:DS-AUT': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002032:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-AASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU': 'http://purl.obolibrary.org/obo/DOID_0060041',
    # The next two keys were spelled "ph2002502" originally; corrected to the
    # "phs" prefix used by every other key so lookups by phs ID can match.
    'phs002502:DS-MLHLTH-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MH': 'http://purl.obolibrary.org/obo/DOID_150',
    # NOTE(review): other DS-MBND codes in this map point to DOID_150 -- confirm DOID_1289 is intended here.
    'phs002502:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_1289',
    'phs003200:DS-MSC-MDS': ['http://purl.obolibrary.org/obo/DOID_1909', 'http://purl.obolibrary.org/obo/DOID_4159']
}

# Print the map as pipe-delimited "key|value" lines. List values are joined
# with commas, because concatenating a list to a string raises TypeError.
for key, val in ds_consent_map.items():
    terms = val if isinstance(val, list) else [val]
    print(key + "|" + ",".join(terms))

# Step 2: Load Reviewed Metadata into DUOS

In [None]:
#############################################
## Functions
#############################################

def format_list(input_list, min_items):
    """Coerce a metadata cell into a list, padding empty input with "Unknown".

    Args:
        input_list: A list, a string holding a Python literal (for example
            ``"['a', 'b']"``), a plain string, or an empty/None value.
        min_items: Number of ``"Unknown"`` placeholders returned when the
            input is empty.

    Returns:
        * a non-empty list, unchanged;
        * the formatted result of the parsed value for a string that holds
          a Python literal;
        * ``[text]`` for a non-empty string that is not a Python literal
          (previously this raised from ``ast.literal_eval``);
        * ``["Unknown"] * min_items`` for empty input (``[]`` when
          ``min_items`` is not positive);
        * ``[]`` for any other truthy value that is neither list nor string.
    """
    if isinstance(input_list, str):
        text = input_list.strip()
        if text:
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, TypeError, SyntaxError):
                # Not a Python literal (e.g. a bare e-mail address or name):
                # keep it as a single-item list rather than aborting the caller.
                return [text]
            # The literal may itself be a list, a quoted string, or empty.
            return format_list(parsed, min_items)
        # Blank / whitespace-only strings are treated like empty input.
        input_list = None

    if input_list:
        return input_list if isinstance(input_list, list) else []

    return ["Unknown"] * min_items if min_items > 0 else []
    
def format_file_types(ft_list, fe):
    """Build one file-type dict per entry in ``ft_list``.

    Args:
        ft_list: File types as a list or a string accepted by ``format_list``.
        fe: Functional-equivalence value applied to every entry; a falsy
            value is replaced with ``"Unknown"``.

    Returns:
        A list of ``{"fileType": ..., "functionalEquivalence": ...}`` dicts,
        or an empty list when ``ft_list`` is empty.
    """
    if not ft_list:
        return []

    equivalence = fe if fe else "Unknown"
    return [
        {"fileType": file_type, "functionalEquivalence": equivalence}
        for file_type in format_list(ft_list, 0)
    ]
    
def upload_to_duos(input_file, token, env, dac_id, study_upload_list, preview_only):
    """Format reviewed metadata into DUOS study registrations, then preview or upload them.

    The input TSV is read with pandas, each row is turned into a DUOS consent
    group, and consent groups are gathered under their study. Studies that
    already have a DUOS study ID are topped up with any DUOS datasets missing
    from the input before the payload is previewed or submitted.

    Args:
        input_file: Path/URL of the tab-delimited metadata file, passed to
            ``pd.read_csv``.
        token: Bearer token sent with every DUOS request.
        env: ``"prod"`` targets the production DUOS host; any other value
            targets the dev host.
        dac_id: Data access committee ID set on new consent groups that are
            not open access.
        study_upload_list: Study names to process; an empty list processes
            every study found in the input file.
        preview_only: When true, display a preview table and send no
            POST/PUT requests; when false, register or update the studies.

    Returns:
        A list of ``[item, status, message]`` rows describing each step.
    """
    # Determine the target URL from the env variable
    if env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"

    # Pull down specified file from the cloud
    results_log = []
    print(f"Downloading input file {input_file}...")
    try:
        input_df = pd.read_csv(input_file, delimiter = "\t", encoding='unicode_escape')
        # Normalise missing cells to empty strings so truthiness checks below work
        input_df = input_df.astype(object).where(pd.notnull(input_df),None)
        input_df.fillna("",inplace=True)
        input_dict = input_df.to_dict(orient="records")
        results_log.append(["Input File Download", "Succeeded", ""])
    except Exception as e:
        msg = f"Error downloading input file ({input_file}): {str(e)}"
        results_log.append(["Input File Download", "Failed", msg])
        print(msg)
        return results_log

    # Parse and build DUOS schema for inputted file
    print("Parsing input file and formatting into DUOS schema...")
    upload_dict = {}    # study name -> study registration payload
    study_lookup = {}   # study name -> existing DUOS study ID (as a string)
    try:
        # Determine data submitter id
        response = requests.get(
            url=f"{url}/api/user/me",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        data_submitter_id = response["userId"]
        # Build dictionary for upload
        existing_dataset_cnt = 0
        new_dataset_cnt = 0
        for input_entry in input_dict:
            snapshot_id = input_entry["snapshot_id"]
            # IDs are normalised to integer strings; blank cells stay as they are
            dataset_id = str(int(input_entry["target_dataset_id"])) if input_entry["target_dataset_id"] else input_entry["target_dataset_id"]
            dataset_name = input_entry["consentGroups.consentGroupName"]
            study_id = str(int(input_entry["target_study_id"])) if input_entry["target_study_id"] else input_entry["target_study_id"]
            study_name = input_entry["studyName"]
            if study_id:
                study_lookup[study_name] = study_id
            tar_ds_id = dataset_id if dataset_id else "ID_TBD"
            tar_st_id = study_id if study_id else "ID_TBD"
            access_type = input_entry["consentGroups.accessManagement"]
            print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name} ({tar_st_id}). Target consent group is: {dataset_name} ({tar_ds_id})")

            # If this is an existing dataset in the specified existing study, provide limited consent group information (for updates only)
            if dataset_id:
                existing_dataset_cnt += 1
                consent_group_dict = {
                    "consentGroupName": dataset_name,
                    "datasetId": dataset_id,
                    "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                    "dataLocation": input_entry["consentGroups.dataLocation"],
                    "url": input_entry["consentGroups.url"],
                    "fileTypes": []
                    #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                }
            # If this is a new dataset that is open access, provide limited consent group information
            elif access_type == "open":
                new_dataset_cnt += 1
                consent_group_dict = {
                    "consentGroupName": dataset_name,
                    "accessManagement": access_type,
                    "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                    "dataLocation": input_entry["consentGroups.dataLocation"],
                    "url": input_entry["consentGroups.url"],
                    "fileTypes": []
                    #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                }
            # If this is a new dataset that is NOT open access, provide the full consent group information
            else:
                new_dataset_cnt += 1
                consent_group_dict = {
                    "consentGroupName": dataset_name,
                    "dataAccessCommitteeId": dac_id,
                    "accessManagement": access_type,
                    "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                    "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
                    "hmb": input_entry["consentGroups.hmb"],
                    "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
                    "gs": input_entry["consentGroups.gs"],
                    "poa": input_entry["consentGroups.poa"],
                    "nmds": input_entry["consentGroups.nmds"],
                    "gso": input_entry["consentGroups.gso"],
                    "pub": input_entry["consentGroups.pub"],
                    "col": input_entry["consentGroups.col"],
                    "irb": input_entry["consentGroups.irb"],
                    "npu": input_entry["consentGroups.npu"],
                    "otherPrimary": input_entry["consentGroups.otherPrimary"],
                    #"otherSecondary": input_entry["consentGroups.otherSecondary"], --> Excluding for now, per JL's request
                    #"mor": input_entry["consentGroups.mor"], --> Date formatting validation for morDate, exclude for now
                    #"morDate": input_entry["consentGroups.morDate"], --> Date formatting validation, exclude for now
                    "dataLocation": input_entry["consentGroups.dataLocation"],
                    "url": input_entry["consentGroups.url"],
                    "fileTypes": []
                    #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                }

            # If the study associated with the record is not already in the upload dictionary, create a new study dict and append the consent group dict
            study_dict = {}
            consent_group_list = []
            if study_name not in upload_dict:
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": input_entry["studyType"], --> Enumeration, exclude for now
                    "studyDescription": input_entry["studyDescription"],
                    "dataTypes": format_list(input_entry["dataTypes"], 1),
                    "phenotypeIndication": input_entry["phenotypeIndication"],
                    "species": input_entry["species"],
                    "piName": input_entry["piName"] if input_entry["piName"] else "NA",
                    "dataSubmitterUserId": data_submitter_id,
                    "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                    "publicVisibility": input_entry["publicVisibility"],
                    "nihAnvilUse": input_entry["nihAnvilUse"],
                    "submittingToAnvil": input_entry["submittingToAnvil"],
                    "dbGaPPhsID": input_entry["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": input_entry["studyName"],
                    #"embargoReleaseDate": input_entry["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": input_entry["sequencingCenter"],
                    "piEmail": input_entry["piEmail"],
                    #"piInstitution": input_entry["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": 0,
                    "nihGrantContractNumber": "Unknown", # Required currently
                    "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                    "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                    "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                    "alternativeDataSharingPlanExplanation": input_entry["alternativeDataSharingPlanExplanation"],
                    "alternativeDataSharingPlanReasons": ["Other"] if input_entry["alternativeDataSharingPlan"] == True and input_entry["alternativeDataSharingPlanReasons"] == "[]" else format_list(input_entry["alternativeDataSharingPlanReasons"], 0),
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
            # If the study is already in the upload dictionary, create an updated study dict and extend its list of consent groups
            else:
                study_dict = upload_dict[study_name].copy()
                # A consent group with the same name is replaced by the latest row
                for consent_group in study_dict["consentGroups"]:
                    if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]:
                        consent_group_list.append(consent_group)
                consent_group_list.append(consent_group_dict)
                study_dict["consentGroups"] = consent_group_list
                upload_dict[study_name] = study_dict
        msg = f"Input file formatting complete. Existing Datasets: {existing_dataset_cnt} New Datasets: {new_dataset_cnt}"
        print(msg)
        results_log.append(["Input File Parsing and Formatting", "Succeeded", msg])
    except Exception as e:
        msg = f"Error parsing and formatting input file: {str(e)}"
        results_log.append(["Input File Parsing and Formatting", "Failed", msg])
        print(msg)
        # NOTE(review): there is no early return here (it was commented out), so
        # any studies parsed before the failure are still previewed/uploaded
        # below -- confirm this is intended before running with preview_only=False.
        #return results_log

    # Loop through studies to upload and augment with any missing existing datasets
    print("Augmenting upload set with missing existing datasets...")
    for study in upload_dict:
        if study in study_upload_list or len(study_upload_list) == 0:
            study_id = study_lookup.get(study)
            if study_id:
                try:
                    # Identify existing datasets that are not present in the upload dict for a particular study
                    study_details = requests.get(
                        url=f"{url}/api/dataset/study/{study_id}",
                        headers={"Authorization": f"Bearer {token}"}
                    ).json()
                    study_datasets_in_duos = set(study_details.get("datasetIds"))
                    study_datasets_in_input = set()
                    study_datasets_diff = set()
                    for datasets in upload_dict[study]["consentGroups"]:
                        if datasets.get("datasetId"):
                            study_datasets_in_input.add(datasets.get("datasetId"))
                    # Input IDs are strings, so DUOS IDs are compared as strings
                    for dataset_in_duos in study_datasets_in_duos:
                        if str(dataset_in_duos) not in study_datasets_in_input:
                            study_datasets_diff.add(dataset_in_duos)
                    # Add missing datasets to the upload dict
                    temp_cg = upload_dict[study]["consentGroups"].copy()
                    for missing_dataset_id in study_datasets_diff:
                        dataset_details = requests.get(
                            url=f"{url}/api/dataset/v2/{missing_dataset_id}",
                            headers={"Authorization": f"Bearer {token}"}
                        ).json()
                        data_loc = ""
                        # Left blank: the preview below labels records without a
                        # TDR snapshot URL as "upload_aug".
                        data_loc_url = ""
                        num_participants = 0
                        for prop_entry in dataset_details["properties"]:
                            if prop_entry["propertyName"] == "Data Location":
                                data_loc = prop_entry["propertyValue"]
                            elif prop_entry["propertyName"] == "# of participants":
                                num_participants = prop_entry["propertyValue"]
                        consent_group_dict = {
                            "consentGroupName": dataset_details["name"],
                            "datasetId": missing_dataset_id,
                            "numberOfParticipants": num_participants,
                            "dataLocation": data_loc,
                            "url": data_loc_url,
                            "fileTypes": []
                        }
                        temp_cg.append(consent_group_dict)
                    upload_dict[study]["consentGroups"] = temp_cg
                except Exception as e:
                    # Best effort: keep going, but report what went wrong instead
                    # of swallowing it (and let KeyboardInterrupt propagate).
                    print(f"WARNING: Issue retrieving study details for study_id {study_id}. May cause issues with upload downstream. Error: {str(e)}")

    # Preview of upload input dictionary
    if preview_only:
        # Build a preview of the upload dictionary data
        print("Building upload set preview...")
        output_preview = []
        study_id_set = set()
        for study_name, study_dict in upload_dict.items():
            if study_name in study_upload_list or len(study_upload_list) == 0:
                study_id = study_lookup.get(study_name) or f"ID_TBD ({study_name})"
                study_id_set.add(study_id)
                study_phs = study_dict["dbGaPPhsID"]
                for consent_group in study_dict["consentGroups"]:
                    dataset_id = consent_group.get("datasetId")
                    dataset_name = consent_group.get("consentGroupName")
                    snapshot_url = consent_group.get("url")
                    if snapshot_url and "https://data.terra.bio/snapshots/" in snapshot_url:
                        snapshot_id = snapshot_url.replace("https://data.terra.bio/snapshots/", "")
                        record_src = "upload"
                    else:
                        snapshot_id = ""
                        record_src = "upload_aug"
                    if dataset_id:
                        # Existing dataset: show the data use currently stored in DUOS
                        dataset_details = requests.get(
                            url=f"{url}/api/dataset/v2/{dataset_id}",
                            headers={"Authorization": f"Bearer {token}"}
                        ).json()
                        dataset_identifier = dataset_details.get("datasetIdentifier")
                        # Guard against a missing/null dataUse so the preview does not crash
                        duos_data_use_dict = dataset_details.get("dataUse") or {}
                        du_gru = duos_data_use_dict.get("generalUse") or False
                        du_hmb = duos_data_use_dict.get("hmbResearch") or False
                        du_disease = duos_data_use_dict.get("diseaseRestrictions") or []
                        du_poa = duos_data_use_dict.get("populationOriginsAncestry") or False
                        du_ethics = duos_data_use_dict.get("ethicsApprovalRequired") or False
                        du_collab = duos_data_use_dict.get("collaboratorRequired") or False
                        du_geog = duos_data_use_dict.get("geographicalRestrictions") or ""
                        du_genetic = duos_data_use_dict.get("geneticStudiesOnly") or False
                        du_pub = duos_data_use_dict.get("publicationResults") or False
                        du_nmds = duos_data_use_dict.get("methodsResearch") or False
                        du_npu = duos_data_use_dict.get("nonProfitUse") or False
                        du_other = duos_data_use_dict.get("other") or ""
                    else:
                        # New dataset: show the data use that would be submitted
                        dataset_id = f"ID_TBD ({dataset_name})"
                        dataset_identifier = "ID_TBD"
                        du_gru = consent_group.get("generalResearchUse") or False
                        du_hmb = consent_group.get("hmb") or False
                        du_disease = consent_group.get("diseaseSpecificUse") or []
                        du_poa = consent_group.get("poa") or False
                        du_ethics = consent_group.get("irb") or False
                        du_collab = consent_group.get("col") or False
                        du_geog = consent_group.get("gs") or ""
                        du_genetic = consent_group.get("gso") or False
                        du_pub = consent_group.get("pub") or False
                        du_nmds = consent_group.get("nmds") or False
                        du_npu = consent_group.get("npu") or False
                        du_other = consent_group.get("otherPrimary") or ""
                    output_preview.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, du_gru, du_hmb, du_disease, du_poa, du_ethics, du_collab, du_geog, du_genetic, du_pub, du_nmds, du_npu, du_other, snapshot_id, record_src])

        # Add in AnVIL datasets not in the upload dictionary
        if len(study_upload_list) == 0:
            anvil_datasets_in_duos = get_anvil_datasets_from_duos(token, env)
            for dataset in anvil_datasets_in_duos:
                # Index 3 is the "Dataset ID" column of a preview row
                dataset_exists = False
                for output_dataset in output_preview:
                    if str(dataset[3]) == str(output_dataset[3]):
                        dataset_exists = True
                        break
                if not dataset_exists:
                    rec_to_add = dataset.copy()
                    rec_to_add.append("prod_add")
                    output_preview.append(rec_to_add)

        # Display output preview
        df_results = pd.DataFrame(output_preview, columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "GRU", "HMB", "DS", "POA", "IRB", "COL", "GS", "GSO", "PUB", "NMDS", "NPU", "OTHER", "Snapshot ID", "Record Source"])
        print("\nOutput Preview:")
        display(df_results)

    else:
        print("Uploading studies to DUOS...")
        for study in upload_dict:
            if study in study_upload_list or len(study_upload_list) == 0:
                study_id = study_lookup.get(study)
                # For studies that don't exist in DUOS, create a new study
                if not study_id:
                    print("Study does NOT currently exist in DUOS. Registering new study...")
                    try:
                        new_study_response = requests.post(
                            url=f"{url}/api/dataset/v3",
                            headers={"Authorization": f"Bearer {token}"},
                            files = {
                                "dataset": json.dumps(upload_dict[study]),
                                "alternativeDataSharingPlan": "",
                                "consentGroups[0].nihInstitutionalCertificationFile": ""
                            }
                        ).json()
                        if new_study_response.get("studyId"):
                            study_id = new_study_response["studyId"]
                            msg = f"Study registration succeeded! Study Id: {study_id}"
                            results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                            print(msg)
                        else:
                            err_msg = new_study_response["message"]
                            msg = f"Study registration failed: {err_msg}"
                            results_log.append([f"New Study Registration - {study}", "Failed", msg])
                            print(msg)
                    except Exception as e:
                        msg = f"Study registration failed: {str(e)}"
                        results_log.append([f"New Study Registration - {study}", "Failed", msg])
                        print(msg)
                # For studies that already exist in DUOS, update the existing study
                else:
                    print("Study DOES currently exist in DUOS. Updating study...")
                    try:
                        # Update study in DUOS
                        update_study_response = requests.put(
                            url=f"{url}/api/dataset/study/{study_id}",
                            headers={"Authorization": f"Bearer {token}"},
                            files = {
                                "dataset": json.dumps(upload_dict[study]),
                                "alternativeDataSharingPlan": "",
                                "consentGroups[0].nihInstitutionalCertificationFile": ""
                            }
                        ).json()
                        if update_study_response.get("studyId"):
                            study_id = update_study_response["studyId"]
                            msg = f"Study registration succeeded! Study Id: {study_id}"
                            results_log.append([f"Study Registration Update - {study}", "Succeeded", msg])
                            print(msg)
                        else:
                            err_msg = update_study_response["message"]
                            msg = f"Study registration failed: {err_msg}"
                            results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                            print(msg)
                    except Exception as e:
                        msg = f"Study registration failed: {str(e)}"
                        results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                        print(msg)

    # Return results
    return results_log


#############################################
## Input Parameters
#############################################

# GCS location of the reviewed metadata TSV to load
input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/anvil_dataset_metadata_20240808.tsv"

# DUOS bearer token (obtain one with: gcloud auth print-access-token)
duos_token = ""

# DUOS environment: "prod" targets production, any other value targets dev
duos_env = "prod"

# Identifier of the target DAC
dac_id = 2

# Studies to include in this run (leave the list empty to process all studies)
study_upload_list = [
    "Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642)"
]

# True: only display a preview of the upload. False: run the upload.
preview_only = True

#############################################
## Execution
#############################################

upload_results = upload_to_duos(
    input_file=input_file_gcs_path,
    token=duos_token,
    env=duos_env,
    dac_id=dac_id,
    study_upload_list=study_upload_list,
    preview_only=preview_only,
)
# One row per logged step: what ran, whether it worked, and any message
df_results = pd.DataFrame(data=upload_results, columns=["Item", "Status", "Message"])
print("\nUpload Results:")
display(df_results)


# Step 3: Attach DUOS IDs to Snapshots

## Add DUOS IDs Based on Snapshot Listed in DUOS -- NEEDS FIX (8/13/24)

In [None]:
#############################################
## Functions
#############################################

def link_duos_ids_to_snapshots(snapshot_id_list, env, token):
    """Link snapshots to the DUOS datasets that reference them and set up group access.

    A lookup from snapshot ID to DUOS identifier is first built from the DUOS
    dataset listing: a dataset contributes an entry when its "Data Location"
    property is "TDR Location" and its "URL" property contains a UUID. Then,
    for every snapshot ID that has a match, the function:

    1. links the DUOS ID to the snapshot in TDR,
    2. fetches the Firecloud user group for that DUOS ID, and
    3. adds that user group to each auth-domain (DAC) group on the snapshot.

    Steps 1 and 2 are attempted up to three times, pausing five seconds
    between attempts.

    Args:
        snapshot_id_list: Snapshot UUID strings to process.
        env: "prod" targets the production DUOS instance; any other value
            targets the dev instance.
        token: Bearer token sent to the DUOS API.

    Returns:
        A list of ``[item, status, message]`` rows describing each step, with
        status one of "Success", "Failed" or "Warning". If the lookup cannot
        be built, the list holds only that failure and no snapshot is touched.
    """
    results_log = []
    max_attempts = 3
    retry_delay_seconds = 5
    uuid_pattern = re.compile(
        r"([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})",
        re.IGNORECASE,
    )

    # Determine the target URL from the env variable
    if env == "prod":
        base_url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        base_url = "https://consent.dsde-dev.broadinstitute.org"

    # Pull a list of existing datasets and studies from DUOS and build lookup dicts
    print("Building lookup between Snapshot and DUOS ID...")
    try:
        listing_response = requests.get(
            url=f"{base_url}/api/dataset/v2?asCustodian=false",
            headers={"Authorization": f"Bearer {token}"}
        )
        # Fail loudly on auth/HTTP errors; otherwise an error payload would be
        # iterated as if it were a dataset list and yield an empty lookup.
        listing_response.raise_for_status()
        datasets = listing_response.json()
        snapshot_lookup = {}
        for dataset_entry in datasets:
            try:
                dataset_url = ""
                is_snapshot = False
                for prop_entry in dataset_entry["properties"]:
                    if prop_entry["propertyName"] == "URL":
                        dataset_url = prop_entry["propertyValue"]
                    elif prop_entry["propertyName"] == "Data Location" and prop_entry["propertyValue"] == "TDR Location":
                        is_snapshot = True
                if not is_snapshot:
                    continue
                match = uuid_pattern.search(dataset_url)
                if match:
                    # Keys are lower-cased because the pattern is case-insensitive
                    snapshot_lookup[match.group(1).lower()] = dataset_entry["datasetIdentifier"]
            except (KeyError, TypeError):
                # Best effort: skip entries that lack the expected structure
                continue
        results_log.append(["Snapshot Lookup Creation", "Success", ""])
    except Exception as e:
        msg = f"Error building lookup between Snapshot and DUOS ID: {str(e)}"
        results_log.append(["Snapshot Lookup Creation", "Failed", msg])
        print(msg)
        return results_log

    # Loop through input snapshots and link DUOS IDs to them
    print("Linking DUOS IDs to Snapshots...")
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    duos_api = data_repo_client.DuosApi(api_client=api_client)
    for snapshot_id in snapshot_id_list:
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        duos_id = snapshot_lookup.get(str(snapshot_id).lower())
        if not duos_id:
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id})", "Failed", "No DUOS ID found for the snapshot."])
            continue

        # Link the DUOS ID to the snapshot
        print(f"\t\t- Linking DUOS ID {duos_id} to snapshot.")
        linkage_item = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {duos_id})"
        for attempt in range(1, max_attempts + 1):
            try:
                response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
                if response.get("linked"):
                    results_log.append([linkage_item, "Success", ""])
                    break
                response_message = response.get("message") or f"Unexpected response: {response}"
                msg = f"Error linking DUOS ID to Snapshot: {response_message}"
            except Exception as e:
                msg = f"Error linking DUOS ID to Snapshot: {str(e)}"
            # Every unsuccessful attempt (exception or non-linked response)
            # consumes one try, so the loop always terminates.
            if attempt == max_attempts:
                results_log.append([linkage_item, "Failed", msg])
            else:
                sleep(retry_delay_seconds)

        # Fetch the DUOS user group associated with the DUOS ID
        print(f"\t\t- Fetching DUOS user group from DUOS ID {duos_id}.")
        group_item = f"DUOS User Group Fetching ({duos_id})"
        duos_group = ""
        for attempt in range(1, max_attempts + 1):
            try:
                response = duos_api.retrieve_duos_firecloud_group(duos_id=duos_id).to_dict()
                duos_group = response["firecloud_group_email"]
                results_log.append([group_item, "Success", ""])
                break
            except Exception as e:
                msg = f"Error fetching DUOS user group for DUOS ID {duos_id}: {str(e)}"
                if attempt == max_attempts:
                    results_log.append([group_item, "Failed", msg])
                else:
                    sleep(retry_delay_seconds)

        # Add the DUOS user group to any DAC groups on the snapshot
        dac_item = f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})"
        if not duos_group:
            # Without a group e-mail the membership request would target an empty member
            results_log.append([dac_item, "Failed", "DUOS user group could not be determined; skipping DAC group update."])
            continue
        print(f"\t\t- Adding DUOS user group {duos_group} to snapshot DAC user group(s).")
        try:
            response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id).to_dict()
            dac_groups = response.get("auth_domain") or []
            if dac_groups:
                print(f"\t\t\t- DAC user group(s) found on snapshot: {dac_groups}.")
                for dac_group in dac_groups:
                    response = requests.put(
                        url=f"https://api.firecloud.org/api/groups/{dac_group}/member/{duos_group}",
                        headers={"Authorization": f"Bearer {creds.token}"}
                    )
                    if response.status_code != 204:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Failed", "Error adding DUOS group to DAC group."])
                    else:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Success", ""])
            else:
                msg = "No DAC user group(s) found on snapshot."
                print(f"\t\t\t- {msg}")
                results_log.append([dac_item, "Warning", msg])
        except Exception as e:
            msg = f"Error adding DUOS Group to DAC Group: {str(e)}"
            results_log.append([dac_item, "Failed", msg])
    return results_log

#############################################
## Input Parameters
#############################################

# User token (use gcloud auth print-access-token to get this); sent as the
# Bearer token on the DUOS dataset listing request
token = ""

# Environment ("prod" selects the production DUOS URL; any other value selects dev)
env = "prod"

# Snapshot list (full set of snapshot IDs; see the override immediately below the list)
snapshot_id_list = [
    '737d454c-88be-477f-ae2c-ef473e2106ce',
    '253e2b36-1674-482b-bfbd-4e0b05cdfe63',
    '3f53e841-ca9d-4b55-b390-590718533561',
    '01cf2450-604b-43e5-9f4e-9ec4e0bf0a61',
    '85b0b351-cd0a-4efe-95a4-e39273c42831',
    'c9037419-367e-439c-a247-b0dae7c24146',
    'd7b2b2c6-72fd-4084-af34-a86edfe3ac47',
    'd63a63ce-24c8-413a-89c0-4bd4c82370c0',
    '1bb208f2-ecf3-4589-a9bd-b6e94178584d',
    '5773565d-ad7c-4f51-8b4f-f1ee5dffc08a',
    '2e5c5fe3-3af4-4c34-a85e-af6b4135f089',
    '27068295-b3c0-4260-9447-9ca96814d46f',
    '060c707a-2f0d-4730-bbd6-d25489abfcf6',
    '7e59197f-b859-4279-add3-de24bbc7e52b',
    '624fef99-e4ce-4c12-a3d9-90995b5da970',
    'a68d3145-81c2-41f8-9944-5e4a5058934a',
    'a3b18d45-96c2-4526-8fde-65ab3265868f',
    '3ec72891-87d2-431f-850c-e52013330ea8',
    '87d02347-d169-4ce0-9027-3c8e11e48c40',
    '61b6ae23-ca19-4d31-bad3-2281a8528886',
    '7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
    'f330517e-46fd-4de3-8063-015b524a7324',
    'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
    '17d14df1-cb64-4aae-8049-c1728a3c0c81',
    '434f85e2-4435-483c-8099-b03c8ba794ed',
    '5bba97dc-d6ab-4329-912f-148c8b807056',
    '4c722626-c559-4f5a-84bd-8d7d46983e1e',
    '6df525e1-b143-4e6f-b667-80c783ae1b66',
    '079eb53c-e2b6-4da6-ab5f-fc2136a3ecc1',
    '1a26532c-16e6-4f1c-81f9-8f07a8181421',
    '3ac713b5-3645-4381-ac66-ecbc281a2ab8',
    '4911bd18-5db9-418a-9dc0-0ea28ae937d6',
    'bbd04481-0b9d-4c21-ba65-a43638116e0f',
    '2b78a3ac-8bca-4938-bc7c-26a60f9c04ac',
    '4bb891fc-fcae-40cc-bf59-73716de7e04e',
    '574e0d42-e712-4a86-be7a-4b3a95187bcd',
    '56078c29-a393-4c60-9e04-3674e02fe729',
    '099d2585-1379-4333-b3b1-ffc0d26d95c5',
    'ab71d294-4ba9-44d4-8051-913b3d5ccff3',
    '90fe2016-e79c-456c-a5f9-3a31149fcd65',
    'e43974fd-cee1-4d8c-a436-6846d7d24129',
    '0d607d21-c9c7-4852-83e3-76825176ee0a',
    '0a356156-961d-4829-b9b5-c07fbc73dacc',
    '18a28450-31ec-4e4a-a305-dbbdd226ae3c',
    'f7d225d9-1675-483d-a1eb-9ef750301cd4',
    'd4b02f5f-7a62-4cad-8ffc-d3deb0fab445',
    '4c8ce027-8094-4f5d-bf62-22b1d51b3c1e',
    'c753046a-cf9b-4813-be68-cb3b9dd9866e',
    'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
    '7639a9e0-275c-49a8-80c1-cdb01ce23e1c',
    'aa2bfacc-c28c-4192-960c-b1389cf68516',
    'd7349942-f8ff-4ad6-b075-8f39652a7789',
    'b9e0de2a-4085-4226-a073-1744914cbbd4',
    '44b1f60b-e74c-4430-9378-d4a75e2de72f',
    'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
    '6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
    '5208772d-21f9-46b0-8167-0b05b57296b8',
    '36690013-e8bc-43a5-9ba9-83317537557c',
    '172bada7-f1c5-41c4-836d-05381beaed9a',
    '9a1e873b-b1db-4d3e-a83b-ed6c5b3f3ecc',
    '2c6de04e-104d-42c8-8448-97d74985dacb',
    '452bcafd-ab45-4e24-b5e0-13fcf22b0755',
    'fbafdd31-21a0-44c5-ae4d-724839beff61',
    '2a1882d9-88ca-4849-bcc1-f6914f593407',
    '3838993f-59ba-4dec-8110-ac3ea387ab91',
    'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
    'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
    '11a7572f-02b9-4f88-8c2c-802dfb1f94b7',
    '5e547934-c339-410e-a013-dfefed50f4b8',
    'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
    '2be072bd-2153-4050-9358-e4b95297a9bf',
    '7c19d852-e36a-4353-afea-10e501601d9a',
    'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
    '84703c54-a9dd-400c-9701-2fc40922e3e3',
    '00297802-e20a-413f-b389-a6f764b6600e',
    'c853d4c0-d4be-433d-964e-e30bdc35480e',
    '3e85b06a-a6ea-4ce8-a655-44b1fce12138',
    '6e674477-522f-4adc-8c50-76910a6a282b',
    '504089f1-c59d-48fe-84ef-858bd3eb3043',
    '0565b2e4-ade1-46e7-80bf-ca647a89a8b8',
    '1cf943bc-9ffe-4fd0-a92d-6fdcf68da743',
    'bb11d621-e471-4ca9-b9ae-cf06c99db297',
    '7b875b4b-a6c5-4c92-a252-cd5ff203089e',
    '97b3d565-3c32-4fd5-be49-c16f0bae84e7',
    'ea08adf0-2383-41ae-a91a-88c7b8f6f42b',
    '5b8c745a-972b-455c-8021-ee24fdbce9a5',
    'bebf0200-8458-4467-b001-ff436564e942',
    '1c16f983-c090-457a-aca7-4181d16e225b',
    'b259ac6c-3358-4faa-abfe-c9d614b76915',
    '1a119cfe-3178-4f06-800b-b2aec50218b8',
    '33c73ae8-f829-438d-bdb1-da0be8f3773f',
    '3d6afb8e-dbcd-4972-8281-ae546b23356c',
    '42fd7b4a-461d-4a4f-bb02-856e7124dce1',
    '08f28ada-3fa1-41f3-a7eb-5b4ff8325145',
    '189a0802-8538-41f8-ad51-8bb2a736783b',
    'e0dc36c3-ff48-4ab5-881f-899578e08dd4',
    '9052b5fc-8ac8-41ea-8a82-6860b8d2c33d',
    'b8bc131f-68d6-4c56-bd37-55c1b0e27d2e',
]
# NOTE: this reassignment replaces the full list above, so only this single
# snapshot is processed when the cell runs. Remove it to process every snapshot.
snapshot_id_list = ['737d454c-88be-477f-ae2c-ef473e2106ce']

#############################################
## Execution
#############################################

# Run the linkage and show the returned log rows as an Item / Status / Message table
results = link_duos_ids_to_snapshots(snapshot_id_list, env, token)
df_results = pd.DataFrame(results, columns = ["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)


## Add DUOS IDs Directly

In [3]:
#############################################
## Functions
#############################################

def direct_link_duos_ids_to_snapshots(snapshot_duos_list):
    """Link explicitly paired snapshots and DUOS IDs and set up group access.

    For each ``[snapshot_id, duos_id]`` pair with a non-empty DUOS ID the
    function:

    1. links the DUOS ID to the snapshot in TDR,
    2. fetches the Firecloud user group for that DUOS ID, and
    3. adds that user group to each auth-domain (DAC) group on the snapshot.

    Steps 1 and 2 are attempted up to three times, pausing five seconds
    between attempts.

    Args:
        snapshot_duos_list: Sequence of entries whose first element is the
            snapshot ID and whose second element is the DUOS ID.

    Returns:
        A list of ``[item, status, message]`` rows describing each step, with
        status one of "Success", "Failed" or "Warning".
    """
    results_log = []
    max_attempts = 3
    retry_delay_seconds = 5

    # Loop through input snapshots and link DUOS IDs to them
    print("Linking DUOS IDs to Snapshots...")
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    duos_api = data_repo_client.DuosApi(api_client=api_client)
    for ss_duos_entry in snapshot_duos_list:
        snapshot_id, duos_id = ss_duos_entry[0], ss_duos_entry[1]
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        if not duos_id:
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id})", "Failed", "No DUOS ID found for the snapshot."])
            continue

        # Link the DUOS ID to the snapshot
        print(f"\t\t- Linking DUOS ID {duos_id} to snapshot.")
        linkage_item = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {duos_id})"
        for attempt in range(1, max_attempts + 1):
            try:
                response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
                if response.get("linked"):
                    results_log.append([linkage_item, "Success", ""])
                    break
                response_message = response.get("message") or f"Unexpected response: {response}"
                msg = f"Error linking DUOS ID to Snapshot: {response_message}"
            except Exception as e:
                msg = f"Error linking DUOS ID to Snapshot: {str(e)}"
            # Every unsuccessful attempt (exception or non-linked response)
            # consumes one try, so the loop always terminates.
            if attempt == max_attempts:
                results_log.append([linkage_item, "Failed", msg])
            else:
                sleep(retry_delay_seconds)

        # Fetch the DUOS user group associated with the DUOS ID
        print(f"\t\t- Fetching DUOS user group from DUOS ID {duos_id}.")
        group_item = f"DUOS User Group Fetching ({duos_id})"
        duos_group = ""
        for attempt in range(1, max_attempts + 1):
            try:
                response = duos_api.retrieve_duos_firecloud_group(duos_id=duos_id).to_dict()
                duos_group = response["firecloud_group_email"]
                results_log.append([group_item, "Success", ""])
                break
            except Exception as e:
                msg = f"Error fetching DUOS user group for DUOS ID {duos_id}: {str(e)}"
                if attempt == max_attempts:
                    results_log.append([group_item, "Failed", msg])
                else:
                    sleep(retry_delay_seconds)

        # Add the DUOS user group to any DAC groups on the snapshot
        dac_item = f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})"
        if not duos_group:
            # Without a group e-mail the membership request would target an empty member
            results_log.append([dac_item, "Failed", "DUOS user group could not be determined; skipping DAC group update."])
            continue
        print(f"\t\t- Adding DUOS user group {duos_group} to snapshot DAC user group(s).")
        try:
            response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id).to_dict()
            dac_groups = response.get("auth_domain") or []
            if dac_groups:
                print(f"\t\t\t- DAC user group(s) found on snapshot: {dac_groups}.")
                for dac_group in dac_groups:
                    response = requests.put(
                        url=f"https://api.firecloud.org/api/groups/{dac_group}/member/{duos_group}",
                        headers={"Authorization": f"Bearer {creds.token}"}
                    )
                    if response.status_code != 204:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Failed", "Error adding DUOS group to DAC group."])
                    else:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Success", ""])
            else:
                msg = "No DAC user group(s) found on snapshot."
                print(f"\t\t\t- {msg}")
                results_log.append([dac_item, "Warning", msg])
        except Exception as e:
            msg = f"Error adding DUOS Group to DAC Group: {str(e)}"
            results_log.append([dac_item, "Failed", msg])
    return results_log

#############################################
## Input Parameters
#############################################

# Snapshot list: each entry pairs a snapshot ID (first element) with the
# DUOS ID (second element) that should be linked to it
snapshot_duos_list = [
    #['snapshot_id', 'duos_id']
    ['cdcfc6ac-6c9f-4d99-a8c3-4d1e5d171261', 'DUOS-000219'],
    ['a8b8b258-2b61-40a8-95b2-68247cf29eb3', 'DUOS-000353'],
    ['1c6223bf-9665-4b54-bbc3-40847ebaf92c', 'DUOS-000243'],
    ['c0d588bc-5a77-490f-86b0-fcbe3f06654c', 'DUOS-000240'],
    ['1985e363-b6da-47ec-8c92-dabcd587e6b6', 'DUOS-000235'],
    ['c6185024-b442-45c1-8822-fd3e2fb57adf', 'DUOS-000462'],
    ['4277cb35-8d37-49c7-8bb1-eeb57a68b739', 'DUOS-000463'],
    ['ba20eb5d-3553-4f0c-92eb-5793697da74d', 'DUOS-000461'],
    ['369754cd-6296-4312-9160-066f1a1d5194', 'DUOS-000377'],
    ['a4fb1f72-5f1e-4788-93cd-2d398362a0ce', 'DUOS-000374'],
    ['2054866e-b906-42cd-9e1a-d3eaf1b6057e', 'DUOS-000375'],
    ['568ba9a0-95ba-486b-bab1-efc3798e5f41', 'DUOS-000376'],
    ['d5f3d7ce-5a55-4508-b662-e624a83b304d', 'DUOS-000252'],
    ['ce99c021-32fc-4278-bad0-03f9dadeeed8', 'DUOS-000236'],
    ['00c1a1f1-6bfc-478b-8b32-5d5911081638', 'DUOS-000272'],
    ['2283bdf2-82c2-45c8-9e2c-f5855bd6e103', 'DUOS-000372'],
]

#############################################
## Execution
#############################################

# Run the linkage and show the returned log rows as an Item / Status / Message table
results = direct_link_duos_ids_to_snapshots(snapshot_duos_list)
df_results = pd.DataFrame(results, columns = ["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)

Linking DUOS IDs to Snapshots...
	Processing snapshot ID = cdcfc6ac-6c9f-4d99-a8c3-4d1e5d171261
		- Linking DUOS ID DUOS-000219 to snapshot.
		- Fetching DUOS user group from DUOS ID DUOS-000219.
		- Adding DUOS user group DUOS-000219-users@firecloud.org to snapshot DAC user group(s).
			- DAC user group(s) found on snapshot: ['AUTH_anvil_nhgri_broad_ibd_daly_turner_wes'].
	Processing snapshot ID = a8b8b258-2b61-40a8-95b2-68247cf29eb3
		- Linking DUOS ID DUOS-000353 to snapshot.
		- Fetching DUOS user group from DUOS ID DUOS-000353.
		- Adding DUOS user group DUOS-000353-users@firecloud.org to snapshot DAC user group(s).
			- DAC user group(s) found on snapshot: ['AUTH_anvil_nhgri_broad_ibd_daly_winter_wes'].
	Processing snapshot ID = 1c6223bf-9665-4b54-bbc3-40847ebaf92c
		- Linking DUOS ID DUOS-000243 to snapshot.
		- Fetching DUOS user group from DUOS ID DUOS-000243.
		- Adding DUOS user group DUOS-000243-users@firecloud.org to snapshot DAC user group(s).
			- No DAC user group(s) fo

Unnamed: 0,Item,Status,Message
0,DUOS ID to Snapshot Linkage (cdcfc6ac-6c9f-4d99-a8c3-4d1e5d171261 - DUOS-000219),Success,
1,DUOS User Group Fetching (DUOS-000219),Success,
2,DUOS Group to DAC Group Addition (DUOS-000219-users@firecloud.org - AUTH_anvil_nhgri_broad_ibd_daly_turner_wes),Success,
3,DUOS ID to Snapshot Linkage (a8b8b258-2b61-40a8-95b2-68247cf29eb3 - DUOS-000353),Success,
4,DUOS User Group Fetching (DUOS-000353),Success,
5,DUOS Group to DAC Group Addition (DUOS-000353-users@firecloud.org - AUTH_anvil_nhgri_broad_ibd_daly_winter_wes),Success,
6,DUOS ID to Snapshot Linkage (1c6223bf-9665-4b54-bbc3-40847ebaf92c - DUOS-000243),Success,
7,DUOS User Group Fetching (DUOS-000243),Success,
8,DUOS Group to DAC Group Addition (1c6223bf-9665-4b54-bbc3-40847ebaf92c - DUOS-000243),Warning,No DAC user group(s) found on snapshot.
9,DUOS ID to Snapshot Linkage (c0d588bc-5a77-490f-86b0-fcbe3f06654c - DUOS-000240),Success,


# Script Development

## Fetch parameters from snapshot/dataset

In [None]:
# Scratch cell: look up the phs ID, auth domain and source workspace recorded
# on the dataset behind a given snapshot.

# Parameters
snapshot_id = "099d2585-1379-4333-b3b1-ffc0d26d95c5"

# Retrieve snapshot details
api_client = refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
source_dataset = snapshot_details["source"][0]["dataset"]
dataset_id = source_dataset["id"]
phs_id = source_dataset["phs_id"]

# Retrieve dataset details
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
# Reset both values on every run so a dataset without these properties prints
# None instead of raising NameError or showing a value left over from an
# earlier run of the cell.
auth_domain = None
source_workspace = None
dataset_properties = dataset_details.get("properties") or {}
if dataset_properties.get("auth_domains"):
    auth_domain = dataset_properties["auth_domains"][0]
if dataset_properties.get("source_workspaces"):
    source_workspace = dataset_properties["source_workspaces"][0]

# Print output
print(phs_id)
print(source_workspace)

## Pulling Workspace Attributes

In [None]:
# Scratch cell: pull a Terra workspace's library attributes and map them onto
# the study-registration field names used by this notebook.

# Parameters
ws_project = "anvil-datastorage"
ws_name = "AnVIL_GREGOR_RELEASE_01_HMB"

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Pull workspace attributes
ws_response = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
    headers={"Authorization": f"Bearer {creds.token}"}
)
# Stop here on HTTP errors rather than failing later with a KeyError on "workspace"
ws_response.raise_for_status()
ws_attributes = ws_response.json()

# Map to schema
attributes = ws_attributes["workspace"]["attributes"]
terra_dict = {}
terra_dict["studyName"] = attributes.get("library:projectName")
terra_dict["studyType"] = attributes.get("library:studyDesign")
#terra_dict["studyDescription"] = attributes.get("description")
# List-valued attributes arrive wrapped as {"items": [...]}; an absent
# attribute maps to None instead of raising.
terra_dict["dataTypes"] = (attributes.get("library:dataCategory") or {}).get("items")
terra_dict["phenotypeIndication"] = attributes.get("library:indication")
terra_dict["species"] = "Homo sapiens"
terra_dict["piName"] = attributes.get("library:datasetOwner")
terra_dict["dataCustodianEmail"] = attributes.get("library:contactEmail")
for tag in (attributes.get("tag:tags") or {}).get("items") or []:
    if "Consortium:" in tag:
        terra_dict["consortium"] = tag.split(":")[1].strip()
    elif "dbGaP:" in tag:
        terra_dict["dbGaPPhsID"] = tag.split(":")[1].strip()
terra_dict["consentGroups.consentCode"] = attributes.get("library:dataUseRestriction")
terra_dict["consentGroups.fileTypes.fileType"] = (attributes.get("library:datatype") or {}).get("items")

# View schema
print(terra_dict)


In [None]:
# Display the raw workspace response as the cell output for inspection
ws_attributes

In [None]:
# Display the raw workspace response as the cell output (same expression as the previous cell)
ws_attributes

## dbGaP XML Parse

In [None]:
# Scratch cell: fetch a study's dbGaP XML record and map it onto the
# study-registration field names used by this notebook.

# Parameters
phs_id = "phs003047"
#phs_id = "phs000693"

# Pull and parse XML
phs_short = phs_id.replace("phs", "")
dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
response = requests.get(url=dbgap_url)
xml_data = xmltodict.parse(response.text)

# Map to schema
dbgap_xml_dict = {}
# The parsed XML holds a list when an element repeats and a single mapping
# otherwise, so Study, Person and IC are all normalised before use.
if isinstance(xml_data["dbgapss"]["Study"], list):
    study_data = xml_data["dbgapss"]["Study"][0]
else:
    study_data = xml_data["dbgapss"]["Study"]
dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
dbgap_xml_dict["dbGaPPhsID"] = phs_id
dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
persons = study_data["Authority"]["Persons"]["Person"]
if not isinstance(persons, list):
    # A study with one listed person would otherwise be iterated key by key
    persons = [persons]
for ap_entry in persons:
    role = ap_entry.get("Role")
    organization = ap_entry.get("Organization")
    if role == "PI":
        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        dbgap_xml_dict["piEmail"] = ap_entry.get("@email")
        dbgap_xml_dict["piInstitution"] = organization
    elif role == "PO" and organization == "NIH":
        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
    elif role == "GPA" and organization == "NIH":
        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
ic_entries = study_data["Authority"]["ICs"]["IC"]
if not isinstance(ic_entries, list):
    ic_entries = [ic_entries]
ic_list = [ic_entry["@name"] for ic_entry in ic_entries]
dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")

# View schema
print(dbgap_xml_dict)


In [None]:
# Display the parsed dbGaP study record as the cell output for inspection
study_data

In [None]:
# Display the parsed dbGaP study record as the cell output (same expression as the previous cell)
study_data

## dbGaP Study API

In [None]:
# Scratch cell: fetch a study configuration from the dbGaP study API and map
# it onto the study-registration field names used by this notebook.

# Parameters
study_uid = 483191234

# Pull and parse JSON
dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
response = requests.get(url=dbgap_study_url)
study_api_data = json.loads(response.text)

# Map to schema
dbgap_study_api_dict = {}
if study_api_data.get("error") is None:
    # Treat a missing "data" or "attribution" section as empty instead of crashing
    study_config = study_api_data.get("data") or {}
    dbgap_study_api_dict["studyName"] = study_config.get("report_name")
    dbgap_study_api_dict["studyDescription"] = study_config.get("description")
    dbgap_study_api_dict["phenotypeIndication"] = study_config.get("primary_disease")
    dbgap_study_api_dict["studyType"] = study_config.get("study_design")
    for attr_entry in study_config.get("attribution") or []:
        # Only the first principal investigator listed is recorded
        if attr_entry.get("title") == "Principal Investigator":
            dbgap_study_api_dict["piName"] = attr_entry.get("name")
            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
            break

# View schema
print(dbgap_study_api_dict)

In [None]:
# Display the raw dbGaP study API response as the cell output for inspection
study_api_data

## dbGaP FHIR API

In [None]:
# Scratch cell: fetch a study's ResearchStudy resource from the dbGaP FHIR API
# and map it onto the study-registration field names used by this notebook.

# Parameters
#phs_id = "phs003047"
phs_id = "phs000693"

# Pull and parse JSON
dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
response = requests.get(url=dbgap_fhir_url)
fhir_data = json.loads(response.text)

# Use the first returned entry; a response without entries produces a mapping
# of empty values instead of a KeyError/IndexError.
fhir_entries = fhir_data.get("entry") or []
if not fhir_entries:
    print(f"No ResearchStudy entry returned for {phs_id}.")
resource = (fhir_entries[0].get("resource") or {}) if fhir_entries else {}
extensions = resource.get("extension") or []

# Map to schema
dbgap_fhir_dict = {}
dbgap_fhir_dict["studyName"] = resource.get("title")
dbgap_fhir_dict["studyDescription"] = resource.get("description")
dbgap_fhir_dict["dbGaPPhsID"] = phs_id
dbgap_fhir_dict["dbGaPStudyRegistrationName"] = resource.get("title")
dbgap_fhir_dict["nihICsSupportingStudy"] = (resource.get("sponsor") or {}).get("display")
# studyType: comma-joined study-design codings, preferring display text over code
for cat_entry in resource.get("category") or []:
    for coding_entry in cat_entry.get("coding") or []:
        if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
            if dbgap_fhir_dict.get("studyType") and value:
                dbgap_fhir_dict["studyType"] += f", {value}"
            elif value:
                dbgap_fhir_dict["studyType"] = value
# dataTypes
dt_list = []
for ext_entry in extensions:
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                for coding_entry in (inner_ext_entry.get("valueCodeableConcept") or {}).get("coding") or []:
                    dt_list.append(coding_entry.get("code"))
dbgap_fhir_dict["dataTypes"] = dt_list
# phenotypeIndication: comma-joined focus codings, preferring display text over code
for focus_entry in resource.get("focus") or []:
    for coding_entry in focus_entry.get("coding") or []:
        value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
        if dbgap_fhir_dict.get("phenotypeIndication") and value:
            dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
        elif value:
            dbgap_fhir_dict["phenotypeIndication"] = value
# numberOfParticipants
for ext_entry in extensions:
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                # NOTE(review): reads the "code" field of valueCount rather than
                # "value" -- confirm against a live response.
                dbgap_fhir_dict["numberOfParticipants"] = (inner_ext_entry.get("valueCount") or {}).get("code")

# View schema
print(dbgap_fhir_dict)

In [None]:
# Display the raw dbGaP FHIR response as the cell output for inspection
fhir_data

# Utilities

## Delete Studies from DUOS (Dev Only)

In [None]:
# Inputs
# User token (use gcloud auth print-access-token to get this). Paste it only
# for the run and clear it before saving: access tokens must never be
# committed with the notebook.
token = ""
# IDs of the DUOS studies to delete
study_id_list = [
    '5918',
    '5919',
    '5920',
    '5921',
    '5922',
    '5923',
    '5924',
    '5925',
    '5926',
    '5927',
    '5928',
    '5929',
    '5930',
    '5931',
    '5932',
    '5933',
    '5934',
    '5935',
    '5936',
    '5937',
    '5938',
    '5939',
    '5940',
    '5941',
    '5942',
    '5943',
    '5944',
    '5945',
    '5946',
    '5947',
    '5948',
    '5949',
    '5950',
    '5951',
    '5952',
    '5953',
    '5954',
    '5955',
    '5956',
    '5957',
    '5958',
    '5959',
    '5960',
    '5961',
    '5962',
    '5963',
    '5964',
    '5965',
    '5966',
    '5967',
    '5968',
    '5969',
    '6034',
    '6035',
    '6036',
    '6037',
    '6039',
    '6040',
    '6041',
    '6042',
    '6043',
    '6044',
    '6045',
    '6046',
    '6047',
    '6048',
    '6049',
    '6050',
    '6051',
    '6052',
    '6053',
    '6054',
    '6055',
    '6056',
    '6057',
    '6058',
    '6059',
    '6060',
    '6061',
    '6062',
    '6063',
    '6064',
    '6065',
    '6066',
    '6068',
    '6069',
    '6070',
    '6071',
    '6072',
    '6073',
    '6074',
]

# Delete studies (dev DUOS instance only), reporting the outcome per study
for study_id in study_id_list:
    print(f"Deleting study ID {study_id}")
    response = requests.delete(
        url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
        headers={"Authorization": f"Bearer {token}"}
    )
    if response.status_code == 200:
        print("Study deleted successfully.")
    else:
        # Error bodies are not guaranteed to be JSON with a "message" field;
        # fall back to the status code and raw body so the loop keeps going.
        try:
            msg = response.json()["message"]
        except (ValueError, KeyError, TypeError):
            msg = f"HTTP {response.status_code}: {response.text}"
        print(f"Error deleting study: {msg}")
    

## Delete Datasets from DUOS (Dev Only)

In [None]:
# Print the ID of every dataset in dataset_lookup created on one of the target dates
creation_dates = ("Jul 25, 2024", "Jul 24, 2024")
for dataset_record in dataset_lookup.values():
    if dataset_record["create_date"] in creation_dates:
        print(dataset_record["id"])

In [None]:
# Inputs
# User token (use gcloud auth print-access-token to get this). Paste it only
# for the run and clear it before saving: access tokens must never be
# committed with the notebook.
token = ""
# IDs (as strings) of the datasets to delete. They form three contiguous
# runs: 2153-2302, 2370-2564 and 2568-2580, all inclusive.
dataset_id_list = [
    *map(str, range(2153, 2303)),
    *map(str, range(2370, 2565)),
    *map(str, range(2568, 2581)),
]

# Delete datasets
# Send one DELETE request per dataset ID to the dev DUOS API and report the
# outcome of each request.
for dataset_id in dataset_id_list:
    print(f"Deleting dataset ID {dataset_id}")
    response = requests.delete(
        url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/index/{dataset_id}",
        headers={"Authorization": f"Bearer {token}"}
    )
    if response.status_code == 200:
        print("Dataset deleted successfully.")
    else:
        # An error body is not guaranteed to be a JSON object with a "message"
        # field. Fall back to the status code and raw text so that one
        # unexpected response does not abort the remaining deletions.
        try:
            msg = response.json()["message"]
        except (ValueError, KeyError, TypeError):
            msg = f"HTTP {response.status_code}: {response.text}"
        print(f"Error deleting dataset: {msg}")

## Collect AnVIL Studies and Datasets from DUOS

In [None]:
# Inputs
# The bearer token is read from the DUOS_TOKEN environment variable so that no
# credential is stored in the notebook itself.
import os

token = os.environ.get("DUOS_TOKEN", "")
if not token:
    print("DUOS_TOKEN is not set; the DUOS requests below will not be authenticated.")
env = "prod"

# Determine the target URL from the env variable
if env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"

# Pull a list of existing AnVIL studies and datasets from DUOS
# The listing is walked entry by entry. The first time a study is seen, and
# only if its description contains "Platform: AnVIL", every dataset ID listed
# on that study is fetched individually and turned into one result row.
studies_processed = set()
results = []
datasets = requests.get(
    url=f"{url}/api/dataset/v2?asCustodian=true",
    headers={"Authorization": f"Bearer {token}"}
).json()
for dataset_entry in datasets:
    if dataset_entry.get("study") and dataset_entry["study"]["studyId"] not in studies_processed:
        study_id = dataset_entry["study"]["studyId"]
        if dataset_entry["study"].get("description") and "Platform: AnVIL" in dataset_entry["study"]["description"]:
            study_name = dataset_entry["study"]["name"]
            # The PHS ID is the value of the first "dbGaPPhsID" study property, if any
            study_phs = ""
            for prop_entry in dataset_entry["study"]["properties"]:
                if prop_entry["key"] == "dbGaPPhsID":
                    study_phs = prop_entry["value"]
                    break
            for dataset_id in dataset_entry["study"]["datasetIds"]:
                dataset_details = requests.get(
                    url=f"{url}/api/dataset/v2/{dataset_id}",
                    headers={"Authorization": f"Bearer {token}"}
                ).json()
                dataset_name = dataset_details["name"]
                dataset_identifier = dataset_details["datasetIdentifier"]
                snapshot_id = ""
                # Read the URL property from this dataset's own details. Using the
                # outer listing entry here would give every dataset in the study
                # the snapshot ID of whichever dataset happened to be listed first.
                for prop_entry in dataset_details["properties"]:
                    if prop_entry["propertyName"] == "URL":
                        snapshot_url = prop_entry["propertyValue"]
                        if snapshot_url and "https://data.terra.bio/snapshots/" in snapshot_url:
                            snapshot_id = snapshot_url.replace("https://data.terra.bio/snapshots/", "")

                results.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, snapshot_id])
        # Mark the study as handled (AnVIL or not) so it is not expanded again
        studies_processed.add(study_id)

# Display results
df_results = pd.DataFrame(results, columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "Snapshot ID"])
print("\nResults:")
display(df_results)

In [None]:
# Re-issue the listing request but keep the response object itself (no .json()
# call), so the raw response can be inspected.
datasets = requests.get(
    f"{url}/api/dataset/v2?asCustodian=true",
    headers={"Authorization": f"Bearer {token}"},
)

In [None]:
# Display the current value of `datasets` as the cell output.
datasets

In [None]:
# Inputs
# The bearer token is read from the DUOS_TOKEN environment variable so that no
# credential is stored in the notebook itself.
import os

token = os.environ.get("DUOS_TOKEN", "")
if not token:
    print("DUOS_TOKEN is not set; the DUOS requests below will not be authenticated.")
env = "prod"

# Determine the target URL from the env variable
if env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"

# Pull a list of existing AnVIL studies and datasets from DUOS
# The v3 listing supplies the dataset IDs. Each dataset's v2 detail record
# supplies the study and property fields used to build one result row.
results = []
datasets = requests.get(
    url=f"{url}/api/dataset/v3",
    headers={"Authorization": f"Bearer {token}"}
).json()
datasets_to_process = len(datasets)
datasets_processed = 0
snapshot_prefix = "https://data.terra.bio/snapshots/"
for datasets_processed, dataset_entry in enumerate(datasets, start=1):
    print(f"Processing dataset {datasets_processed} of {datasets_to_process}...")
    dataset_id = dataset_entry["dataset_id"]
    dataset_details = requests.get(
        url=f"{url}/api/dataset/v2/{dataset_id}",
        headers={"Authorization": f"Bearer {token}"}
    ).json()

    # Datasets with no study attached produce no row
    if not dataset_details.get("study"):
        continue
    study = dataset_details["study"]
    study_id = study["studyId"]

    # Only studies whose description contains the AnVIL marker are kept
    description = study.get("description")
    if not description or "Platform: AnVIL" not in description:
        continue

    study_name = study["name"]
    # Value of the first "dbGaPPhsID" study property, or "" when there is none
    study_phs = next(
        (prop_entry["value"] for prop_entry in study["properties"] if prop_entry["key"] == "dbGaPPhsID"),
        "",
    )
    dataset_name = dataset_details["name"]
    dataset_identifier = dataset_details["datasetIdentifier"]

    # The snapshot ID is the URL property with the snapshot prefix removed;
    # if several URL properties qualify, the last one wins
    snapshot_id = ""
    for prop_entry in dataset_details["properties"]:
        if prop_entry["propertyName"] != "URL":
            continue
        snapshot_url = prop_entry["propertyValue"]
        if snapshot_url and snapshot_prefix in snapshot_url:
            snapshot_id = snapshot_url.replace(snapshot_prefix, "")
    results.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, snapshot_id])

# Display results
result_columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "Snapshot ID"]
df_results = pd.DataFrame(results, columns=result_columns)
print("\nResults:")
display(df_results)

In [None]:
# Display the current value of `study_list` as the cell output.
# NOTE(review): `study_list` is not assigned in this cell; presumably it is built by an earlier cell — confirm.
study_list

In [None]:
# Display the current value of `dataset_lookup` as the cell output.
# NOTE(review): `dataset_lookup` is not assigned in this cell; presumably it is built by an earlier cell — confirm.
dataset_lookup